CycleDiffusion with Stable Diffusion
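For orientation before the formatting diff below, here is a minimal sketch of the image-to-image editing call that the demo's sliders (source/target guidance scales, strength, number of inference steps) ultimately drive. This is not code from the diff: the `CycleDiffusionPipeline` class, `DDIMScheduler`, and `load_image` helper are assumptions based on the diffusers-style API that ppdiffusers mirrors, and the checkpoint name is a placeholder.

```python
# Minimal sketch, assuming ppdiffusers exposes a CycleDiffusionPipeline
# mirroring the diffusers API (pipeline name, scheduler, and load_image
# helper are assumptions, not taken from this diff).
from ppdiffusers import CycleDiffusionPipeline, DDIMScheduler
from ppdiffusers.utils import load_image

model_id = "CompVis/stable-diffusion-v1-4"  # placeholder checkpoint
scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler)

# Input image to be edited; CycleDiffusion needs a source prompt describing it.
init_image = load_image("input.png").resize((512, 512))

# Parameter names and defaults follow the sliders exposed in the Gradio demo.
image = pipe(
    prompt="A photo of a tiger",        # target prompt
    source_prompt="A photo of a cat",   # prompt describing the input image
    image=init_image,
    num_inference_steps=100,
    strength=0.7,
    guidance_scale=5,                   # target guidance scale
    source_guidance_scale=1,
).images[0]
image.save("output.png")
```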
@@ -370,9 +364,11 @@ def replace_nsfw_images(results):
2. Click the "Run CycleDiffusion" button.
- """)
+ """
+ )
with gr.Accordion("See Details", open=False):
- gr.HTML("""
+ gr.HTML(
+ """
How to use:
@@ -396,14 +392,14 @@ def replace_nsfw_images(results):
1. 20s on A10G.
- """)
+ """
+ )
with gr.Row():
with gr.Column(scale=55):
with gr.Group():
- img = gr.Image(
- label="Input image", height=512, tool="editor", type="pil")
+ img = gr.Image(label="Input image", height=512, tool="editor", type="pil")
image_out = gr.Image(label="Output image", height=512)
# gallery = gr.Gallery(
@@ -422,7 +418,8 @@ def replace_nsfw_images(results):
label="Source guidance scale",
value=1,
minimum=1,
- maximum=10, )
+ maximum=10,
+ )
with gr.Row():
target_prompt = gr.Textbox(
label="Target prompt",
@@ -432,14 +429,16 @@ def replace_nsfw_images(results):
label="Target guidance scale",
value=5,
minimum=1,
- maximum=10, )
+ maximum=10,
+ )
with gr.Row():
strength = gr.Slider(
label="Strength",
value=0.7,
minimum=0.5,
maximum=1,
- step=0.01, )
+ step=0.01,
+ )
with gr.Row():
generate1 = gr.Button(value="Run CycleDiffusion")
@@ -449,7 +448,8 @@ def replace_nsfw_images(results):
cross_attention_control = gr.Radio(
label="CAC type",
choices=["None", "Replace", "Refine"],
- value="None", )
+ value="None",
+ )
with gr.Row():
# If not "None", the following two parameters will be used.
cross_replace_steps = gr.Slider(
@@ -457,13 +457,15 @@ def replace_nsfw_images(results):
value=0.8,
minimum=0.0,
maximum=1,
- step=0.01, )
+ step=0.01,
+ )
self_replace_steps = gr.Slider(
label="Self replace steps",
value=0.4,
minimum=0.0,
maximum=1,
- step=0.01, )
+ step=0.01,
+ )
with gr.Row():
generate2 = gr.Button(value="Run CycleDiffusion")
@@ -475,23 +477,13 @@ def replace_nsfw_images(results):
value=100,
minimum=25,
maximum=500,
- step=1, )
- width = gr.Slider(
- label="Width",
- value=512,
- minimum=512,
- maximum=1024,
- step=8)
- height = gr.Slider(
- label="Height",
- value=512,
- minimum=512,
- maximum=1024,
- step=8)
+ step=1,
+ )
+ width = gr.Slider(label="Width", value=512, minimum=512, maximum=1024, step=8)
+ height = gr.Slider(label="Height", value=512, minimum=512, maximum=1024, step=8)
with gr.Row():
- seed = gr.Slider(
- 0, 2147483647, label="Seed", value=0, step=1)
+ seed = gr.Slider(0, 2147483647, label="Seed", value=0, step=1)
with gr.Row():
generate3 = gr.Button(value="Run CycleDiffusion")
@@ -714,11 +706,14 @@ def replace_nsfw_images(results):
],
image_out,
inference,
- cache_examples=True, )
+ cache_examples=True,
+ )
- gr.Markdown("""
+ gr.Markdown(
+ """
Space built with PPDiffusers 🧨 by PaddleNLP.
[![Twitter Follow](https://img.shields.io/twitter/follow/ChenHenryWu?style=social)](https://twitter.com/ChenHenryWu)
- """)
+ """
+ )
demo.launch(debug=True, share=True, server_name="0.0.0.0", server_port=8581)
diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py
index d09df121e1427..15df9ac4402ff 100644
--- a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py
+++ b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py
@@ -22,13 +22,9 @@
def register_attention_control(model, controller):
def ca_forward(self, place_in_unet):
- def forward(hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs):
+ def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = self.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = self.to_q(hidden_states)
query = self.head_to_batch_dim(query)
@@ -41,11 +37,9 @@ def forward(hidden_states,
key = self.head_to_batch_dim(key)
value = self.head_to_batch_dim(value)
- attention_probs = self.get_attention_scores(query, key,
- attention_mask)
+ attention_probs = self.get_attention_scores(query, key, attention_mask)
- attention_probs = controller(attention_probs, is_cross,
- place_in_unet)
+ attention_probs = controller(attention_probs, is_cross, place_in_unet)
hidden_states = paddle.matmul(attention_probs, value)
hidden_states = self.batch_to_head_dim(hidden_states)
@@ -82,17 +76,12 @@ def register_recr(net_, count, place_in_unet):
def get_word_inds(text: str, word_place: int, tokenizer):
split_text = text.split(" ")
if type(word_place) is str:
- word_place = [
- i for i, word in enumerate(split_text) if word_place == word
- ]
+ word_place = [i for i, word in enumerate(split_text) if word_place == word]
elif type(word_place) is int:
word_place = [word_place]
out = []
if len(word_place) > 0:
- words_encode = [
- tokenizer.decode([item]).strip("#")
- for item in tokenizer.encode(text).input_ids
- ][1:-1]
+ words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1]
cur_len, ptr = 0, 0
for i in range(len(words_encode)):
@@ -106,14 +95,14 @@ def get_word_inds(text: str, word_place: int, tokenizer):
def update_alpha_time_word(
- alpha,
- bounds: Union[float, Tuple[float, float]],
- prompt_ind: int,
- word_inds: Optional[paddle.Tensor]=None, ):
+ alpha,
+ bounds: Union[float, Tuple[float, float]],
+ prompt_ind: int,
+ word_inds: Optional[paddle.Tensor] = None,
+):
if type(bounds) is float or bounds == 0:
bounds = 0, bounds
- start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] *
- alpha.shape[0])
+ start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
if word_inds is None:
word_inds = paddle.arange(alpha.shape[2])
alpha[:start, prompt_ind, word_inds] = 0
@@ -123,32 +112,26 @@ def update_alpha_time_word(
def get_time_words_attention_alpha(
- prompts,
- num_steps,
- cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[
- float, float]]],
- tokenizer,
- max_num_words=77, ):
+ prompts,
+ num_steps,
+ cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
+ tokenizer,
+ max_num_words=77,
+):
if type(cross_replace_steps) is not dict:
cross_replace_steps = {"default_": cross_replace_steps}
if "default_" not in cross_replace_steps:
cross_replace_steps["default_"] = (0.0, 1.0)
- alpha_time_words = paddle.zeros(
- [num_steps + 1, len(prompts) - 1, max_num_words])
+ alpha_time_words = paddle.zeros([num_steps + 1, len(prompts) - 1, max_num_words])
for i in range(len(prompts) - 1):
- alpha_time_words = update_alpha_time_word(
- alpha_time_words, cross_replace_steps["default_"], i)
+ alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i)
for key, item in cross_replace_steps.items():
if key != "default_":
- inds = [
- get_word_inds(prompts[i], key, tokenizer)
- for i in range(1, len(prompts))
- ]
+ inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
for i, ind in enumerate(inds):
if len(ind) > 0:
- alpha_time_words = update_alpha_time_word(alpha_time_words,
- item, i, ind)
+ alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
alpha_time_words = alpha_time_words.reshape(
- [num_steps + 1, len(prompts) - 1, 1, 1,
- max_num_words]) # time, batch, heads, pixels, words
+ [num_steps + 1, len(prompts) - 1, 1, 1, max_num_words]
+ ) # time, batch, heads, pixels, words
return alpha_time_words
diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py
index e1b1bc7bb6ccf..24c30b91e7f7d 100644
--- a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py
+++ b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py
@@ -66,8 +66,7 @@ def global_align(x, y, score):
for j in range(1, len(y) + 1):
left = matrix[i, j - 1] + score.gap
up = matrix[i - 1, j] + score.gap
- diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1],
- y[j - 1])
+ diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1])
matrix[i, j] = max(left, up, diag)
if matrix[i, j] == left:
trace_back[i, j] = 1
@@ -112,14 +111,20 @@ def get_mapper(x: str, y: str, tokenizer, max_len=77):
score = ScoreParams(0, 1, -1)
matrix, trace_back = global_align(x_seq, y_seq, score)
mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1]
- alphas = paddle.ones([max_len, ])
- alphas[:mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32")
+ alphas = paddle.ones(
+ [
+ max_len,
+ ]
+ )
+ alphas[: mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32")
mapper = paddle.zeros(
- [max_len, ],
- dtype=paddle.int64, )
- mapper[:mapper_base.shape[0]] = mapper_base[:, 1]
- mapper[mapper_base.shape[0]:] = len(y_seq) + paddle.arange(
- max_len - len(y_seq), dtype="int64")
+ [
+ max_len,
+ ],
+ dtype=paddle.int64,
+ )
+ mapper[: mapper_base.shape[0]] = mapper_base[:, 1]
+ mapper[mapper_base.shape[0] :] = len(y_seq) + paddle.arange(max_len - len(y_seq), dtype="int64")
return mapper, alphas
@@ -136,17 +141,12 @@ def get_refinement_mapper(prompts, tokenizer, max_len=77):
def get_word_inds(text: str, word_place: int, tokenizer):
split_text = text.split(" ")
if type(word_place) is str:
- word_place = [
- i for i, word in enumerate(split_text) if word_place == word
- ]
+ word_place = [i for i, word in enumerate(split_text) if word_place == word]
elif type(word_place) is int:
word_place = [word_place]
out = []
if len(word_place) > 0:
- words_encode = [
- tokenizer.decode([item]).strip("#")
- for item in tokenizer.encode(text).input_ids
- ][1:-1]
+ words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1]
cur_len, ptr = 0, 0
for i in range(len(words_encode)):
@@ -175,8 +175,7 @@ def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
cur_inds = 0
while i < max_len and j < max_len:
if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i:
- inds_source_, inds_target_ = inds_source[cur_inds], inds_target[
- cur_inds]
+ inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds]
if len(inds_source_) == len(inds_target_):
mapper[inds_source_, inds_target_] = 1
else:
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py
index 81a81d63cc039..e086453002714 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py
@@ -45,13 +45,10 @@ def __init__(self, paths, size=None, random_crop=False, labels=None):
if self.size is not None and self.size > 0:
self.rescaler = albumentations.SmallestMaxSize(max_size=self.size)
if not self.random_crop:
- self.cropper = albumentations.CenterCrop(
- height=self.size, width=self.size)
+ self.cropper = albumentations.CenterCrop(height=self.size, width=self.size)
else:
- self.cropper = albumentations.RandomCrop(
- height=self.size, width=self.size)
- self.preprocessor = albumentations.Compose(
- [self.rescaler, self.cropper])
+ self.cropper = albumentations.RandomCrop(height=self.size, width=self.size)
+ self.preprocessor = albumentations.Compose([self.rescaler, self.cropper])
else:
self.preprocessor = lambda **kwargs: kwargs
@@ -102,12 +99,7 @@ def __init__(self, cause, keys=None, visited=None):
super().__init__(message)
-def retrieve(list_or_dict,
- key,
- splitval="/",
- default=None,
- expand=True,
- pass_success=False):
+def retrieve(list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False):
"""Given a nested list or dict return the desired value at key expanding
callable nodes if necessary and :attr:`expand` is ``True``. The expansion
is done in-place.
@@ -150,11 +142,10 @@ def retrieve(list_or_dict,
if callable(list_or_dict):
if not expand:
raise KeyNotFoundError(
- ValueError(
- "Trying to get past callable node with expand=False."
- ),
+ ValueError("Trying to get past callable node with expand=False."),
keys=keys,
- visited=visited, )
+ visited=visited,
+ )
list_or_dict = list_or_dict()
parent[last_key] = list_or_dict
@@ -187,23 +178,19 @@ def retrieve(list_or_dict,
return list_or_dict, success
-def give_synsets_from_indices(indices,
- path_to_yaml="data/imagenet_idx_to_synset.yaml"):
+def give_synsets_from_indices(indices, path_to_yaml="data/imagenet_idx_to_synset.yaml"):
synsets = []
with open(path_to_yaml) as f:
di2s = yaml.load(f)
for idx in indices:
synsets.append(str(di2s[idx]))
- print("Using {} different synsets for construction of Restriced Imagenet.".
- format(len(synsets)))
+ print("Using {} different synsets for construction of Restriced Imagenet.".format(len(synsets)))
return synsets
def str_to_indices(string):
"""Expects a string in the format '32-123, 256, 280-321'"""
- assert not string.endswith(
- ","), "provided string '{}' ends with a comma, pls remove it".format(
- string)
+ assert not string.endswith(","), "provided string '{}' ends with a comma, pls remove it".format(string)
subs = string.split(",")
indices = []
for sub in subs:
@@ -236,8 +223,7 @@ def __init__(self, config=None):
self.config = config
if not type(self.config) == dict:
self.config = {}
- self.keep_orig_class_label = self.config.get("keep_orig_class_label",
- False)
+ self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
self.process_images = True # if False we skip loading & processing images and self.data contains filepaths
self._prepare()
self._prepare_synset_to_human()
@@ -255,14 +241,15 @@ def _prepare(self):
raise NotImplementedError()
def _filter_relpaths(self, relpaths):
- ignore = set(["n06596364_9591.JPEG", ])
- relpaths = [
- rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore
- ]
+ ignore = set(
+ [
+ "n06596364_9591.JPEG",
+ ]
+ )
+ relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
if "sub_indices" in self.config:
indices = str_to_indices(self.config["sub_indices"])
- synsets = give_synsets_from_indices(
- indices, path_to_yaml=self.idx2syn) # returns a list of strings
+ synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
files = []
for rpath in relpaths:
@@ -277,8 +264,7 @@ def _prepare_synset_to_human(self):
SIZE = 2655750
URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
self.human_dict = os.path.join(self.root, "synset_human.txt")
- if (not os.path.exists(self.human_dict) or
- not os.path.getsize(self.human_dict) == SIZE):
+ if not os.path.exists(self.human_dict) or not os.path.getsize(self.human_dict) == SIZE:
download(URL, self.human_dict)
def _prepare_idx_to_synset(self):
@@ -289,8 +275,7 @@ def _prepare_idx_to_synset(self):
def _prepare_human_to_integer_label(self):
URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
- self.human2integer = os.path.join(self.root,
- "imagenet1000_clsidx_to_labels.txt")
+ self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
if not os.path.exists(self.human2integer):
download(URL, self.human2integer)
with open(self.human2integer, "r") as f:
@@ -306,15 +291,13 @@ def _load(self):
self.relpaths = f.read().splitlines()
l1 = len(self.relpaths)
self.relpaths = self._filter_relpaths(self.relpaths)
- print("Removed {} files from filelist during filtering.".format(
- l1 - len(self.relpaths)))
+ print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
self.synsets = [p.split("/")[0] for p in self.relpaths]
self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
unique_synsets = np.unique(self.synsets)
- class_dict = dict((synset, i)
- for i, synset in enumerate(unique_synsets))
+ class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
if not self.keep_orig_class_label:
self.class_labels = [class_dict[s] for s in self.synsets]
else:
@@ -339,7 +322,8 @@ def _load(self):
self.abspaths,
labels=labels,
size=self.size,
- random_crop=self.random_crop, )
+ random_crop=self.random_crop,
+ )
else:
self.data = self.abspaths
@@ -348,8 +332,12 @@ class ImageNetTrain(ImageNetBase):
NAME = "ILSVRC2012_train"
URL = "http://www.image-net.org/challenges/LSVRC/2012/"
AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
- FILES = ["ILSVRC2012_img_train.tar", ]
- SIZES = [147897477120, ]
+ FILES = [
+ "ILSVRC2012_img_train.tar",
+ ]
+ SIZES = [
+ 147897477120,
+ ]
def __init__(self, process_images=True, data_root=None, **kwargs):
self.process_images = process_images
@@ -360,15 +348,13 @@ def _prepare(self):
if self.data_root:
self.root = os.path.join(self.data_root, self.NAME)
else:
- cachedir = os.environ.get("XDG_CACHE_HOME",
- os.path.expanduser("~/.cache"))
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
self.datadir = os.path.join(self.root, "data")
self.txt_filelist = os.path.join(self.root, "filelist.txt")
self.expected_length = 1281167
- self.random_crop = retrieve(
- self.config, "ImageNetTrain/random_crop", default=True)
+ self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop", default=True)
if not is_prepared(self.root):
# prep
print("Preparing dataset {} in {}".format(self.NAME, self.root))
@@ -376,8 +362,7 @@ def _prepare(self):
datadir = self.datadir
if not os.path.exists(datadir):
path = os.path.join(self.root, self.FILES[0])
- if (not os.path.exists(path) or
- not os.path.getsize(path) == self.SIZES[0]):
+ if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]:
import academictorrents as at
atpath = at.get(self.AT_HASH, datastore=self.root)
@@ -391,7 +376,7 @@ def _prepare(self):
print("Extracting sub-tars.")
subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
for subpath in tqdm(subpaths):
- subdir = subpath[:-len(".tar")]
+ subdir = subpath[: -len(".tar")]
os.makedirs(subdir, exist_ok=True)
with tarfile.open(subpath, "r:") as tar:
tar.extractall(path=subdir)
@@ -429,14 +414,12 @@ def _prepare(self):
if self.data_root:
self.root = os.path.join(self.data_root, self.NAME)
else:
- cachedir = os.environ.get("XDG_CACHE_HOME",
- os.path.expanduser("~/.cache"))
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
self.datadir = os.path.join(self.root, "data")
self.txt_filelist = os.path.join(self.root, "filelist.txt")
self.expected_length = 50000
- self.random_crop = retrieve(
- self.config, "ImageNetValidation/random_crop", default=False)
+ self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop", default=False)
if not is_prepared(self.root):
# prep
print("Preparing dataset {} in {}".format(self.NAME, self.root))
@@ -444,8 +427,7 @@ def _prepare(self):
datadir = self.datadir
if not os.path.exists(datadir):
path = os.path.join(self.root, self.FILES[0])
- if (not os.path.exists(path) or
- not os.path.getsize(path) == self.SIZES[0]):
+ if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]:
import academictorrents as at
atpath = at.get(self.AT_HASH, datastore=self.root)
@@ -457,8 +439,7 @@ def _prepare(self):
tar.extractall(path=datadir)
vspath = os.path.join(self.root, self.FILES[1])
- if (not os.path.exists(vspath) or
- not os.path.getsize(vspath) == self.SIZES[1]):
+ if not os.path.exists(vspath) or not os.path.getsize(vspath) == self.SIZES[1]:
download(self.VS_URL, vspath)
with open(vspath, "r") as f:
@@ -486,14 +467,15 @@ def _prepare(self):
class ImageNetSR(Dataset):
def __init__(
- self,
- size=None,
- degradation=None,
- downscale_f=4,
- min_crop_f=0.5,
- max_crop_f=1.0,
- random_crop=True,
- output_LR_image=False, ):
+ self,
+ size=None,
+ degradation=None,
+ downscale_f=4,
+ min_crop_f=0.5,
+ max_crop_f=1.0,
+ random_crop=True,
+ output_LR_image=False,
+ ):
"""
Imagenet Superresolution Dataloader
Performs following ops in order:
@@ -522,30 +504,22 @@ def __init__(
assert max_crop_f <= 1.0
self.center_crop = not random_crop
- self.image_rescaler = albumentations.SmallestMaxSize(
- max_size=size, interpolation=cv2.INTER_AREA)
+ self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
- self.pil_interpolation = (
- False # gets reset later if incase interp_op is from pillow
- )
+ self.pil_interpolation = False # gets reset later if incase interp_op is from pillow
if degradation == "bsrgan":
- self.degradation_process = partial(
- degradation_fn_bsr, sf=downscale_f)
+ self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
elif degradation == "bsrgan_light":
- self.degradation_process = partial(
- degradation_fn_bsr_light, sf=downscale_f)
+ self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
else:
self.pil_interpolation = degradation.startswith("pil_")
if self.pil_interpolation:
interpolation_fn = degradation.replace("pil_", "")
- self.degradation_process = partial(
- TF.resize,
- size=self.LR_size,
- interpolation=interpolation_fn)
+ self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
else:
interpolation_fn = {
"cv_nearest": cv2.INTER_NEAREST,
@@ -555,7 +529,8 @@ def __init__(
"cv_lanczos": cv2.INTER_LANCZOS4,
}[degradation]
self.degradation_process = albumentations.SmallestMaxSize(
- max_size=self.LR_size, interpolation=interpolation_fn)
+ max_size=self.LR_size, interpolation=interpolation_fn
+ )
def __len__(self):
return len(self.base)
@@ -570,17 +545,14 @@ def __getitem__(self, i):
image = np.array(image).astype(np.uint8)
min_side_len = min(image.shape[:2])
- crop_side_len = min_side_len * np.random.uniform(
- self.min_crop_f, self.max_crop_f, size=None)
+ crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
crop_side_len = int(crop_side_len)
if self.center_crop:
- self.cropper = albumentations.CenterCrop(
- height=crop_side_len, width=crop_side_len)
+ self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
else:
- self.cropper = albumentations.RandomCrop(
- height=crop_side_len, width=crop_side_len)
+ self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
image = self.cropper(image=image)["image"]
image = self.image_rescaler(image=image)["image"]
@@ -592,11 +564,9 @@ def __getitem__(self, i):
LR_image = np.array(LR_image).astype(np.uint8)
else:
LR_image = self.degradation_process(image=image)["image"]
- example["LR_image"] = ((
- LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]))
+ example["LR_image"] = (LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1])
- example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose(
- [2, 0, 1])
+ example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1])
return example
@@ -608,7 +578,9 @@ def __init__(self, **kwargs):
def get_base(self):
with open("data/imagenet_train_hr_indices.p", "rb") as f:
indices = pickle.load(f)
- dset = ImageNetTrain(process_images=False, )
+ dset = ImageNetTrain(
+ process_images=False,
+ )
return Subset(dset, indices)
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py
index 37224cba9a9d9..890a4eea89241 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py
@@ -13,5 +13,4 @@
# limitations under the License.
from .bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
-from .bsrgan_light import \
- degradation_bsrgan_variant as degradation_fn_bsr_light
+from .bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py
index a50493c2591ea..1efdbaa95c8ca 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py
@@ -51,7 +51,7 @@ def modcrop_np(img, sf):
"""
w, h = img.shape[:2]
im = np.copy(img)
- return im[:w - w % sf, :h - h % sf, ...]
+ return im[: w - w % sf, : h - h % sf, ...]
"""
@@ -69,7 +69,7 @@ def analytic_kernel(k):
# Loop over the small kernel to fill the big one
for r in range(k_size):
for c in range(k_size):
- big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
+ big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k
# Crop the edges of the big kernel to ignore very small values and increase run time of SR
crop = k_size // 2
cropped_big_k = big_k[crop:-crop, crop:-crop]
@@ -90,9 +90,9 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
"""
v = np.dot(
- np.array(
- [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]),
- np.array([1.0, 0.0]), )
+ np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]),
+ np.array([1.0, 0.0]),
+ )
V = np.array([[v[0], v[1]], [v[1], -v[0]]])
D = np.array([[l1, 0], [0, l2]])
Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
@@ -161,11 +161,12 @@ def blur(x, k):
def gen_kernel(
- k_size=np.array([15, 15]),
- scale_factor=np.array([4, 4]),
- min_var=0.6,
- max_var=10.0,
- noise_level=0, ):
+ k_size=np.array([15, 15]),
+ scale_factor=np.array([4, 4]),
+ min_var=0.6,
+ max_var=10.0,
+ noise_level=0,
+):
""" "
# modified version of https://github.com/assafshocher/BlindSR_dataset_generator
# Kai Zhang
@@ -180,14 +181,12 @@ def gen_kernel(
# Set COV matrix using Lambdas and Theta
LAMBDA = np.diag([lambda_1, lambda_2])
- Q = np.array(
- [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
- SIGMA = Q @LAMBDA @Q.T
+ Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
+ SIGMA = Q @ LAMBDA @ Q.T
INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
# Set expectation position (shifting kernel for aligned image)
- MU = k_size // 2 - 0.5 * (scale_factor - 1
- ) # - 0.5 * (scale_factor - k_size % 2)
+ MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
MU = MU[None, None, :, None]
# Create meshgrid for Gaussian
@@ -197,7 +196,7 @@ def gen_kernel(
# Calcualte Gaussian for every pixel of the kernel
ZZ = Z - MU
ZZ_t = ZZ.transpose(0, 1, 3, 2)
- raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @INV_SIGMA @ZZ)) * (1 + noise)
+ raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
# shift the kernel so it will be centered
# raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
@@ -212,8 +211,7 @@ def fspecial_gaussian(hsize, sigma):
hsize = [hsize, hsize]
siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
std = sigma
- [x, y] = np.meshgrid(
- np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
+ [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
arg = -(x * x + y * y) / (2 * std * std)
h = np.exp(arg)
h[h < scipy.finfo(float).eps * h.max()] = 0
@@ -279,9 +277,7 @@ def srmd_degradation(x, k, sf=3):
year={2018}
}
"""
- x = ndimage.filters.convolve(
- x, np.expand_dims(
- k, axis=2), mode="wrap") # 'nearest' | 'mirror'
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror'
x = bicubic_degradation(x, sf=sf)
return x
@@ -359,13 +355,11 @@ def add_blur(img, sf=4):
ksize=2 * random.randint(2, 11) + 3,
theta=random.random() * np.pi,
l1=l1,
- l2=l2, )
+ l2=l2,
+ )
else:
- k = fspecial("gaussian", 2 * random.randint(2, 11) + 3,
- wd * random.random())
- img = ndimage.filters.convolve(
- img, np.expand_dims(
- k, axis=2), mode="mirror")
+ k = fspecial("gaussian", 2 * random.randint(2, 11) + 3, wd * random.random())
+ img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror")
return img
@@ -381,7 +375,8 @@ def add_resize(img, sf=4):
img = cv2.resize(
img,
(int(sf1 * img.shape[1]), int(sf1 * img.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
img = np.clip(img, 0.0, 1.0)
return img
@@ -391,18 +386,15 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
noise_level = random.randint(noise_level1, noise_level2)
rnum = np.random.rand()
if rnum > 0.6: # add color Gaussian noise
- img = img + np.random.normal(0, noise_level / 255.0,
- img.shape).astype(np.float32)
+ img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
elif rnum < 0.4: # add grayscale Gaussian noise
- img = img + np.random.normal(0, noise_level / 255.0,
- (*img.shape[:2], 1)).astype(np.float32)
+ img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
else: # add noise
L = noise_level2 / 255.0
D = np.diag(np.random.rand(3))
U = orth(np.random.rand(3, 3))
conv = np.dot(np.dot(np.transpose(U), D), U)
- img = img + np.random.multivariate_normal(
- [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+ img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
img = np.clip(img, 0.0, 1.0)
return img
@@ -412,32 +404,28 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
img = np.clip(img, 0.0, 1.0)
rnum = random.random()
if rnum > 0.6:
- img += img * np.random.normal(0, noise_level / 255.0,
- img.shape).astype(np.float32)
+ img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
elif rnum < 0.4:
- img += img * np.random.normal(0, noise_level / 255.0,
- (*img.shape[:2], 1)).astype(np.float32)
+ img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
else:
L = noise_level2 / 255.0
D = np.diag(np.random.rand(3))
U = orth(np.random.rand(3, 3))
conv = np.dot(np.dot(np.transpose(U), D), U)
- img += img * np.random.multivariate_normal(
- [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+ img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
img = np.clip(img, 0.0, 1.0)
return img
def add_Poisson_noise(img):
img = np.clip((img * 255.0).round(), 0, 255) / 255.0
- vals = 10**(2 * random.random() + 2.0) # [2, 4]
+ vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
if random.random() < 0.5:
img = np.random.poisson(img * vals).astype(np.float32) / vals
else:
img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0
- noise_gray = (np.random.poisson(img_gray * vals).astype(np.float32) /
- vals - img_gray)
+ noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
img += noise_gray[:, :, np.newaxis]
img = np.clip(img, 0.0, 1.0)
return img
@@ -446,8 +434,7 @@ def add_Poisson_noise(img):
def add_JPEG_noise(img):
quality_factor = random.randint(30, 95)
img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
- result, encimg = cv2.imencode(
- ".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
+ result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
img = cv2.imdecode(encimg, 1)
img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
return img
@@ -457,11 +444,10 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64):
h, w = lq.shape[:2]
rnd_h = random.randint(0, h - lq_patchsize)
rnd_w = random.randint(0, w - lq_patchsize)
- lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
+ lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :]
rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
- hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize
- * sf, :]
+ hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :]
return lq, hq
@@ -482,7 +468,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
sf_ori = sf
h1, w1 = img.shape[:2]
- img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
h, w = img.shape[:2]
if h < lq_patchsize * sf or w < lq_patchsize * sf:
@@ -495,7 +481,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
img = cv2.resize(
img,
(int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
img = util.imresize_np(img, 1 / 2, True)
img = np.clip(img, 0.0, 1.0)
@@ -506,7 +493,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
if idx1 > idx2: # keep downsample3 last
shuffle_order[idx1], shuffle_order[idx2] = (
shuffle_order[idx2],
- shuffle_order[idx1], )
+ shuffle_order[idx1],
+ )
for i in shuffle_order:
@@ -524,15 +512,13 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
img = cv2.resize(
img,
(int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf))
k_shifted = shift_pixel(k, sf)
- k_shifted = k_shifted / k_shifted.sum(
- ) # blur with shifted kernel
- img = ndimage.filters.convolve(
- img, np.expand_dims(
- k_shifted, axis=2), mode="mirror")
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror")
img = img[0::sf, 0::sf, ...] # nearest downsampling
img = np.clip(img, 0.0, 1.0)
@@ -541,7 +527,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
img = cv2.resize(
img,
(int(1 / sf * a), int(1 / sf * b)),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
img = np.clip(img, 0.0, 1.0)
elif i == 4:
@@ -585,7 +572,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
_, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
h1, w1 = image.shape[:2]
- image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
h, w = image.shape[:2]
if sf == 4 and random.random() < scale2_prob: # downsample1
@@ -593,7 +580,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
image = cv2.resize(
image,
(int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
image = util.imresize_np(image, 1 / 2, True)
image = np.clip(image, 0.0, 1.0)
@@ -604,7 +592,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
if idx1 > idx2: # keep downsample3 last
shuffle_order[idx1], shuffle_order[idx2] = (
shuffle_order[idx2],
- shuffle_order[idx1], )
+ shuffle_order[idx1],
+ )
for i in shuffle_order:
@@ -621,17 +610,14 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
sf1 = random.uniform(1, 2 * sf)
image = cv2.resize(
image,
- (int(1 / sf1 * image.shape[1]),
- int(1 / sf1 * image.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf))
k_shifted = shift_pixel(k, sf)
- k_shifted = k_shifted / k_shifted.sum(
- ) # blur with shifted kernel
- image = ndimage.filters.convolve(
- image, np.expand_dims(
- k_shifted, axis=2), mode="mirror")
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror")
image = image[0::sf, 0::sf, ...] # nearest downsampling
image = np.clip(image, 0.0, 1.0)
@@ -640,7 +626,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
image = cv2.resize(
image,
(int(1 / sf * a), int(1 / sf * b)),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
image = np.clip(image, 0.0, 1.0)
elif i == 4:
@@ -673,19 +660,21 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
img_lq = deg_fn(img)["image"]
img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
print(img_lq)
- img_lq_bicubic = albumentations.SmallestMaxSize(
- max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
+ img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[
+ "image"
+ ]
print(img_lq.shape)
print("bicubic", img_lq_bicubic.shape)
print(img_hq.shape)
lq_nearest = cv2.resize(
util.single2uint(img_lq),
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
- interpolation=0, )
+ interpolation=0,
+ )
lq_bicubic_nearest = cv2.resize(
util.single2uint(img_lq_bicubic),
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
- interpolation=0, )
- img_concat = np.concatenate(
- [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
+ interpolation=0,
+ )
+ img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
util.imsave(img_concat, str(i) + ".png")
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py
index 86127e21d672e..94a515d93d914 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py
@@ -29,6 +29,7 @@
from scipy.linalg import orth
from . import utils_image as util
+
"""
# --------------------------------------------
# Super-Resolution
@@ -51,7 +52,7 @@ def modcrop_np(img, sf):
"""
w, h = img.shape[:2]
im = np.copy(img)
- return im[:w - w % sf, :h - h % sf, ...]
+ return im[: w - w % sf, : h - h % sf, ...]
"""
@@ -69,7 +70,7 @@ def analytic_kernel(k):
# Loop over the small kernel to fill the big one
for r in range(k_size):
for c in range(k_size):
- big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
+ big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k
# Crop the edges of the big kernel to ignore very small values and increase run time of SR
crop = k_size // 2
cropped_big_k = big_k[crop:-crop, crop:-crop]
@@ -90,9 +91,9 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
"""
v = np.dot(
- np.array(
- [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]),
- np.array([1.0, 0.0]), )
+ np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]),
+ np.array([1.0, 0.0]),
+ )
V = np.array([[v[0], v[1]], [v[1], -v[0]]])
D = np.array([[l1, 0], [0, l2]])
Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
@@ -161,11 +162,12 @@ def blur(x, k):
def gen_kernel(
- k_size=np.array([15, 15]),
- scale_factor=np.array([4, 4]),
- min_var=0.6,
- max_var=10.0,
- noise_level=0, ):
+ k_size=np.array([15, 15]),
+ scale_factor=np.array([4, 4]),
+ min_var=0.6,
+ max_var=10.0,
+ noise_level=0,
+):
""" "
# modified version of https://github.com/assafshocher/BlindSR_dataset_generator
# Kai Zhang
@@ -180,14 +182,12 @@ def gen_kernel(
# Set COV matrix using Lambdas and Theta
LAMBDA = np.diag([lambda_1, lambda_2])
- Q = np.array(
- [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
- SIGMA = Q @LAMBDA @Q.T
+ Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
+ SIGMA = Q @ LAMBDA @ Q.T
INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
# Set expectation position (shifting kernel for aligned image)
- MU = k_size // 2 - 0.5 * (scale_factor - 1
- ) # - 0.5 * (scale_factor - k_size % 2)
+ MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
MU = MU[None, None, :, None]
# Create meshgrid for Gaussian
@@ -197,7 +197,7 @@ def gen_kernel(
# Calcualte Gaussian for every pixel of the kernel
ZZ = Z - MU
ZZ_t = ZZ.transpose(0, 1, 3, 2)
- raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @INV_SIGMA @ZZ)) * (1 + noise)
+ raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
# shift the kernel so it will be centered
# raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
@@ -212,8 +212,7 @@ def fspecial_gaussian(hsize, sigma):
hsize = [hsize, hsize]
siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
std = sigma
- [x, y] = np.meshgrid(
- np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
+ [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
arg = -(x * x + y * y) / (2 * std * std)
h = np.exp(arg)
h[h < scipy.finfo(float).eps * h.max()] = 0
@@ -279,9 +278,7 @@ def srmd_degradation(x, k, sf=3):
year={2018}
}
"""
- x = ndimage.filters.convolve(
- x, np.expand_dims(
- k, axis=2), mode="wrap") # 'nearest' | 'mirror'
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror'
x = bicubic_degradation(x, sf=sf)
return x
@@ -359,16 +356,10 @@ def add_blur(img, sf=4):
if random.random() < 0.5:
l1 = wd2 * random.random()
l2 = wd2 * random.random()
- k = anisotropic_Gaussian(
- ksize=random.randint(2, 11) + 3,
- theta=random.random() * np.pi,
- l1=l1,
- l2=l2)
+ k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
else:
k = fspecial("gaussian", random.randint(2, 4) + 3, wd * random.random())
- img = ndimage.filters.convolve(
- img, np.expand_dims(
- k, axis=2), mode="mirror")
+ img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror")
return img
@@ -384,7 +375,8 @@ def add_resize(img, sf=4):
img = cv2.resize(
img,
(int(sf1 * img.shape[1]), int(sf1 * img.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
img = np.clip(img, 0.0, 1.0)
return img
@@ -394,18 +386,15 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
noise_level = random.randint(noise_level1, noise_level2)
rnum = np.random.rand()
if rnum > 0.6: # add color Gaussian noise
- img = img + np.random.normal(0, noise_level / 255.0,
- img.shape).astype(np.float32)
+ img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
elif rnum < 0.4: # add grayscale Gaussian noise
- img = img + np.random.normal(0, noise_level / 255.0,
- (*img.shape[:2], 1)).astype(np.float32)
+ img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
else: # add noise
L = noise_level2 / 255.0
D = np.diag(np.random.rand(3))
U = orth(np.random.rand(3, 3))
conv = np.dot(np.dot(np.transpose(U), D), U)
- img = img + np.random.multivariate_normal(
- [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+ img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
img = np.clip(img, 0.0, 1.0)
return img
@@ -415,32 +404,28 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
img = np.clip(img, 0.0, 1.0)
rnum = random.random()
if rnum > 0.6:
- img += img * np.random.normal(0, noise_level / 255.0,
- img.shape).astype(np.float32)
+ img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
elif rnum < 0.4:
- img += img * np.random.normal(0, noise_level / 255.0,
- (*img.shape[:2], 1)).astype(np.float32)
+ img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
else:
L = noise_level2 / 255.0
D = np.diag(np.random.rand(3))
U = orth(np.random.rand(3, 3))
conv = np.dot(np.dot(np.transpose(U), D), U)
- img += img * np.random.multivariate_normal(
- [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+ img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
img = np.clip(img, 0.0, 1.0)
return img
def add_Poisson_noise(img):
img = np.clip((img * 255.0).round(), 0, 255) / 255.0
- vals = 10**(2 * random.random() + 2.0) # [2, 4]
+ vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
if random.random() < 0.5:
img = np.random.poisson(img * vals).astype(np.float32) / vals
else:
img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0
- noise_gray = (np.random.poisson(img_gray * vals).astype(np.float32) /
- vals - img_gray)
+ noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
img += noise_gray[:, :, np.newaxis]
img = np.clip(img, 0.0, 1.0)
return img
@@ -449,8 +434,7 @@ def add_Poisson_noise(img):
def add_JPEG_noise(img):
quality_factor = random.randint(80, 95)
img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
- result, encimg = cv2.imencode(
- ".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
+ result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
img = cv2.imdecode(encimg, 1)
img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
return img
@@ -460,11 +444,10 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64):
h, w = lq.shape[:2]
rnd_h = random.randint(0, h - lq_patchsize)
rnd_w = random.randint(0, w - lq_patchsize)
- lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
+ lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :]
rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
- hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize
- * sf, :]
+ hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :]
return lq, hq
@@ -485,7 +468,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
sf_ori = sf
h1, w1 = img.shape[:2]
- img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
h, w = img.shape[:2]
if h < lq_patchsize * sf or w < lq_patchsize * sf:
@@ -498,7 +481,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
img = cv2.resize(
img,
(int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
img = util.imresize_np(img, 1 / 2, True)
img = np.clip(img, 0.0, 1.0)
@@ -509,7 +493,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
if idx1 > idx2: # keep downsample3 last
shuffle_order[idx1], shuffle_order[idx2] = (
shuffle_order[idx2],
- shuffle_order[idx1], )
+ shuffle_order[idx1],
+ )
for i in shuffle_order:
@@ -527,15 +512,13 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
img = cv2.resize(
img,
(int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf))
k_shifted = shift_pixel(k, sf)
- k_shifted = k_shifted / k_shifted.sum(
- ) # blur with shifted kernel
- img = ndimage.filters.convolve(
- img, np.expand_dims(
- k_shifted, axis=2), mode="mirror")
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror")
img = img[0::sf, 0::sf, ...] # nearest downsampling
img = np.clip(img, 0.0, 1.0)
@@ -544,7 +527,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
img = cv2.resize(
img,
(int(1 / sf * a), int(1 / sf * b)),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
img = np.clip(img, 0.0, 1.0)
elif i == 4:
@@ -588,7 +572,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
_, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
h1, w1 = image.shape[:2]
- image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
h, w = image.shape[:2]
if sf == 4 and random.random() < scale2_prob: # downsample1
@@ -596,7 +580,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
image = cv2.resize(
image,
(int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
image = util.imresize_np(image, 1 / 2, True)
image = np.clip(image, 0.0, 1.0)
@@ -607,7 +592,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
if idx1 > idx2: # keep downsample3 last
shuffle_order[idx1], shuffle_order[idx2] = (
shuffle_order[idx2],
- shuffle_order[idx1], )
+ shuffle_order[idx1],
+ )
for i in shuffle_order:
@@ -624,17 +610,14 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
sf1 = random.uniform(1, 2 * sf)
image = cv2.resize(
image,
- (int(1 / sf1 * image.shape[1]),
- int(1 / sf1 * image.shape[0])),
- interpolation=random.choice([1, 2, 3]), )
+ (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
+ interpolation=random.choice([1, 2, 3]),
+ )
else:
k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf))
k_shifted = shift_pixel(k, sf)
- k_shifted = k_shifted / k_shifted.sum(
- ) # blur with shifted kernel
- image = ndimage.filters.convolve(
- image, np.expand_dims(
- k_shifted, axis=2), mode="mirror")
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror")
image = image[0::sf, 0::sf, ...] # nearest downsampling
image = np.clip(image, 0.0, 1.0)
@@ -644,7 +627,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
image = cv2.resize(
image,
(int(1 / sf * a), int(1 / sf * b)),
- interpolation=random.choice([1, 2, 3]), )
+ interpolation=random.choice([1, 2, 3]),
+ )
image = np.clip(image, 0.0, 1.0)
elif i == 4:
@@ -677,19 +661,21 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
img_lq = deg_fn(img)["image"]
img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
print(img_lq)
- img_lq_bicubic = albumentations.SmallestMaxSize(
- max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
+ img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[
+ "image"
+ ]
print(img_lq.shape)
print("bicubic", img_lq_bicubic.shape)
print(img_hq.shape)
lq_nearest = cv2.resize(
util.single2uint(img_lq),
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
- interpolation=0, )
+ interpolation=0,
+ )
lq_bicubic_nearest = cv2.resize(
util.single2uint(img_lq_bicubic),
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
- interpolation=0, )
- img_concat = np.concatenate(
- [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
+ interpolation=0,
+ )
+ img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
util.imsave(img_concat, str(i) + ".png")
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py
index 1e21fe66a10b6..be3bdaa3321cc 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py
@@ -71,14 +71,12 @@ def cubic(x):
absx = paddle.abs(x)
absx2 = absx**2
absx3 = absx**3
- return (1.5 * absx3 - 2.5 * absx2 + 1) * (
- (absx <= 1).astype(absx.dtype)) + (
- -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2) * ((
- (absx > 1) * (absx <= 2)).astype(absx.dtype))
+ return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).astype(absx.dtype)) + (
+ -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
+ ) * (((absx > 1) * (absx <= 2)).astype(absx.dtype))
-def calculate_weights_indices(in_length, out_length, scale, kernel,
- kernel_width, antialiasing):
+def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
if (scale < 1) and (antialiasing):
# Use a modified kernel to simultaneously interpolate and antialias- larger kernel width
kernel_width = kernel_width / scale
@@ -102,14 +100,13 @@ def calculate_weights_indices(in_length, out_length, scale, kernel,
# The indices of the input pixels involved in computing the k-th output
# pixel are in row k of the indices matrix.
- indices = left.reshape([out_length, 1]).expand(
- [out_length, P]) + paddle.linspace(0, P - 1, P).reshape([1, P]).expand(
- [out_length, P])
+ indices = left.reshape([out_length, 1]).expand([out_length, P]) + paddle.linspace(0, P - 1, P).reshape(
+ [1, P]
+ ).expand([out_length, P])
# The weights used to compute the k-th output pixel are in row k of the
# weights matrix.
- distance_to_center = u.reshape([out_length, 1]).expand(
- [out_length, P]) - indices
+ distance_to_center = u.reshape([out_length, 1]).expand([out_length, P]) - indices
# apply cubic kernel
if (scale < 1) and (antialiasing):
weights = scale * cubic(distance_to_center * scale)
@@ -158,13 +155,15 @@ def imresize_np(img, scale, antialiasing=True):
# get weights and indices
weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
- in_H, out_H, scale, kernel, kernel_width, antialiasing)
+ in_H, out_H, scale, kernel, kernel_width, antialiasing
+ )
weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
- in_W, out_W, scale, kernel, kernel_width, antialiasing)
+ in_W, out_W, scale, kernel, kernel_width, antialiasing
+ )
# process H dimension
# symmetric copying
img_aug = paddle.zeros([in_H + sym_len_Hs + sym_len_He, in_W, in_C])
- img_aug[sym_len_Hs:sym_len_Hs + in_H] = img
+ img_aug[sym_len_Hs : sym_len_Hs + in_H] = img
sym_patch = img[:sym_len_Hs, :, :]
inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64")
@@ -174,20 +173,19 @@ def imresize_np(img, scale, antialiasing=True):
sym_patch = img[-sym_len_He:, :, :]
inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64")
sym_patch_inv = sym_patch.index_select(inv_idx, axis=0)
- img_aug[sym_len_Hs + in_H:sym_len_Hs + in_H + sym_len_He] = sym_patch_inv
+ img_aug[sym_len_Hs + in_H : sym_len_Hs + in_H + sym_len_He] = sym_patch_inv
out_1 = paddle.zeros([out_H, in_W, in_C])
kernel_width = weights_H.shape[1]
for i in range(out_H):
idx = int(indices_H[i][0])
for j in range(out_C):
- out_1[i, :, j] = (img_aug[idx:idx + kernel_width, :, j]
- .transpose([1, 0]).mv(weights_H[i]))
+ out_1[i, :, j] = img_aug[idx : idx + kernel_width, :, j].transpose([1, 0]).mv(weights_H[i])
# process W dimension
# symmetric copying
out_1_aug = paddle.zeros([out_H, in_W + sym_len_Ws + sym_len_We, in_C])
- out_1_aug[:, sym_len_Ws:sym_len_Ws + in_W] = out_1
+ out_1_aug[:, sym_len_Ws : sym_len_Ws + in_W] = out_1
sym_patch = out_1[:, :sym_len_Ws, :]
inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64")
@@ -197,16 +195,14 @@ def imresize_np(img, scale, antialiasing=True):
sym_patch = out_1[:, -sym_len_We:, :]
inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64")
sym_patch_inv = sym_patch.index_select(inv_idx, axis=1)
- out_1_aug[:, sym_len_Ws + in_W:sym_len_Ws + in_W +
- sym_len_We] = sym_patch_inv
+ out_1_aug[:, sym_len_Ws + in_W : sym_len_Ws + in_W + sym_len_We] = sym_patch_inv
out_2 = paddle.zeros([out_H, out_W, in_C])
kernel_width = weights_W.shape[1]
for i in range(out_W):
idx = int(indices_W[i][0])
for j in range(out_C):
- out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(
- weights_W[i])
+ out_2[:, i, j] = out_1_aug[:, idx : idx + kernel_width, j].mv(weights_W[i])
if need_squeeze:
out_2 = out_2.squeeze()
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py
index 3d8311776fdb3..a1d4f642125ae 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py
@@ -22,47 +22,36 @@
import paddle.nn.functional as F
from paddle.utils.download import get_weights_path_from_url
-from ppdiffusers.initializer import (constant_, normal_,
- reset_initialized_parameter)
+from ppdiffusers.initializer import constant_, normal_, reset_initialized_parameter
model_urls = {
"vgg16": (
"https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/lpips_vgg16.pdparams",
- "a1583475db9e49334735f2866847ae41", ),
+ "a1583475db9e49334735f2866847ae41",
+ ),
"vgg_netlin": (
"https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/vgg_netlin.pdparams",
- "f3ae85f16a1a243e789606ae0c4a59a1", ),
+ "f3ae85f16a1a243e789606ae0c4a59a1",
+ ),
}
class ActNorm(nn.Layer):
- def __init__(self,
- num_features,
- logdet=False,
- affine=True,
- allow_reverse_init=False):
+ def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False):
assert affine
super().__init__()
self.logdet = logdet
- self.loc = self.create_parameter(
- (1, num_features, 1, 1),
- default_initializer=nn.initializer.Constant(0))
- self.scale = self.create_parameter(
- (1, num_features, 1, 1),
- default_initializer=nn.initializer.Constant(1))
+ self.loc = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(0))
+ self.scale = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(1))
self.allow_reverse_init = allow_reverse_init
- self.register_buffer(
- "initialized", paddle.to_tensor(
- 0, dtype=paddle.int64))
+ self.register_buffer("initialized", paddle.to_tensor(0, dtype=paddle.int64))
@paddle.no_grad()
def initialize(self, input):
flatten = input.transpose([1, 0, 2, 3]).reshape([input.shape[1], -1])
- mean = (flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3)
- .transpose([1, 0, 2, 3]))
- std = (flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3)
- .transpose([1, 0, 2, 3]))
+ mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3])
+ std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3])
self.loc.set_value(-mean)
self.scale.set_value(1 / (std + 1e-6))
@@ -80,9 +69,7 @@ def forward(self, input, reverse=False):
if self.training and self.initialized.item() == 0:
self.initialize(input)
- self.initialized.set_value(
- paddle.to_tensor(
- 1, dtype=self.initialized.dtype))
+ self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype))
h = self.scale * (input + self.loc)
@@ -106,9 +93,7 @@ def reverse(self, output):
)
else:
self.initialize(output)
- self.initialized.set_value(
- paddle.to_tensor(
- 1, dtype=self.initialized.dtype))
+ self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype))
if len(output.shape) == 2:
output = output[:, :, None, None]
@@ -137,8 +122,7 @@ def hinge_d_loss(logits_real, logits_fake):
def vanilla_d_loss(logits_real, logits_fake):
- d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) +
- paddle.mean(F.softplus(logits_fake)))
+ d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) + paddle.mean(F.softplus(logits_fake)))
return d_loss
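For reference, `hinge_d_loss` (whose body lies outside this hunk) and the reformatted `vanilla_d_loss` follow the usual taming-transformers pattern. A small self-contained sketch of both, assuming `logits_real` / `logits_fake` are raw discriminator outputs:

```python
import paddle
import paddle.nn.functional as F


def hinge_d_loss(logits_real, logits_fake):
    # standard hinge form: push real logits above +1, fake logits below -1
    return 0.5 * (paddle.mean(F.relu(1.0 - logits_real)) + paddle.mean(F.relu(1.0 + logits_fake)))


def vanilla_d_loss(logits_real, logits_fake):
    # softplus(-x) == -log(sigmoid(x)), so this is the non-saturating BCE form
    return 0.5 * (paddle.mean(F.softplus(-logits_real)) + paddle.mean(F.softplus(logits_fake)))


real, fake = paddle.randn([8, 1, 30, 30]), paddle.randn([8, 1, 30, 30])
print(float(hinge_d_loss(real, fake)), float(vanilla_d_loss(real, fake)))
```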
@@ -170,8 +154,7 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
norm_layer = nn.BatchNorm2D
else:
norm_layer = ActNorm
- if (type(norm_layer) == functools.
- partial): # no need to use bias as BatchNorm2d has affine parameters
+ if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
use_bias = norm_layer.func != nn.BatchNorm2D
else:
use_bias = norm_layer != nn.BatchNorm2D
@@ -179,8 +162,7 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
kw = 4
padw = 1
sequence = [
- nn.Conv2D(
- input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
+ nn.Conv2D(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
nn.LeakyReLU(0.2),
]
nf_mult = 1
@@ -195,7 +177,8 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
kernel_size=kw,
stride=2,
padding=padw,
- bias_attr=use_bias, ),
+ bias_attr=use_bias,
+ ),
norm_layer(ndf * nf_mult),
nn.LeakyReLU(0.2),
]
@@ -209,14 +192,14 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
kernel_size=kw,
stride=1,
padding=padw,
- bias_attr=use_bias, ),
+ bias_attr=use_bias,
+ ),
norm_layer(ndf * nf_mult),
nn.LeakyReLU(0.2),
]
sequence += [
- nn.Conv2D(
- ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)
+ nn.Conv2D(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)
] # output 1 channel prediction map
self.main = nn.Sequential(*sequence)
@@ -229,10 +212,8 @@ def spatial_average(in_tens, keepdim=True):
return in_tens.mean([2, 3], keepdim=keepdim)
-def upsample(in_tens,
- out_HW=(64, 64)): # assumes scale factor is same for H and W
- return nn.Upsample(
- size=out_HW, mode="bilinear", align_corners=False)(in_tens)
+def upsample(in_tens, out_HW=(64, 64)): # assumes scale factor is same for H and W
+ return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens)
def normalize_tensor(in_feat, eps=1e-10):
@@ -246,10 +227,15 @@ class NetLinLayer(nn.Layer):
def __init__(self, chn_in, chn_out=1, use_dropout=False):
super(NetLinLayer, self).__init__()
- layers = ([nn.Dropout(), ] if (use_dropout) else [])
+ layers = (
+ [
+ nn.Dropout(),
+ ]
+ if (use_dropout)
+ else []
+ )
layers += [
- nn.Conv2D(
- chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False),
+ nn.Conv2D(chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False),
]
self.model = nn.Sequential(*layers)
@@ -262,14 +248,12 @@ def __init__(self):
super(ScalingLayer, self).__init__()
self.register_buffer(
"shift",
- paddle.to_tensor(
- np.asarray([-0.030, -0.088, -0.188]).astype("float32")[
- None, :, None, None]), )
+ paddle.to_tensor(np.asarray([-0.030, -0.088, -0.188]).astype("float32")[None, :, None, None]),
+ )
self.register_buffer(
"scale",
- paddle.to_tensor(
- np.asarray([0.458, 0.448, 0.450]).astype("float32")[
- None, :, None, None]), )
+ paddle.to_tensor(np.asarray([0.458, 0.448, 0.450]).astype("float32")[None, :, None, None]),
+ )
def forward(self, inp):
return (inp - self.shift) / self.scale
@@ -280,8 +264,7 @@ def __init__(self, pretrained=True, requires_grad=False):
super(VGG16, self).__init__()
vgg_model = paddle.vision.models.vgg16(pretrained=False)
if pretrained:
- state_dict = paddle.load(
- get_weights_path_from_url(*model_urls["vgg16"]))
+ state_dict = paddle.load(get_weights_path_from_url(*model_urls["vgg16"]))
vgg_model.set_state_dict(state_dict)
vgg_pretrained_features = vgg_model.features
self.slice1 = nn.Sequential()
@@ -315,9 +298,7 @@ def forward(self, X):
h_relu4_3 = h
h = self.slice5(h)
h_relu5_3 = h
- vgg_outputs = namedtuple(
- "VggOutputs",
- ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
+ vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
return out
@@ -325,25 +306,27 @@ def forward(self, X):
class LPIPS(nn.Layer):
def __init__(
- self,
- pretrained=True,
- net="alex",
- lpips=True,
- spatial=False,
- pnet_rand=False,
- pnet_tune=False,
- use_dropout=True,
- model_path=None,
- eval_mode=True,
- verbose=True, ):
+ self,
+ pretrained=True,
+ net="alex",
+ lpips=True,
+ spatial=False,
+ pnet_rand=False,
+ pnet_tune=False,
+ use_dropout=True,
+ model_path=None,
+ eval_mode=True,
+ verbose=True,
+ ):
# lpips - [True] means with linear calibration on top of base network
# pretrained - [True] means load linear weights
super(LPIPS, self).__init__()
if verbose:
- print("Setting up [%s] perceptual loss: trunk [%s], spatial [%s]" %
- ("LPIPS" if lpips else "baseline", net, "on"
- if spatial else "off"))
+ print(
+ "Setting up [%s] perceptual loss: trunk [%s], spatial [%s]"
+ % ("LPIPS" if lpips else "baseline", net, "on" if spatial else "off")
+ )
self.pnet_type = net.lower()
self.pnet_tune = pnet_tune
@@ -359,8 +342,7 @@ def __init__(
raise NotImplementedError
self.L = len(self.chns)
- self.net = net_type(
- pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)
+ self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)
if lpips:
lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
@@ -377,8 +359,7 @@ def __init__(
if pretrained:
if model_path is None:
- model_path = get_weights_path_from_url(*model_urls[
- "vgg_netlin"])
+ model_path = get_weights_path_from_url(*model_urls["vgg_netlin"])
if verbose:
print("Loading model from: %s" % model_path)
import warnings
@@ -393,47 +374,29 @@ def __init__(
param.stop_gradient = True
def forward(self, in0, in1, retPerLayer=False, normalize=False):
- if (normalize): # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
+ if normalize: # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
in0 = 2 * in0 - 1
in1 = 2 * in1 - 1
# v0.0 - original release had a bug, where input was not scaled
- in0_input, in1_input = (self.scaling_layer(in0),
- self.scaling_layer(in1))
+ in0_input, in1_input = (self.scaling_layer(in0), self.scaling_layer(in1))
outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
feats0, feats1, diffs = {}, {}, {}
for kk in range(self.L):
- feats0[kk], feats1[kk] = normalize_tensor(outs0[
- kk]), normalize_tensor(outs1[kk])
- diffs[kk] = (feats0[kk] - feats1[kk])**2
+ feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
if self.lpips:
if self.spatial:
- res = [
- upsample(
- self.lins[kk](diffs[kk]), out_HW=in0.shape[2:])
- for kk in range(self.L)
- ]
+ res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)]
else:
- res = [
- spatial_average(
- self.lins[kk](diffs[kk]), keepdim=True)
- for kk in range(self.L)
- ]
+ res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
else:
if self.spatial:
- res = [
- upsample(
- diffs[kk].sum(axis=1, keepdim=True),
- out_HW=in0.shape[2:]) for kk in range(self.L)
- ]
+ res = [upsample(diffs[kk].sum(axis=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)]
else:
- res = [
- spatial_average(
- diffs[kk].sum(axis=1, keepdim=True), keepdim=True)
- for kk in range(self.L)
- ]
+ res = [spatial_average(diffs[kk].sum(axis=1, keepdim=True), keepdim=True) for kk in range(self.L)]
val = res[0]
for l in range(1, self.L):
@@ -447,19 +410,20 @@ def forward(self, in0, in1, retPerLayer=False, normalize=False):
class LPIPSWithDiscriminator(nn.Layer):
def __init__(
- self,
- disc_start,
- logvar_init=0.0,
- kl_weight=1.0,
- pixelloss_weight=1.0,
- disc_num_layers=3,
- disc_in_channels=3,
- disc_factor=1.0,
- disc_weight=1.0,
- perceptual_weight=1.0,
- use_actnorm=False,
- disc_conditional=False,
- disc_loss="hinge", ):
+ self,
+ disc_start,
+ logvar_init=0.0,
+ kl_weight=1.0,
+ pixelloss_weight=1.0,
+ disc_num_layers=3,
+ disc_in_channels=3,
+ disc_factor=1.0,
+ disc_weight=1.0,
+ perceptual_weight=1.0,
+ use_actnorm=False,
+ disc_conditional=False,
+ disc_loss="hinge",
+ ):
super().__init__()
assert disc_loss in ["hinge", "vanilla"]
@@ -471,15 +435,13 @@ def __init__(
self.perceptual_weight = perceptual_weight
self.discriminator = NLayerDiscriminator(
- input_nc=disc_in_channels,
- n_layers=disc_num_layers,
- use_actnorm=use_actnorm)
+ input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm
+ )
reset_initialized_parameter(self.discriminator)
self.discriminator.apply(weights_init)
# output log variance
- self.logvar = self.create_parameter(
- (1, ), default_initializer=nn.initializer.Constant(logvar_init))
+ self.logvar = self.create_parameter((1,), default_initializer=nn.initializer.Constant(logvar_init))
self.discriminator_iter_start = disc_start
self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
@@ -489,15 +451,11 @@ def __init__(
def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
if last_layer is not None:
- nll_grads = paddle.autograd.grad(
- nll_loss, last_layer, retain_graph=True)[0]
- g_grads = paddle.autograd.grad(
- g_loss, last_layer, retain_graph=True)[0]
+ nll_grads = paddle.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+ g_grads = paddle.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
else:
- nll_grads = paddle.autograd.grad(
- nll_loss, self.last_layer[0], retain_graph=True)[0]
- g_grads = paddle.autograd.grad(
- g_loss, self.last_layer[0], retain_graph=True)[0]
+ nll_grads = paddle.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+ g_grads = paddle.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
d_weight = paddle.norm(nll_grads) / (paddle.norm(g_grads) + 1e-4)
d_weight = paddle.clip(d_weight, 0.0, 1e4).detach()
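A standalone toy version of the gradient-balancing rule above: the adversarial term is scaled so its gradient norm at the last decoder layer matches the reconstruction gradient. The tiny linear `last_layer` here is purely illustrative, and the real method may further scale the result by the configured `disc_weight`:

```python
import paddle

last_layer = paddle.create_parameter([4, 4], dtype="float32")  # stand-in for the decoder's last weight
x = paddle.randn([4])
nll_loss = (paddle.matmul(x, last_layer) ** 2).mean()  # stand-in reconstruction term
g_loss = paddle.matmul(x, last_layer).sum()            # stand-in adversarial term

nll_grads = paddle.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
g_grads = paddle.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
d_weight = paddle.norm(nll_grads) / (paddle.norm(g_grads) + 1e-4)
d_weight = paddle.clip(d_weight, 0.0, 1e4).detach()
print(float(d_weight))
```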
@@ -505,16 +463,17 @@ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
return d_weight
def forward(
- self,
- inputs,
- reconstructions,
- posteriors,
- optimizer_idx,
- global_step,
- last_layer=None,
- cond=None,
- split="train",
- weights=None, ):
+ self,
+ inputs,
+ reconstructions,
+ posteriors,
+ optimizer_idx,
+ global_step,
+ last_layer=None,
+ cond=None,
+ split="train",
+ weights=None,
+ ):
rec_loss = paddle.abs(inputs - reconstructions)
if self.perceptual_weight > 0:
p_loss = self.perceptual_loss(inputs, reconstructions)
@@ -525,8 +484,7 @@ def forward(
weighted_nll_loss = nll_loss
if weights is not None:
weighted_nll_loss = weights * nll_loss
- weighted_nll_loss = paddle.sum(
- weighted_nll_loss) / weighted_nll_loss.shape[0]
+ weighted_nll_loss = paddle.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
nll_loss = paddle.sum(nll_loss) / nll_loss.shape[0]
kl_loss = posteriors.kl()
kl_loss = paddle.sum(kl_loss) / kl_loss.shape[0]
@@ -539,37 +497,28 @@ def forward(
logits_fake = self.discriminator(reconstructions)
else:
assert self.disc_conditional
- logits_fake = self.discriminator(
- paddle.concat(
- (reconstructions, cond), axis=1))
+ logits_fake = self.discriminator(paddle.concat((reconstructions, cond), axis=1))
g_loss = -paddle.mean(logits_fake)
if self.disc_factor > 0.0:
try:
- d_weight = self.calculate_adaptive_weight(
- nll_loss, g_loss, last_layer=last_layer)
+ d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
except Exception:
assert not self.training
d_weight = paddle.to_tensor(0.0)
else:
d_weight = paddle.to_tensor(0.0)
- disc_factor = adopt_weight(
- self.disc_factor,
- global_step,
- threshold=self.discriminator_iter_start)
- loss = (weighted_nll_loss + self.kl_weight * kl_loss + d_weight *
- disc_factor * g_loss)
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
log = {
- "{}/total_loss".format(split):
- loss.clone().detach().mean().item(),
+ "{}/total_loss".format(split): loss.clone().detach().mean().item(),
"{}/logvar".format(split): self.logvar.detach().item(),
"{}/kl_loss".format(split): kl_loss.detach().mean().item(),
"{}/nll_loss".format(split): nll_loss.detach().mean().item(),
"{}/rec_loss".format(split): rec_loss.detach().mean().item(),
"{}/d_weight".format(split): d_weight.detach().item(),
- "{}/disc_factor".format(split):
- paddle.to_tensor(disc_factor).item(),
+ "{}/disc_factor".format(split): paddle.to_tensor(disc_factor).item(),
"{}/g_loss".format(split): g_loss.detach().mean().item(),
}
return loss, log
@@ -580,24 +529,14 @@ def forward(
logits_real = self.discriminator(inputs.detach())
logits_fake = self.discriminator(reconstructions.detach())
else:
- logits_real = self.discriminator(
- paddle.concat(
- (inputs.detach(), cond), axis=1))
- logits_fake = self.discriminator(
- paddle.concat(
- (reconstructions.detach(), cond), axis=1))
- disc_factor = adopt_weight(
- self.disc_factor,
- global_step,
- threshold=self.discriminator_iter_start)
+ logits_real = self.discriminator(paddle.concat((inputs.detach(), cond), axis=1))
+ logits_fake = self.discriminator(paddle.concat((reconstructions.detach(), cond), axis=1))
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
log = {
- "{}/disc_loss".format(split):
- d_loss.clone().detach().mean().item(),
- "{}/logits_real".format(split):
- logits_real.detach().mean().item(),
- "{}/logits_fake".format(split):
- logits_fake.detach().mean().item(),
+ "{}/disc_loss".format(split): d_loss.clone().detach().mean().item(),
+ "{}/logits_real".format(split): logits_real.detach().mean().item(),
+ "{}/logits_fake".format(split): logits_fake.detach().mean().item(),
}
return d_loss, log
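The two branches above keep the same structure after reformatting: `optimizer_idx == 0` assembles the autoencoder loss and `optimizer_idx == 1` the discriminator loss. A toy numeric illustration of the generator-side total (all values assumed):

```python
weighted_nll_loss = 0.12   # reconstruction / NLL term
kl_loss = 35.0             # KL of the posterior
kl_weight = 1.0e-6
g_loss = -0.4              # -mean(logits_fake)
d_weight = 0.8             # from calculate_adaptive_weight
disc_factor = 1.0          # 0.0 until global_step reaches discriminator_iter_start

loss = weighted_nll_loss + kl_weight * kl_loss + d_weight * disc_factor * g_loss
print(loss)  # 0.12 + 3.5e-05 - 0.32 = -0.199965
```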
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/model.py b/ppdiffusers/examples/autoencoder/vae/ldm/model.py
index 81cd75c9787bc..5df1c98fe4c61 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/model.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/model.py
@@ -22,8 +22,13 @@
from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
from ppdiffusers.initializer import reset_initialized_parameter
from ppdiffusers.models.autoencoder_kl import (
- AutoencoderKLOutput, Decoder, DecoderOutput, DiagonalGaussianDistribution,
- Encoder)
+ AutoencoderKLOutput,
+ Decoder,
+ DecoderOutput,
+ DiagonalGaussianDistribution,
+ Encoder,
+)
+
# from ppdiffusers.models.ema import LitEma
from ppdiffusers.models.modeling_utils import ModelMixin
@@ -33,8 +38,7 @@
def count_params(model, verbose=True):
total_params = sum(p.numel() for p in model.parameters()).item()
if verbose:
- print(
- f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
+ print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
return total_params
@@ -44,59 +48,62 @@ class AutoencoderKLWithLoss(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- in_channels: int=3,
- out_channels: int=3,
- down_block_types: Tuple[str]=(
- "DownEncoderBlock2D",
- "DownEncoderBlock2D",
- "DownEncoderBlock2D",
- "DownEncoderBlock2D", ),
- down_block_out_channels: Tuple[int]=None,
- up_block_types: Tuple[str]=(
- "UpDecoderBlock2D",
- "UpDecoderBlock2D",
- "UpDecoderBlock2D",
- "UpDecoderBlock2D", ),
- up_block_out_channels: Tuple[int]=None,
- block_out_channels: Tuple[int]=(128, 256, 512, 512),
- layers_per_block: int=2,
- act_fn: str="silu",
- latent_channels: int=4,
- norm_num_groups: int=32,
- sample_size: int=512,
- # new add
- input_size: Tuple[int]=None,
- # loss arguments
- disc_start=50001,
- kl_weight=1.0e-6,
- disc_weight=0.5,
- logvar_init=0.0,
- pixelloss_weight=1.0,
- disc_num_layers=3,
- disc_in_channels=3,
- disc_factor=1.0,
- perceptual_weight=1.0,
- use_actnorm=False,
- disc_conditional=False,
- disc_loss="hinge",
- use_ema=False,
- ema_decay=None, ):
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str] = (
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ ),
+ down_block_out_channels: Tuple[int] = None,
+ up_block_types: Tuple[str] = (
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ ),
+ up_block_out_channels: Tuple[int] = None,
+ block_out_channels: Tuple[int] = (128, 256, 512, 512),
+ layers_per_block: int = 2,
+ act_fn: str = "silu",
+ latent_channels: int = 4,
+ norm_num_groups: int = 32,
+ sample_size: int = 512,
+ # new add
+ input_size: Tuple[int] = None,
+ # loss arguments
+ disc_start=50001,
+ kl_weight=1.0e-6,
+ disc_weight=0.5,
+ logvar_init=0.0,
+ pixelloss_weight=1.0,
+ disc_num_layers=3,
+ disc_in_channels=3,
+ disc_factor=1.0,
+ perceptual_weight=1.0,
+ use_actnorm=False,
+ disc_conditional=False,
+ disc_loss="hinge",
+ use_ema=False,
+ ema_decay=None,
+ ):
super().__init__()
- self.input_size = ([int(_) for _ in input_size]
- if input_size is not None else None)
+ self.input_size = [int(_) for _ in input_size] if input_size is not None else None
self.encoder = Encoder(
in_channels=in_channels,
out_channels=latent_channels,
down_block_types=down_block_types,
block_out_channels=down_block_out_channels
- if down_block_out_channels is
- not None # if down_block_out_channels not givien, we will use block_out_channels
+ if down_block_out_channels
+            is not None  # if down_block_out_channels not given, we will use block_out_channels
else block_out_channels,
layers_per_block=layers_per_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
- double_z=True, )
+ double_z=True,
+ )
# pass init params to Decoder
self.decoder = Decoder(
@@ -104,10 +111,12 @@ def __init__(
out_channels=out_channels,
up_block_types=up_block_types,
block_out_channels=up_block_out_channels # if up_block_out_channels not givien, we will use block_out_channels
- if up_block_out_channels is not None else block_out_channels,
+ if up_block_out_channels is not None
+ else block_out_channels,
layers_per_block=layers_per_block,
norm_num_groups=norm_num_groups,
- act_fn=act_fn, )
+ act_fn=act_fn,
+ )
self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1)
self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1)
@@ -125,7 +134,8 @@ def __init__(
perceptual_weight=perceptual_weight,
use_actnorm=use_actnorm,
disc_conditional=disc_conditional,
- disc_loss=disc_loss, )
+ disc_loss=disc_loss,
+ )
count_params(self)
self.init_weights()
self.use_ema = use_ema
@@ -143,9 +153,10 @@ def init_weights(self):
reset_initialized_parameter(self.post_quant_conv)
def custom_forward(
- self,
- sample: paddle.Tensor,
- sample_posterior: bool=True, ):
+ self,
+ sample: paddle.Tensor,
+ sample_posterior: bool = True,
+ ):
posterior = self.encode(sample).latent_dist
if sample_posterior:
z = posterior.sample()
@@ -183,8 +194,7 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0):
if self.input_size is None:
encoder_inputs = pixel_values
else:
- encoder_inputs = F.interpolate(
- pixel_values, size=self.input_size, mode="bilinear")
+ encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear")
reconstructions, posterior = self.custom_forward(encoder_inputs)
@@ -197,7 +207,8 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0):
optimizer_idx,
global_step,
last_layer=self.get_last_layer(),
- split="train", )
+ split="train",
+ )
return aeloss, log_dict_ae
if optimizer_idx == 1:
@@ -209,7 +220,8 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0):
optimizer_idx,
global_step,
last_layer=self.get_last_layer(),
- split="train", )
+ split="train",
+ )
return discloss, log_dict_disc
@paddle.no_grad()
@@ -219,21 +231,18 @@ def log_images(self, pixel_values, only_inputs=False, **kwargs):
if self.input_size is None:
encoder_inputs = pixel_values
else:
- encoder_inputs = F.interpolate(
- pixel_values, size=self.input_size, mode="bilinear")
+ encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear")
if not only_inputs:
xrec, posterior = self.custom_forward(encoder_inputs)
- log["samples"] = self.decode_image(
- self.decode(paddle.randn(posterior.sample().shape)).sample)
+ log["samples"] = self.decode_image(self.decode(paddle.randn(posterior.sample().shape)).sample)
log["reconstructions"] = self.decode_image(xrec)
if self.use_ema:
with self.ema_scope():
- xrec_ema, posterior_ema = self.custom_forward(
- encoder_inputs)
+ xrec_ema, posterior_ema = self.custom_forward(encoder_inputs)
log["samples_ema"] = self.decode_image(
- self.decode(
- paddle.randn(posterior_ema.sample().shape)).sample)
+ self.decode(paddle.randn(posterior_ema.sample().shape)).sample
+ )
log["reconstructions_ema"] = self.decode_image(xrec_ema)
# update
log["encoder_inputs"] = self.decode_image(encoder_inputs)
@@ -247,12 +256,10 @@ def decode_image(self, image):
@paddle.no_grad()
def validation_step(self, pixel_values, global_step=0):
- log_dict_ae, log_dict_disc = self._validation_step(pixel_values,
- global_step)
+ log_dict_ae, log_dict_disc = self._validation_step(pixel_values, global_step)
if self.use_ema:
with self.ema_scope():
- log_dict_ae_ema, log_dict_disc_ema = self._validation_step(
- pixel_values, global_step, postfix="_ema")
+ log_dict_ae_ema, log_dict_disc_ema = self._validation_step(pixel_values, global_step, postfix="_ema")
log_dict_ae.update(log_dict_ae_ema)
log_dict_disc.update(log_dict_disc_ema)
@@ -263,8 +270,7 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""):
if self.input_size is None:
encoder_inputs = pixel_values
else:
- encoder_inputs = F.interpolate(
- pixel_values, size=self.input_size, mode="bilinear")
+ encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear")
reconstructions, posterior = self.custom_forward(encoder_inputs)
aeloss, log_dict_ae = self.loss(
@@ -274,7 +280,8 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""):
0,
global_step,
last_layer=self.get_last_layer(),
- split="val" + postfix, )
+ split="val" + postfix,
+ )
discloss, log_dict_disc = self.loss(
pixel_values,
@@ -283,7 +290,8 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""):
1,
global_step,
last_layer=self.get_last_layer(),
- split="val" + postfix, )
+ split="val" + postfix,
+ )
self.train()
return log_dict_ae, log_dict_disc
@@ -333,26 +341,25 @@ def untoggle_optimizer(self, optimizers, optimizer_idx):
if optimizer_idx != opt_idx:
for param in opt._parameter_list:
if param in self._param_stop_gradient_state:
- param.stop_gradient = self._param_stop_gradient_state[
- param]
+ param.stop_gradient = self._param_stop_gradient_state[param]
# save memory
self._param_stop_gradient_state = {}
- def encode(self, x: paddle.Tensor, return_dict: bool=True):
+ def encode(self, x: paddle.Tensor, return_dict: bool = True):
h = self.encoder(x)
moments = self.quant_conv(h)
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
- return (posterior, )
+ return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
- def decode(self, z: paddle.Tensor, return_dict: bool=True):
+ def decode(self, z: paddle.Tensor, return_dict: bool = True):
z = self.post_quant_conv(z)
dec = self.decoder(z)
if not return_dict:
- return (dec, )
+ return (dec,)
return DecoderOutput(sample=dec)
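`encode` above wraps the `quant_conv` moments in a `DiagonalGaussianDistribution`. A short sketch of the reparameterized sampling and KL behind `posterior.sample()` / `posterior.kl()` (shapes assumed; the logvar clamp range is the one commonly used by this distribution):

```python
import paddle

moments = paddle.randn([1, 8, 32, 32])              # quant_conv output: mean ‖ logvar
mean, logvar = paddle.chunk(moments, 2, axis=1)
logvar = paddle.clip(logvar, -30.0, 20.0)
std = paddle.exp(0.5 * logvar)
z = mean + std * paddle.randn(mean.shape)           # latent fed to post_quant_conv / decoder
kl = 0.5 * paddle.sum(mean ** 2 + std ** 2 - 1.0 - logvar, axis=[1, 2, 3])
print(z.shape, float(kl))
```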
diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py
index 8d3f4a8f4ac7a..4a91b34df3acc 100644
--- a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py
+++ b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py
@@ -77,22 +77,25 @@ def _get_param(self, img, output_size):
class TextImagePair(IterableDataset):
def __init__(
- self,
- file_list,
- size,
- num_records,
- image_processing=None,
- buffer_size=1000,
- shuffle_every_n_samples=5,
- interpolation="lanczos", ):
+ self,
+ file_list,
+ size,
+ num_records,
+ image_processing=None,
+ buffer_size=1000,
+ shuffle_every_n_samples=5,
+ interpolation="lanczos",
+ ):
self.size = size
if image_processing is None:
- self.image_processing = transforms.Compose([
- transforms.Resize(int(size / 0.9), interpolation),
- RandomCrop(size),
- transforms.ToTensor(),
- transforms.Normalize(0.5, 0.5),
- ])
+ self.image_processing = transforms.Compose(
+ [
+ transforms.Resize(int(size / 0.9), interpolation),
+ RandomCrop(size),
+ transforms.ToTensor(),
+ transforms.Normalize(0.5, 0.5),
+ ]
+ )
else:
self.image_processing = image_processing
self.file_list = []
@@ -115,19 +118,14 @@ def __init__(
file_weights = file_weights / file_weight_sum
print(f"sample weights of files: {file_weights}")
self.file_weights_cumsum = np.cumsum(file_weights)
- self.file_weights_cumsum = np.concatenate(
- [[0.0], self.file_weights_cumsum])
+ self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum])
else:
print("sample each file list with same probabiliy")
self.file_weights_cumsum = None
self.num_records = num_records
- self.file_ids = [
- np.arange(len(filelist)) for filelist in self.file_list
- ]
- print(
- f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}"
- )
+ self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list]
+ print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}")
self.buffer_size = buffer_size
self.shuffle_every_n_samples = shuffle_every_n_samples
@@ -136,9 +134,7 @@ def sample_loader(self, file_ids, filenames):
random.shuffle(file_ids)
for i in file_ids:
filename = filenames[i].strip("\n")
- with gzip.open(filename,
- "rb") if filename.endswith(".gz") else open(
- filename, "rb") as f:
+ with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f:
retry = 0
while True:
line = f.readline()
@@ -167,12 +163,9 @@ def sample_loader(self, file_ids, filenames):
yield data
def random_load_from_multi_dataset(self):
- print(
- f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}"
- )
+ print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}")
sample_loader_per_dataset = [
- iter(self.sample_loader(self.file_ids[i], self.file_list[i]))
- for i in range(len(self.file_ids))
+ iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids))
]
while True:
@@ -181,8 +174,7 @@ def random_load_from_multi_dataset(self):
else:
rand_num = random.random()
for i in range(len(self.file_list)):
- if (self.file_weights_cumsum[i] <= rand_num <
- self.file_weights_cumsum[i + 1]):
+ if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]:
break
sample_loader = sample_loader_per_dataset[i]
# debug
@@ -211,8 +203,7 @@ def __iter__(self):
return self.shuffle(iter(self.random_load_from_multi_dataset()))
-def split_data_per_worker(dataset, worker_id, local_rank, world_size,
- num_workers):
+def split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers):
worker_global_id = local_rank * num_workers + worker_id
dataset.rng = np.random.RandomState(worker_global_id)
for i in range(len(dataset.file_ids)):
@@ -238,8 +229,7 @@ def worker_init_fn(_):
world_size = dist.get_world_size()
num_workers = worker_info.num_workers
if isinstance(dataset, TextImagePair):
- split_data_per_worker(dataset, worker_id, local_rank, world_size,
- num_workers)
+ split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers)
return np.random.seed(np.random.get_state()[1][0] + worker_id)
else:
return np.random.seed(np.random.get_state()[1][0] + worker_id)
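The body of `split_data_per_worker` is mostly outside this hunk; a plausible standalone sketch of the sharding idea is a strided split of each file-id array across all `world_size * num_workers` loader workers (the exact slicing in the source may differ):

```python
import numpy as np

def shard_ids(file_ids, worker_id, local_rank, world_size, num_workers):
    worker_global_id = local_rank * num_workers + worker_id
    total_workers = world_size * num_workers
    # every (rank, worker) pair reads a disjoint stride-sliced view of the ids
    return [ids[worker_global_id::total_workers] for ids in file_ids]

print(shard_ids([np.arange(10)], worker_id=1, local_rank=0, world_size=2, num_workers=2))
# [array([1, 5, 9])]
```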
diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py
index 08141d43c821e..ebfb3ff1df677 100644
--- a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py
+++ b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py
@@ -40,9 +40,7 @@ def reorder_image(img, input_order="HWC"):
"""
if input_order not in ["HWC", "CHW"]:
- raise ValueError(
- f"Wrong input_order {input_order}. Supported input_orders are "
- "'HWC' and 'CHW'")
+ raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " "'HWC' and 'CHW'")
if len(img.shape) == 2:
img = img[..., None]
if input_order == "CHW":
@@ -68,12 +66,9 @@ def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs):
float: psnr result.
"""
- assert (img.shape == img2.shape
- ), f"Image shapes are different: {img.shape}, {img2.shape}."
+ assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}."
if input_order not in ["HWC", "CHW"]:
- raise ValueError(
- f"Wrong input_order {input_order}. Supported input_orders are "
- '"HWC" and "CHW"')
+ raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " '"HWC" and "CHW"')
img = reorder_image(img, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
img = img.astype(np.float64)
@@ -83,7 +78,7 @@ def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs):
img = img[crop_border:-crop_border, crop_border:-crop_border, ...]
img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]
- mse = np.mean((img - img2)**2)
+ mse = np.mean((img - img2) ** 2)
if mse == 0:
return float("inf")
return 20.0 * np.log10(255.0 / np.sqrt(mse))
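A quick numeric check of the PSNR expression above for 8-bit images (pixel values assumed):

```python
import numpy as np

img = np.full((4, 4), 120.0)
img2 = np.full((4, 4), 130.0)
mse = np.mean((img - img2) ** 2)              # 100.0
print(20.0 * np.log10(255.0 / np.sqrt(mse)))  # ~28.13 dB
```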
@@ -102,8 +97,8 @@ def _ssim(img, img2):
float: ssim result.
"""
- c1 = (0.01 * 255)**2
- c2 = (0.03 * 255)**2
+ c1 = (0.01 * 255) ** 2
+ c2 = (0.03 * 255) ** 2
img = img.astype(np.float64)
img2 = img2.astype(np.float64)
@@ -119,8 +114,7 @@ def _ssim(img, img2):
sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
- ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / (
- (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2))
+ ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2))
return ssim_map.mean()
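For reference, the two stabilizing constants in `_ssim` evaluate as below for 8-bit dynamic range (K1=0.01, K2=0.03, L=255); with identical inputs the numerator equals the denominator, so the map is 1 everywhere:

```python
c1 = (0.01 * 255) ** 2  # 6.5025
c2 = (0.03 * 255) ** 2  # 58.5225
print(c1, c2)
```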
@@ -149,12 +143,9 @@ def calculate_ssim(img, img2, crop_border, input_order="HWC", **kwargs):
float: ssim result.
"""
- assert (img.shape == img2.shape
- ), f"Image shapes are different: {img.shape}, {img2.shape}."
+ assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}."
if input_order not in ["HWC", "CHW"]:
- raise ValueError(
- f"Wrong input_order {input_order}. Supported input_orders are "
- '"HWC" and "CHW"')
+ raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " '"HWC" and "CHW"')
img = reorder_image(img, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
img = img.astype(np.float64)
diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py
index d466ef6155819..d239d53cf5fcf 100644
--- a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py
+++ b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py
@@ -53,8 +53,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("emb_layers.1", "time_emb_proj")
new_item = new_item.replace("skip_connection", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -70,8 +69,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -113,8 +111,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -122,21 +119,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming
to them. It splits attention layers, and takes into account additional replacements
that may arise.
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -144,13 +140,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = old_tensor.split(channels // num_heads, dim=1)
checkpoint[path_map["query"]] = query.reshape(target_shape)
@@ -161,8 +155,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -172,8 +165,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -202,14 +194,10 @@ def create_vae_diffusers_config(original_config):
decoder_vae_params = original_config.model.params.ddconfig.decoder
vae_params = decoder_vae_params
- encoder_block_out_channels = [
- encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult
- ]
+ encoder_block_out_channels = [encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult]
down_block_types = ["DownEncoderBlock2D"] * len(encoder_block_out_channels)
- decoder_block_out_channels = [
- decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult
- ]
+ decoder_block_out_channels = [decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult]
up_block_types = ["UpDecoderBlock2D"] * len(decoder_block_out_channels)
config = dict(
@@ -222,114 +210,82 @@ def create_vae_diffusers_config(original_config):
down_block_out_channels=tuple(encoder_block_out_channels),
up_block_out_channels=tuple(decoder_block_out_channels),
latent_channels=vae_params.z_channels,
- layers_per_block=vae_params.num_res_blocks, )
+ layers_per_block=vae_params.num_res_blocks,
+ )
return config
def convert_ldm_vae_checkpoint(vae_state_dict, config):
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -337,58 +293,50 @@ def convert_ldm_vae_checkpoint(vae_state_dict, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -396,14 +344,13 @@ def convert_ldm_vae_checkpoint(vae_state_dict, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint,
- dtype="float32"):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -442,7 +389,8 @@ def check_keys(model, state_dict):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--original_config_file",
default="../config/f8encoder_f16decoder.yaml",
@@ -453,13 +401,15 @@ def check_keys(model, state_dict):
"--dtype",
default="float32",
type=str,
- help="Dtype of model weights.", )
+ help="Dtype of model weights.",
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
+ help="Path to the output model.",
+ )
args = parser.parse_args()
@@ -469,11 +419,9 @@ def check_keys(model, state_dict):
vae_config = create_vae_diffusers_config(original_config)
# 1. convert vae encoder and decoder
- diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint,
- vae_config)
+ diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL.from_config(vae_config)
- ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- vae, diffusers_vae_checkpoint, args.dtype)
+ ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint, args.dtype)
# 2. convert losses
maps = {
@@ -491,7 +439,7 @@ def check_keys(model, state_dict):
k = k.replace(old, new)
# paddle donot support 0d tensor
if v.ndim == 0:
- v = v.reshape((1, ))
+ v = v.reshape((1,))
# rename
if "perceptual_loss.lin" in k:
k = k.replace("perceptual_loss.lin", "perceptual_loss.lins.")
@@ -501,5 +449,4 @@ def check_keys(model, state_dict):
check_keys(vae, ppdiffusers_vae_checkpoint)
vae.save_config(args.dump_path)
# 4. save state_dict
- paddle.save(ppdiffusers_vae_checkpoint,
- os.path.join(args.dump_path, "model_state.pdparams"))
+ paddle.save(ppdiffusers_vae_checkpoint, os.path.join(args.dump_path, "model_state.pdparams"))
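`convert_diffusers_vae_unet_to_ppdiffusers` above collects `nn.Linear` sublayers into `need_transpose` because torch-style checkpoints store linear weights as `[out_features, in_features]` while Paddle expects `[in_features, out_features]`. A minimal sketch of that single step (array shapes assumed for illustration):

```python
import numpy as np

def to_paddle_linear(weight_out_in: np.ndarray) -> np.ndarray:
    # transpose a torch-layout linear weight into paddle layout
    return np.ascontiguousarray(weight_out_in.T)

w = np.zeros((512, 2048), dtype="float32")  # [out_features, in_features]
print(to_paddle_linear(w).shape)            # (2048, 512)
```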
diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py
index 6bc24b3d88bab..0e7e08a580299 100644
--- a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py
+++ b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py
@@ -67,35 +67,28 @@ def tqdm(x):
from inception import InceptionV3
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
-parser.add_argument(
- "--batch-size", type=int, default=50, help="Batch size to use")
+parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use")
parser.add_argument(
"--num-workers",
type=int,
- help=("Number of processes to use for data loading. "
- "Defaults to `min(8, num_cpus)`"), )
-parser.add_argument(
- "--device",
- type=str,
- default=None,
- help="Device to use. Like gpu, gpu:0 or cpu")
+ help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"),
+)
+parser.add_argument("--device", type=str, default=None, help="Device to use. Like gpu, gpu:0 or cpu")
parser.add_argument(
"--dims",
type=int,
default=2048,
choices=list(InceptionV3.BLOCK_INDEX_BY_DIM),
- help=("Dimensionality of Inception features to use. "
- "By default, uses pool3 features"), )
+ help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"),
+)
parser.add_argument(
"path",
type=str,
nargs=2,
- help=("Paths to the generated images or "
- "to .npz statistic files"), )
+ help=("Paths to the generated images or " "to .npz statistic files"),
+)
-IMAGE_EXTENSIONS = {
- "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"
-}
+IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"}
class ImagePathDataset(paddle.io.Dataset):
@@ -136,8 +129,7 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1):
model.eval()
if batch_size > len(files):
- print(("Warning: batch size is bigger than the data size. "
- "Setting batch size to data size"))
+ print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size"))
batch_size = len(files)
dataset = ImagePathDataset(files, transforms=TF.ToTensor())
@@ -146,7 +138,8 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1):
batch_size=batch_size,
shuffle=False,
drop_last=False,
- num_workers=num_workers, )
+ num_workers=num_workers,
+ )
pred_arr = np.empty((len(files), dims))
@@ -165,7 +158,7 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1):
pred = pred.squeeze(3).squeeze(2).cpu().numpy()
- pred_arr[start_idx:start_idx + pred.shape[0]] = pred
+ pred_arr[start_idx : start_idx + pred.shape[0]] = pred
start_idx = start_idx + pred.shape[0]
@@ -200,18 +193,15 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
sigma1 = np.atleast_2d(sigma1)
sigma2 = np.atleast_2d(sigma2)
- assert (mu1.shape == mu2.shape
- ), "Training and test mean vectors have different lengths"
- assert (sigma1.shape == sigma2.shape
- ), "Training and test covariances have different dimensions"
+ assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
+ assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"
diff = mu1 - mu2
# Product might be almost singular
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
if not np.isfinite(covmean).all():
- msg = ("fid calculation produces singular product; "
- "adding %s to diagonal of cov estimates") % eps
+ msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps
print(msg)
offset = np.eye(sigma1.shape[0]) * eps
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
@@ -228,11 +218,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
-def calculate_activation_statistics(files,
- model,
- batch_size=50,
- dims=2048,
- num_workers=1):
+def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1):
"""Calculation of the statistics used by the FID.
Params:
-- files : List of image files paths
@@ -261,13 +247,8 @@ def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1):
m, s = f["mu"][:], f["sigma"][:]
else:
path = pathlib.Path(path)
- files = sorted([
- file
- for ext in IMAGE_EXTENSIONS
- for file in path.glob("*.{}".format(ext))
- ])
- m, s = calculate_activation_statistics(files, model, batch_size, dims,
- num_workers)
+ files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))])
+ m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers)
return m, s
@@ -282,10 +263,8 @@ def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1):
model = InceptionV3([block_idx])
- m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims,
- num_workers)
- m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims,
- num_workers)
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers)
+ m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers)
fid_value = calculate_frechet_distance(m1, s1, m2, s2)
return fid_value
@@ -302,8 +281,7 @@ def main():
else:
num_workers = args.num_workers
- fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims,
- num_workers)
+ fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, num_workers)
print("FID: ", fid_value)
diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py
index 3eb58b8b7de40..7e5eadaf365b2 100644
--- a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py
+++ b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py
@@ -25,15 +25,16 @@
from ppdiffusers import AutoencoderKL, StableDiffusionImg2ImgPipeline
-image_processing = transforms.Compose([
- transforms.ToTensor(),
- transforms.Normalize(0.5, 0.5),
-])
+image_processing = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Normalize(0.5, 0.5),
+ ]
+)
def decode_image(image):
- image = (image / 2 + 0.5).clip(0, 1).transpose(
- [0, 2, 3, 1]).cast("float32").numpy()
+ image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]).cast("float32").numpy()
image = StableDiffusionImg2ImgPipeline.numpy_to_pil(image)
return image
@@ -62,8 +63,7 @@ def main(vae_path, src_size, tgt_size, imgs, outdir):
z = model.encode(img).latent_dist.sample()
recon = model.decode(z).sample
- decode_image(recon)[0].save(
- osp.join(outdir, osp.basename(img_path)))
+ decode_image(recon)[0].save(osp.join(outdir, osp.basename(img_path)))
if __name__ == "__main__":
diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py
index 9aecdf265779a..bbdff9a933432 100644
--- a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py
+++ b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py
@@ -21,7 +21,8 @@
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL = (
"https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams",
- "8e2ae24c34c5c8b81d45167bb9361f4c", )
+ "8e2ae24c34c5c8b81d45167bb9361f4c",
+)
WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams"
@@ -47,17 +48,18 @@ class ConvNormActivation(nn.Sequential):
"""
def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=None,
- groups=1,
- norm_layer=nn.BatchNorm2D,
- activation_layer=nn.ReLU,
- dilation=1,
- bias=None, ):
+ self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=None,
+ groups=1,
+ norm_layer=nn.BatchNorm2D,
+ activation_layer=nn.ReLU,
+ dilation=1,
+ bias=None,
+ ):
if padding is None:
padding = (kernel_size - 1) // 2 * dilation
if bias is None:
@@ -71,7 +73,8 @@ def __init__(
padding,
dilation=dilation,
groups=groups,
- bias_attr=bias, )
+ bias_attr=bias,
+ )
]
if norm_layer is not None:
# The hyperparameter of BatchNorm2D is different from PaddlePaddle.
@@ -97,12 +100,13 @@ class InceptionV3(nn.Layer):
}
def __init__(
- self,
- output_blocks=(DEFAULT_BLOCK_INDEX, ),
- resize_input=True,
- normalize_input=True,
- requires_grad=False,
- use_fid_inception=True, ):
+ self,
+ output_blocks=(DEFAULT_BLOCK_INDEX,),
+ resize_input=True,
+ normalize_input=True,
+ requires_grad=False,
+ use_fid_inception=True,
+ ):
"""Build pretrained InceptionV3
Parameters
@@ -211,8 +215,7 @@ def forward(self, inp):
outp = []
x = inp
if self.resize_input:
- x = F.interpolate(
- x, size=(299, 299), mode="bilinear", align_corners=False)
+ x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False)
if self.normalize_input:
x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
@@ -235,8 +238,7 @@ def hack_bn_layer(layer):
def _inception_v3(*args, **kwargs):
"""Wraps `paddle.vision.models.inception_v3`"""
- return paddle.vision.models.inception_v3(*args,
- **kwargs).apply(hack_bn_layer)
+ return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer)
def fid_inception_v3():
@@ -248,8 +250,7 @@ def fid_inception_v3():
This method first constructs paddle.vision's Inception and then patches the
necessary parts that are different in the FID Inception model.
"""
- inception = _inception_v3(
- num_classes=1008, with_pool=True, pretrained=False)
+ inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False)
inception.inception_block_list[0] = InceptionA(192, pool_features=32)
inception.inception_block_list[1] = InceptionA(256, pool_features=64)
inception.inception_block_list[2] = InceptionA(288, pool_features=64)
@@ -260,8 +261,7 @@ def fid_inception_v3():
inception.inception_block_list[9] = InceptionE_1(1280)
inception.inception_block_list[10] = InceptionE_2(2048)
- weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0],
- FID_WEIGHTS_URL[1])
+ weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1])
state_dict = paddle.load(weight_path)
inception.set_state_dict(state_dict)
return inception
@@ -275,49 +275,55 @@ def __init__(self, num_channels, pool_features):
out_channels=64,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch5x5_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=48,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch5x5_2 = ConvNormActivation(
in_channels=48,
out_channels=64,
kernel_size=5,
padding=2,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=64,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_2 = ConvNormActivation(
in_channels=64,
out_channels=96,
kernel_size=3,
padding=1,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_3 = ConvNormActivation(
in_channels=96,
out_channels=96,
kernel_size=3,
padding=1,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
# Patch: Tensorflow's average pool does not use the padded zero's in
# its average calculation
- self.branch_pool = nn.AvgPool2D(
- kernel_size=3, stride=1, padding=1, exclusive=True)
+ self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)
self.branch_pool_conv = ConvNormActivation(
in_channels=num_channels,
out_channels=pool_features,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
def forward(self, x):
branch1x1 = self.branch1x1(x)
@@ -330,8 +336,7 @@ def forward(self, x):
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
return x
@@ -343,7 +348,8 @@ def __init__(self, num_channels, channels_7x7):
out_channels=192,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7_1 = ConvNormActivation(
in_channels=num_channels,
@@ -351,62 +357,70 @@ def __init__(self, num_channels, channels_7x7):
kernel_size=1,
stride=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7_2 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(1, 7),
stride=1,
padding=(0, 3),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7_3 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=192,
kernel_size=(7, 1),
stride=1,
padding=(3, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=channels_7x7,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_2 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(7, 1),
padding=(3, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_3 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(1, 7),
padding=(0, 3),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_4 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(7, 1),
padding=(3, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_5 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=192,
kernel_size=(1, 7),
padding=(0, 3),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
# Patch: Tensorflow's average pool does not use the padded zero's in
# its average calculation
- self.branch_pool = nn.AvgPool2D(
- kernel_size=3, stride=1, padding=1, exclusive=True)
+ self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)
self.branch_pool_conv = ConvNormActivation(
in_channels=num_channels,
out_channels=192,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
def forward(self, x):
branch1x1 = self.branch1x1(x)
@@ -424,8 +438,7 @@ def forward(self, x):
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
return x
@@ -438,61 +451,69 @@ def __init__(self, num_channels):
out_channels=320,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=384,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3_2a = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(1, 3),
padding=(0, 1),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3_2b = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(3, 1),
padding=(1, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=448,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_2 = ConvNormActivation(
in_channels=448,
out_channels=384,
kernel_size=3,
padding=1,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_3a = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(1, 3),
padding=(0, 1),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_3b = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(3, 1),
padding=(1, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
# Patch: Tensorflow's average pool does not use the padded zero's in
# its average calculation
- self.branch_pool = nn.AvgPool2D(
- kernel_size=3, stride=1, padding=1, exclusive=True)
+ self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)
self.branch_pool_conv = ConvNormActivation(
in_channels=num_channels,
out_channels=192,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
def forward(self, x):
branch1x1 = self.branch1x1(x)
@@ -515,8 +536,7 @@ def forward(self, x):
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
return x
@@ -549,6 +569,5 @@ def forward(self, x):
branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
return x
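The "Patch" comments in the Inception blocks above refer to the exclusive=True argument of nn.AvgPool2D: TensorFlow's average pool divides each window sum by the number of real (unpadded) pixels, and the FID reference network is only reproduced when Paddle does the same. A minimal sketch of the difference, assuming a constant input so only the border behaviour changes:

import paddle
import paddle.nn as nn

x = paddle.ones([1, 1, 4, 4])  # constant image: interior windows average to 1 either way

pool_incl = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=False)  # padded zeros count toward the divisor
pool_excl = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)   # divisor = number of real pixels only

print(pool_incl(x)[0, 0, 0, 0].item())  # 4/9 ~= 0.444 at the corner
print(pool_excl(x)[0, 0, 0, 0].item())  # 1.0 at the corner, matching TensorFlow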
diff --git a/ppdiffusers/examples/autoencoder/vae/train_vae.py b/ppdiffusers/examples/autoencoder/vae/train_vae.py
index e96c6718040c0..44a8798100e3a 100644
--- a/ppdiffusers/examples/autoencoder/vae/train_vae.py
+++ b/ppdiffusers/examples/autoencoder/vae/train_vae.py
@@ -28,8 +28,7 @@
from tqdm.auto import tqdm
from ppdiffusers.models.ema import LitEma
-from ppdiffusers.training_utils import (freeze_params, main_process_first,
- unwrap_model)
+from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model
def read_json(file):
@@ -56,8 +55,7 @@ def run_evaluate(vae, val_dataloader, writer, global_step):
log_dict_ae_all = defaultdict(list)
log_dict_disc_all = defaultdict(list)
for batch in val_dataloader:
- log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step(
- batch["image"], global_step=global_step)
+ log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step(batch["image"], global_step=global_step)
for k, v in log_dict_ae.items():
if "loss" not in k:
continue
@@ -71,25 +69,21 @@ def run_evaluate(vae, val_dataloader, writer, global_step):
def parse_args():
- parser = argparse.ArgumentParser(
- description="Simple example of a training a autoencoder model script.")
+    parser = argparse.ArgumentParser(description="Simple example of a script for training an autoencoder model.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
default=None,
required=False,
- help="Path to pretrained model or model identifier from bos.", )
+ help="Path to pretrained model or model identifier from bos.",
+ )
parser.add_argument(
"--output_dir",
type=str,
default="autoencoder_outputs",
help="The output directory where the model predictions and checkpoints will be written.",
)
- parser.add_argument(
- "--seed",
- type=int,
- default=23,
- help="A seed for reproducible training.")
+ parser.add_argument("--seed", type=int, default=23, help="A seed for reproducible training.")
parser.add_argument(
"--batch_size",
type=int,
@@ -112,48 +106,39 @@ def parse_args():
parser.add_argument(
"--scale_lr",
action="store_true",
- help="Scale base-lr by ngpu * batch_size", )
- parser.add_argument(
- "--freeze_encoder",
- action="store_true",
- help="Whether to freeze encoder layer.")
+ help="Scale base-lr by ngpu * batch_size",
+ )
+ parser.add_argument("--freeze_encoder", action="store_true", help="Whether to freeze encoder layer.")
parser.add_argument(
"--from_scratch",
action="store_true",
- help="Whether to train new model from scratch. ", )
- parser.add_argument(
- "--vae_config_file",
- default=None,
- type=str,
- help="Path to the vae_config_file.")
+        help="Whether to train a new model from scratch.",
+ )
+ parser.add_argument("--vae_config_file", default=None, type=str, help="Path to the vae_config_file.")
parser.add_argument(
"--logging_dir",
type=str,
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to"
- "*output_dir/logs"), )
+            " *output_dir/logs"
+ ),
+ )
parser.add_argument(
"--report_to",
type=str,
default="visualdl",
choices=["tensorboard", "visualdl"],
- help="Log writer type.", )
- parser.add_argument(
- "--logging_steps",
- default=100,
- type=int,
- help="The interval steps to logging.")
+ help="Log writer type.",
+ )
+    parser.add_argument("--logging_steps", default=100, type=int, help="The number of steps between logging updates.")
parser.add_argument(
"--image_logging_steps",
default=500,
type=int,
- help="The interval steps to logging images.", )
- parser.add_argument(
- "--save_steps",
- default=2000,
- type=int,
- help="The interval steps to saveing.")
+        help="The number of steps between logging images.",
+ )
+    parser.add_argument("--save_steps", default=2000, type=int, help="The number of steps between saving checkpoints.")
parser.add_argument(
"--ignore_keys",
default=[],
@@ -166,136 +151,152 @@ def parse_args():
default=None,
type=int,
nargs="*",
- help="The height and width of the input at the encoder.", )
+ help="The height and width of the input at the encoder.",
+ )
# dataset
parser.add_argument(
"--dataset_type",
type=str,
default="text_image_pair",
choices=["imagenet", "text_image_pair"],
- help="The type of dataset.", )
+ help="The type of dataset.",
+ )
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--degradation",
type=str,
default="pil_nearest",
- help="Degradation_fn, e.g. cv_bicubic, bsrgan_light, or pil_nearest", )
+ help="Degradation_fn, e.g. cv_bicubic, bsrgan_light, or pil_nearest",
+ )
parser.add_argument(
"--file_list",
type=str,
default="./data/filelist/train.filelist.list",
- help="Path to the train file_list.", )
+ help="Path to the train file_list.",
+ )
parser.add_argument(
"--num_workers",
type=int,
default=8,
- help="The number of subprocess to load data.", )
+ help="The number of subprocess to load data.",
+ )
parser.add_argument(
"--num_records",
type=int,
default=62500,
- help="The num_records of the text_image_pair dataset.", )
+ help="The num_records of the text_image_pair dataset.",
+ )
parser.add_argument(
"--buffer_size",
type=int,
default=100,
- help="The buffer size of the text_image_pair dataset.", )
+ help="The buffer size of the text_image_pair dataset.",
+ )
parser.add_argument(
"--shuffle_every_n_samples",
type=int,
default=5,
- help="The shuffle_every_n_samples of the text_image_pair dataset.", )
+ help="The shuffle_every_n_samples of the text_image_pair dataset.",
+ )
parser.add_argument(
"--init_from_ckpt",
type=str,
default=None,
- help="The path of checkpoint to be loaded.", )
+ help="The path of checkpoint to be loaded.",
+ )
# loss fn
parser.add_argument(
"--disc_start",
type=int,
default=50001,
- help="The number of steps the discriminator started.", )
+        help="The training step at which the discriminator loss starts.",
+ )
parser.add_argument(
"--kl_weight",
type=float,
default=1.0e-6,
- help="The weight ratio of the kl_loss.", )
+ help="The weight ratio of the kl_loss.",
+ )
parser.add_argument(
"--disc_weight",
type=float,
default=0.5,
- help="The weight ratio of the disc_loss.", )
+ help="The weight ratio of the disc_loss.",
+ )
parser.add_argument(
"--logvar_init",
type=float,
default=0.0,
- help="The init value of the output log variances.", )
+ help="The init value of the output log variances.",
+ )
parser.add_argument(
"--pixelloss_weight",
type=float,
default=1.0,
- help="The weight ratio of the pixelloss.", )
+ help="The weight ratio of the pixelloss.",
+ )
parser.add_argument(
"--disc_num_layers",
type=int,
default=3,
- help="The num layers of the discriminator.", )
+ help="The num layers of the discriminator.",
+ )
parser.add_argument(
"--disc_in_channels",
type=int,
default=3,
- help="The in channels of the discriminator.", )
+ help="The in channels of the discriminator.",
+ )
parser.add_argument(
"--disc_factor",
type=float,
default=1.0,
- help="The factor of the discriminator loss.", )
+ help="The factor of the discriminator loss.",
+ )
parser.add_argument(
"--perceptual_weight",
type=float,
default=1.0,
- help="The weight ratio of the perceptual loss.", )
+ help="The weight ratio of the perceptual loss.",
+ )
parser.add_argument(
"--use_actnorm",
action="store_true",
- help="Whether to use actnorm in NLayerDiscriminator layer.", )
+ help="Whether to use actnorm in NLayerDiscriminator layer.",
+ )
parser.add_argument(
"--disc_conditional",
action="store_true",
- help="Whether to use conditional discriminator.", )
+ help="Whether to use conditional discriminator.",
+ )
parser.add_argument(
"--disc_loss",
type=str,
choices=["hinge", "vanilla"],
default="hinge",
- help="The type of discriminator loss.", )
- parser.add_argument(
- "--use_ema", action="store_true", help="Whether to use_ema.")
+ help="The type of discriminator loss.",
+ )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA weights.")
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether to enable_xformers_memory_efficient_attention.", )
- parser.add_argument(
- "--recompute", action="store_true", help="Whether to recompute.")
- parser.add_argument(
- "--ema_decay",
- type=float,
- default=0.9999,
- help="The value of ema_decay.")
+ help="Whether to enable_xformers_memory_efficient_attention.",
+ )
+ parser.add_argument("--recompute", action="store_true", help="Whether to recompute.")
+ parser.add_argument("--ema_decay", type=float, default=0.9999, help="The value of ema_decay.")
args = parser.parse_args()
args.logging_dir = os.path.join(args.output_dir, args.logging_dir)
- args.image_logging_steps = (
- math.ceil(args.image_logging_steps / args.logging_steps) *
- args.logging_steps)
+ args.image_logging_steps = math.ceil(args.image_logging_steps / args.logging_steps) * args.logging_steps
return args
@@ -358,7 +359,8 @@ def main():
disc_loss=args.disc_loss,
ema_decay=args.ema_decay,
use_ema=args.use_ema,
- **model_kwargs, )
+ **model_kwargs,
+ )
else:
assert args.vae_config_file is not None, "We must supply vae_config_file!"
        # Load config: train model from scratch
@@ -378,7 +380,8 @@ def main():
disc_conditional=args.disc_conditional,
disc_loss=args.disc_loss,
ema_decay=args.ema_decay,
- use_ema=args.use_ema, )
+ use_ema=args.use_ema,
+ )
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
@@ -390,8 +393,7 @@ def main():
args.learning_rate = num_processes * args.batch_size * args.learning_rate
# configure_optimizers
- parameters = list(vae.decoder.parameters()) + list(
- vae.post_quant_conv.parameters())
+ parameters = list(vae.decoder.parameters()) + list(vae.post_quant_conv.parameters())
# we may freeze_encoder
if not args.freeze_encoder:
parameters += list(vae.encoder.parameters())
@@ -401,16 +403,13 @@ def main():
freeze_params(vae.quant_conv.parameters())
print("Freeze vae.encoder.parameters and vae.quant_conv.parameters!")
- opt_ae = Adam(
- parameters=parameters,
- learning_rate=args.learning_rate,
- beta1=0.5,
- beta2=0.9)
+ opt_ae = Adam(parameters=parameters, learning_rate=args.learning_rate, beta1=0.5, beta2=0.9)
opt_disc = Adam(
parameters=vae.loss.discriminator.parameters(),
learning_rate=args.learning_rate,
beta1=0.5,
- beta2=0.9, )
+ beta2=0.9,
+ )
if args.use_ema:
vae.model_ema = LitEma(vae, decay=args.ema_decay)
if args.recompute:
@@ -427,27 +426,17 @@ def main():
from ldm import ImageNetSRTrain, ImageNetSRValidation
with main_process_first():
- train_dataset = ImageNetSRTrain(
- size=args.resolution, degradation=args.degradation)
- val_dataset = ImageNetSRValidation(
- size=args.resolution, degradation=args.degradation)
- train_sampler = (DistributedBatchSampler(
- train_dataset, batch_size=args.batch_size, shuffle=True)
- if num_processes > 1 else BatchSampler(
- train_dataset,
- batch_size=args.batch_size,
- shuffle=True))
- train_dataloader = DataLoader(
- train_dataset,
- batch_sampler=train_sampler,
- num_workers=args.num_workers)
-
- val_sampler = BatchSampler(
- val_dataset, batch_size=args.batch_size * 2, shuffle=False)
- val_dataloader = DataLoader(
- val_dataset,
- batch_sampler=val_sampler,
- num_workers=args.num_workers)
+ train_dataset = ImageNetSRTrain(size=args.resolution, degradation=args.degradation)
+ val_dataset = ImageNetSRValidation(size=args.resolution, degradation=args.degradation)
+ train_sampler = (
+ DistributedBatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True)
+ if num_processes > 1
+ else BatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True)
+ )
+ train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=args.num_workers)
+
+ val_sampler = BatchSampler(val_dataset, batch_size=args.batch_size * 2, shuffle=False)
+ val_dataloader = DataLoader(val_dataset, batch_sampler=val_sampler, num_workers=args.num_workers)
else:
train_dataset = TextImagePair(
file_list=args.file_list,
@@ -455,19 +444,21 @@ def main():
num_records=args.num_records,
buffer_size=args.buffer_size,
shuffle_every_n_samples=args.shuffle_every_n_samples,
- interpolation="lanczos", )
+ interpolation="lanczos",
+ )
train_dataloader = DataLoader(
train_dataset,
batch_size=args.batch_size,
num_workers=args.num_workers,
- worker_init_fn=worker_init_fn, )
+ worker_init_fn=worker_init_fn,
+ )
val_dataloader = val_dataset = None
# Scheduler and math around the number of training steps.
overrode_max_train_steps = False
num_update_steps_per_epoch = (
- len(train_dataloader) if args.dataset_type == "imagenet" else
- math.ceil(len(train_dataset) / args.batch_size))
+ len(train_dataloader) if args.dataset_type == "imagenet" else math.ceil(len(train_dataset) / args.batch_size)
+ )
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
overrode_max_train_steps = True
@@ -475,8 +466,7 @@ def main():
if overrode_max_train_steps:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps /
- num_update_steps_per_epoch)
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
if rank == 0:
logger.info("----------- Configuration Arguments -----------")
@@ -492,9 +482,7 @@ def main():
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed) = {total_batch_size}"
- )
+ logger.info(f" Total train batch size (w. parallel, distributed) = {total_batch_size}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
logger.info(
f" Number of trainable parameters = {sum(p.numel().item() for p in vae.parameters() if not p.stop_gradient) }"
@@ -515,9 +503,7 @@ def main():
# pytorch_lightning use this `toggle_optimizer` method
# ref: https://github.com/Lightning-AI/lightning/blob/a58639ce7e864dd70484e7d34c37730ae204183c/src/pytorch_lightning/core/module.py#L1419-L1447
unwrap_model(vae).toggle_optimizer(optimizers, optimizer_idx)
- loss, log_dict = vae(batch["image"],
- optimizer_idx=optimizer_idx,
- global_step=global_step)
+ loss, log_dict = vae(batch["image"], optimizer_idx=optimizer_idx, global_step=global_step)
optimizers[optimizer_idx].clear_grad()
loss.backward()
optimizers[optimizer_idx].step()
@@ -541,17 +527,13 @@ def main():
if global_step % args.image_logging_steps == 0:
images_log = unwrap_model(vae).log_images(batch["image"])
for name, val in images_log.items():
- writer.add_image(
- name, val, global_step, dataformats="NHWC")
+ writer.add_image(name, val, global_step, dataformats="NHWC")
# saving
if global_step % args.save_steps == 0:
if val_dataloader is not None:
- run_evaluate(
- unwrap_model(vae), val_dataloader, writer,
- global_step)
- output_dir = os.path.join(
- args.output_dir, "checkpoint-{}".format(global_step))
+ run_evaluate(unwrap_model(vae), val_dataloader, writer, global_step)
+ output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
unwrap_model(vae).save_pretrained(output_dir)
del logs
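One detail of parse_args above worth calling out: image_logging_steps is rounded up to the nearest multiple of logging_steps, so image logging always coincides with a step on which scalar logs are written. A tiny sketch with assumed values:

import math

logging_steps = 100
for requested in (450, 500, 501):
    effective = math.ceil(requested / logging_steps) * logging_steps
    print(requested, "->", effective)  # 450 -> 500, 500 -> 500, 501 -> 600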
diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py
index 0c715dcb16fff..0c943be785d26 100644
--- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py
+++ b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py
@@ -13,19 +13,15 @@
@patch_to(BeamHypotheses)
-def add(self: BeamHypotheses,
- hyp: paddle.Tensor,
- sum_logprobs: float,
- origin_len: int=0) -> None:
+def add(self: BeamHypotheses, hyp: paddle.Tensor, sum_logprobs: float, origin_len: int = 0) -> None:
"""
Add a new hypothesis to the list.
"""
- score = sum_logprobs / (hyp.shape[-1]**self.length_penalty)
+ score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
if len(self) < self.num_beams or score > self.worst_score:
self.beams.append((score, hyp))
if len(self) > self.num_beams:
- sorted_next_scores = sorted(
- [(s, idx) for idx, (s, _) in enumerate(self.beams)])
+ sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
del self.beams[sorted_next_scores[0][1]]
self.worst_score = sorted_next_scores[1][0]
else:
@@ -33,10 +29,7 @@ def add(self: BeamHypotheses,
@patch_to(BeamHypotheses)
-def is_done(self: BeamHypotheses,
- best_sum_logprobs: float,
- cur_len: int,
- origin_len: int=0) -> bool:
+def is_done(self: BeamHypotheses, best_sum_logprobs: float, cur_len: int, origin_len: int = 0) -> bool:
"""
If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
one in the heap, then we are done with this sentence.
@@ -54,35 +47,31 @@ def is_done(self: BeamHypotheses,
class BLIP_Decoder(nn.Layer):
def __init__(
- self,
- pretrained_model_name_or_path,
- prompt="a picture of ", ):
+ self,
+ pretrained_model_name_or_path,
+ prompt="a picture of ",
+ ):
super().__init__()
- self.text_decoder = BlipForConditionalGeneration.from_pretrained(
- pretrained_model_name_or_path)
+ self.text_decoder = BlipForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
self.text_decoder.eval()
- self.processor = BlipProcessor.from_pretrained(
- pretrained_model_name_or_path)
+ self.processor = BlipProcessor.from_pretrained(pretrained_model_name_or_path)
self.processor.tokenizer.add_special_tokens({"bos_token": "[DEC]"})
- self.processor.tokenizer.add_special_tokens({
- "additional_special_tokens": ["[ENC]"]
- })
- self.processor.tokenizer.enc_token_id = (
- self.processor.tokenizer.additional_special_tokens_ids[0])
+ self.processor.tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]})
+ self.processor.tokenizer.enc_token_id = self.processor.tokenizer.additional_special_tokens_ids[0]
self.prompt = prompt
- self.prompt_length = len(
- self.processor.tokenizer(self.prompt).input_ids) - 1
+ self.prompt_length = len(self.processor.tokenizer(self.prompt).input_ids) - 1
def generate(
- self,
- image,
- prompt=None,
- sample=False,
- num_beams=3,
- max_length=30,
- min_length=10,
- top_p=0.9,
- repetition_penalty=1.0, ):
+ self,
+ image,
+ prompt=None,
+ sample=False,
+ num_beams=3,
+ max_length=30,
+ min_length=10,
+ top_p=0.9,
+ repetition_penalty=1.0,
+ ):
if prompt is None:
prompt = self.prompt
prompt_length = self.prompt_length
@@ -93,8 +82,7 @@ def generate(
else:
model_kwargs = {"pixel_values": image}
prompt = [prompt] * model_kwargs["pixel_values"].shape[0]
- input_ids = self.processor.tokenizer(
- prompt, return_tensors="pd").input_ids
+ input_ids = self.processor.tokenizer(prompt, return_tensors="pd").input_ids
if sample:
# nucleus sampling
@@ -106,7 +94,8 @@ def generate(
top_p=top_p,
num_return_sequences=1,
repetition_penalty=repetition_penalty,
- **model_kwargs, )[0]
+ **model_kwargs,
+ )[0]
else:
if num_beams == 1:
# greedy search
@@ -115,7 +104,8 @@ def generate(
max_length=max_length - prompt_length,
min_length=min_length,
decode_strategy="greedy_search",
- **model_kwargs, )[0]
+ **model_kwargs,
+ )[0]
else:
# beam search
outputs = self.text_decoder.generate(
@@ -126,11 +116,10 @@ def generate(
decode_strategy="beam_search",
repetition_penalty=repetition_penalty,
length_penalty=1.0, # note this is not
- **model_kwargs, )[0]
+ **model_kwargs,
+ )[0]
captions = []
for output in outputs:
- captions.append(
- self.processor.decode(
- output, skip_special_tokens=True))
+ captions.append(self.processor.decode(output, skip_special_tokens=True))
return captions
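The patched BeamHypotheses.add above scores a finished hypothesis as sum_logprobs / length ** length_penalty, a length-normalised log-probability, so longer captions are not rejected merely for having a more negative raw sum. A toy sketch of the scoring rule (numbers are made up):

def beam_score(sum_logprobs: float, length: int, length_penalty: float = 1.0) -> float:
    # with length_penalty = 1.0 this is simply the mean log-prob per token
    return sum_logprobs / (length ** length_penalty)

print(beam_score(-6.0, 10))  # -0.6: longer hypothesis, better per-token score
print(beam_score(-5.0, 5))   # -1.0: shorter, but worse once normalised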
diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py
index 380024d3d617e..9cefe1a3b543d 100644
--- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py
+++ b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py
@@ -63,19 +63,16 @@ def __init__(self, config: Config):
def load_blip_model(self):
config = self.config
- self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config.
- blip_pretrained_model_name_or_path)
+ self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config.blip_pretrained_model_name_or_path)
self.blip_model.eval()
def load_clip_model(self):
config = self.config
# clip model
- self.clip_model: CLIPModel = CLIPModel.from_pretrained(
- config.clip_pretrained_model_name_or_path)
+ self.clip_model: CLIPModel = CLIPModel.from_pretrained(config.clip_pretrained_model_name_or_path)
self.clip_model.eval()
- self.clip_preprocess = CLIPProcessor.from_pretrained(
- config.clip_pretrained_model_name_or_path)
+ self.clip_preprocess = CLIPProcessor.from_pretrained(config.clip_pretrained_model_name_or_path)
sites = [
"Artstation",
@@ -113,41 +110,45 @@ def load_clip_model(self):
return_tensors="pd",
padding="max_length",
truncation=True,
- max_length=self.clip_preprocess.tokenizer.model_max_length, )
- self.artists = LabelTable(artists, "artists", self.clip_model,
- self.tokenize, config)
+ max_length=self.clip_preprocess.tokenizer.model_max_length,
+ )
+ self.artists = LabelTable(artists, "artists", self.clip_model, self.tokenize, config)
self.flavors = LabelTable(
_load_list(config.data_path, "flavors.txt"),
"flavors",
self.clip_model,
self.tokenize,
- config, )
+ config,
+ )
self.mediums = LabelTable(
_load_list(config.data_path, "mediums.txt"),
"mediums",
self.clip_model,
self.tokenize,
- config, )
+ config,
+ )
self.movements = LabelTable(
_load_list(config.data_path, "movements.txt"),
"movements",
self.clip_model,
self.tokenize,
- config, )
- self.trendings = LabelTable(trending_list, "trendings", self.clip_model,
- self.tokenize, config)
+ config,
+ )
+ self.trendings = LabelTable(trending_list, "trendings", self.clip_model, self.tokenize, config)
self.pad_token_id = self.clip_preprocess.tokenizer.pad_token_id
def generate_caption(self, pil_image: Image) -> str:
size = self.config.blip_image_eval_size
- gpu_image = transforms.Compose([
- transforms.Resize(
- (size, size), interpolation="bicubic"),
- transforms.ToTensor(),
- transforms.Normalize(
- self.clip_preprocess.image_processor.image_mean,
- self.clip_preprocess.image_processor.image_std, ),
- ])(pil_image).unsqueeze(0)
+ gpu_image = transforms.Compose(
+ [
+ transforms.Resize((size, size), interpolation="bicubic"),
+ transforms.ToTensor(),
+ transforms.Normalize(
+ self.clip_preprocess.image_processor.image_mean,
+ self.clip_preprocess.image_processor.image_std,
+ ),
+ ]
+ )(pil_image).unsqueeze(0)
with paddle.no_grad():
caption = self.blip_model.generate(
@@ -157,18 +158,18 @@ def generate_caption(self, pil_image: Image) -> str:
max_length=self.config.blip_max_length,
min_length=self.config.blip_min_length,
top_p=self.config.blip_top_p,
- repetition_penalty=self.config.blip_repetition_penalty, )
+ repetition_penalty=self.config.blip_repetition_penalty,
+ )
return caption[0]
def image_to_features(self, image: Image) -> paddle.Tensor:
images = self.clip_preprocess(images=image, return_tensors="pd")
with paddle.no_grad():
- image_features = self.clip_model.get_image_features(images[
- "pixel_values"])
+ image_features = self.clip_model.get_image_features(images["pixel_values"])
image_features /= image_features.norm(axis=-1, keepdim=True)
return image_features
- def interrogate_classic(self, image: Image, max_flavors: int=3) -> str:
+ def interrogate_classic(self, image: Image, max_flavors: int = 3) -> str:
caption = self.generate_caption(image)
image_features = self.image_to_features(image)
@@ -185,25 +186,21 @@ def interrogate_classic(self, image: Image, max_flavors: int=3) -> str:
return _truncate_to_fit(prompt, self.tokenize, self.pad_token_id)
- def interrogate_fast(self, image: Image, max_flavors: int=32) -> str:
+ def interrogate_fast(self, image: Image, max_flavors: int = 32) -> str:
caption = self.generate_caption(image)
image_features = self.image_to_features(image)
merged = _merge_tables(
- [
- self.artists, self.flavors, self.mediums, self.movements,
- self.trendings
- ],
- self.config, )
+ [self.artists, self.flavors, self.mediums, self.movements, self.trendings],
+ self.config,
+ )
tops = merged.rank(image_features, max_flavors)
- return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize,
- self.pad_token_id)
+ return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize, self.pad_token_id)
- def interrogate(self, image: Image, max_flavors: int=32) -> str:
+ def interrogate(self, image: Image, max_flavors: int = 32) -> str:
caption = self.generate_caption(image)
image_features = self.image_to_features(image)
- flaves = self.flavors.rank(image_features,
- self.config.flavor_intermediate_count)
+ flaves = self.flavors.rank(image_features, self.config.flavor_intermediate_count)
best_medium = self.mediums.rank(image_features, 1)[0]
best_artist = self.artists.rank(image_features, 1)[0]
best_trending = self.trendings.rank(image_features, 1)[0]
@@ -225,65 +222,50 @@ def check(addition: str) -> bool:
def check_multi_batch(opts: List[str]):
nonlocal best_prompt, best_sim
prompts = []
- for i in range(2**len(opts)):
+ for i in range(2 ** len(opts)):
prompt = best_prompt
for bit in range(len(opts)):
if i & (1 << bit):
prompt += ", " + opts[bit]
prompts.append(prompt)
- t = LabelTable(prompts, None, self.clip_model, self.tokenize,
- self.config)
+ t = LabelTable(prompts, None, self.clip_model, self.tokenize, self.config)
best_prompt = t.rank(image_features, 1)[0]
best_sim = self.similarity(image_features, best_prompt)
- check_multi_batch(
- [best_medium, best_artist, best_trending, best_movement])
+ check_multi_batch([best_medium, best_artist, best_trending, best_movement])
extended_flavors = set(flaves)
- for i in tqdm(
- range(max_flavors), desc="Flavor chain",
- disable=self.config.quiet):
- best = self.rank_top(
- image_features,
- [f"{best_prompt}, {f}" for f in extended_flavors])
- flave = best[len(best_prompt) + 2:]
+ for i in tqdm(range(max_flavors), desc="Flavor chain", disable=self.config.quiet):
+ best = self.rank_top(image_features, [f"{best_prompt}, {f}" for f in extended_flavors])
+ flave = best[len(best_prompt) + 2 :]
if not check(flave):
break
- if _prompt_at_max_len(best_prompt, self.tokenize,
- self.pad_token_id):
+ if _prompt_at_max_len(best_prompt, self.tokenize, self.pad_token_id):
break
extended_flavors.remove(flave)
return best_prompt
- def rank_top(self, image_features: paddle.Tensor,
- text_array: List[str]) -> str:
+ def rank_top(self, image_features: paddle.Tensor, text_array: List[str]) -> str:
text_tokens = self.tokenize(text_array)
with paddle.no_grad():
- text_features = self.clip_model.get_text_features(text_tokens[
- "input_ids"])
+ text_features = self.clip_model.get_text_features(text_tokens["input_ids"])
text_features /= text_features.norm(axis=-1, keepdim=True)
- similarity = text_features @image_features.T
+ similarity = text_features @ image_features.T
return text_array[similarity.argmax().item()]
def similarity(self, image_features: paddle.Tensor, text: str) -> float:
text_tokens = self.tokenize([text])
with paddle.no_grad():
- text_features = self.clip_model.get_text_features(text_tokens[
- "input_ids"])
+ text_features = self.clip_model.get_text_features(text_tokens["input_ids"])
text_features /= text_features.norm(axis=-1, keepdim=True)
- similarity = text_features @image_features.T
+ similarity = text_features @ image_features.T
return similarity[0][0].item()
class LabelTable:
- def __init__(self,
- labels: List[str],
- desc: str,
- clip_model,
- tokenize,
- config: Config):
+ def __init__(self, labels: List[str], desc: str, clip_model, tokenize, config: Config):
self.chunk_size = config.chunk_size
self.config = config
self.embeds = []
@@ -295,10 +277,8 @@ def __init__(self,
cache_filepath = None
if config.cache_path is not None and desc is not None:
os.makedirs(config.cache_path, exist_ok=True)
- sanitized_name = config.clip_pretrained_model_name_or_path.replace(
- "/", "_").replace("@", "_")
- cache_filepath = os.path.join(config.cache_path,
- f"{sanitized_name}_{desc}.pkl")
+ sanitized_name = config.clip_pretrained_model_name_or_path.replace("/", "_").replace("@", "_")
+ cache_filepath = os.path.join(config.cache_path, f"{sanitized_name}_{desc}.pkl")
if desc is not None and os.path.exists(cache_filepath):
with open(cache_filepath, "rb") as f:
try:
@@ -311,16 +291,15 @@ def __init__(self,
if len(self.labels) != len(self.embeds):
self.embeds = []
- chunks = np.array_split(
- self.labels, max(1, len(self.labels) / config.chunk_size))
+ chunks = np.array_split(self.labels, max(1, len(self.labels) / config.chunk_size))
for chunk in tqdm(
- chunks,
- desc=f"Preprocessing {desc}" if desc else None,
- disable=self.config.quiet, ):
+ chunks,
+ desc=f"Preprocessing {desc}" if desc else None,
+ disable=self.config.quiet,
+ ):
text_tokens = self.tokenize(chunk.tolist())
with paddle.no_grad():
- text_features = clip_model.get_text_features(text_tokens[
- "input_ids"])
+ text_features = clip_model.get_text_features(text_tokens["input_ids"])
text_features /= text_features.norm(axis=-1, keepdim=True)
text_features = text_features.cpu().numpy()
for i in range(text_features.shape[0]):
@@ -335,22 +314,23 @@ def __init__(self,
"hash": hash,
"model": config.clip_pretrained_model_name_or_path,
},
- f, )
+ f,
+ )
def _rank(
- self,
- image_features: paddle.Tensor,
- text_embeds: paddle.Tensor,
- top_count: int=1, ) -> str:
+ self,
+ image_features: paddle.Tensor,
+ text_embeds: paddle.Tensor,
+ top_count: int = 1,
+ ) -> str:
top_count = min(top_count, len(text_embeds))
text_embeds = paddle.to_tensor(text_embeds)
- similarity = image_features @text_embeds.T
+ similarity = image_features @ text_embeds.T
_, top_labels = similarity.cast("float32").topk(top_count, axis=-1)
top_labels = top_labels.tolist()
return [top_labels[0][i] for i in range(top_count)]
- def rank(self, image_features: paddle.Tensor,
- top_count: int=1) -> List[str]:
+ def rank(self, image_features: paddle.Tensor, top_count: int = 1) -> List[str]:
if len(self.labels) <= self.chunk_size:
tops = self._rank(image_features, self.embeds, top_count=top_count)
return [self.labels[i] for i in tops]
@@ -362,10 +342,7 @@ def rank(self, image_features: paddle.Tensor,
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
start = chunk_idx * self.chunk_size
stop = min(start + self.chunk_size, len(self.embeds))
- tops = self._rank(
- image_features,
- self.embeds[start:stop],
- top_count=keep_per_chunk)
+ tops = self._rank(image_features, self.embeds[start:stop], top_count=keep_per_chunk)
top_labels.extend([self.labels[start + i] for i in tops])
top_embeds.extend([self.embeds[start + i] for i in tops])
@@ -374,11 +351,7 @@ def rank(self, image_features: paddle.Tensor,
def _load_list(data_path: str, filename: str) -> List[str]:
- with open(
- os.path.join(data_path, filename),
- "r",
- encoding="utf-8",
- errors="replace") as f:
+ with open(os.path.join(data_path, filename), "r", encoding="utf-8", errors="replace") as f:
items = [line.strip() for line in f.readlines()]
return items
@@ -391,7 +364,7 @@ def _merge_tables(tables: List[LabelTable], config: Config) -> LabelTable:
return m
-def _prompt_at_max_len(text: str, tokenize, pad_token_id: int=0) -> bool:
+def _prompt_at_max_len(text: str, tokenize, pad_token_id: int = 0) -> bool:
tokens = tokenize([text])["input_ids"]
return tokens[0][-1] != pad_token_id
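LabelTable._rank and rank_top above both rely on the same trick: image and text features are L2-normalised, so a plain matrix product yields cosine similarities and topk/argmax picks the best labels. A minimal NumPy sketch with synthetic embeddings (shapes and values are illustrative only):

import numpy as np

rng = np.random.default_rng(0)
image_features = rng.normal(size=(1, 8))  # stand-in for a CLIP image embedding
text_embeds = rng.normal(size=(5, 8))     # stand-in for 5 label embeddings

image_features /= np.linalg.norm(image_features, axis=-1, keepdims=True)
text_embeds /= np.linalg.norm(text_embeds, axis=-1, keepdims=True)

similarity = image_features @ text_embeds.T  # (1, 5) cosine scores
top = np.argsort(-similarity[0])[:3]         # indices of the 3 best labels
print(top.tolist(), similarity[0][top])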
diff --git a/ppdiffusers/examples/clip_interrogator/dumpy.py b/ppdiffusers/examples/clip_interrogator/dumpy.py
index 9a6e930b2e198..552e84eae5944 100644
--- a/ppdiffusers/examples/clip_interrogator/dumpy.py
+++ b/ppdiffusers/examples/clip_interrogator/dumpy.py
@@ -14,9 +14,12 @@
# limitations under the License.
import gradio as gr
-from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config,
- Interrogator)
+from clip_interrogator import (
+ BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ Config,
+ Interrogator,
+)
blip_pretrained_model_name_or_path = "Salesforce/blip-image-captioning-base"
clip_pretrained_model_name_or_path = "openai/clip-vit-large-patch14"
@@ -38,16 +41,18 @@
config = Config(
blip_num_beams=64,
blip_pretrained_model_name_or_path=blip_pretrained_model_name_or_path,
- clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path, )
+ clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path,
+)
ci = Interrogator(config)
def inference(image, mode, best_max_flavors=32):
- ci.config.chunk_size = (2048 if ci.config.clip_pretrained_model_name_or_path
- == "openai/clip-vit-large-patch14" else 1024)
+ ci.config.chunk_size = (
+ 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024
+ )
ci.config.flavor_intermediate_count = (
- 2048 if ci.config.clip_pretrained_model_name_or_path ==
- "openai/clip-vit-large-patch14" else 1024)
+ 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024
+ )
image = image.convert("RGB")
if mode == "best":
return ci.interrogate(image, max_flavors=int(best_max_flavors))
@@ -59,16 +64,17 @@ def inference(image, mode, best_max_flavors=32):
inputs = [
gr.inputs.Image(type="pil"),
- gr.Radio(
- ["best", "fast", "classic"], label="", value="best"),
- gr.Number(
- value=16, label="best mode max flavors"),
+ gr.Radio(["best", "fast", "classic"], label="", value="best"),
+ gr.Number(value=16, label="best mode max flavors"),
+]
+outputs = [
+ gr.outputs.Textbox(label="Output"),
]
-outputs = [gr.outputs.Textbox(label="Output"), ]
io = gr.Interface(
inference,
inputs,
outputs,
- allow_flagging=False, )
+ allow_flagging=False,
+)
io.launch(debug=False, server_name="0.0.0.0", server_port=8586)
diff --git a/ppdiffusers/examples/clip_interrogator/predict.py b/ppdiffusers/examples/clip_interrogator/predict.py
index d42d5a666a53c..bb6dd5f6004b7 100644
--- a/ppdiffusers/examples/clip_interrogator/predict.py
+++ b/ppdiffusers/examples/clip_interrogator/predict.py
@@ -15,9 +15,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config,
- Interrogator)
+from clip_interrogator import (
+ BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ Config,
+ Interrogator,
+)
from cog import BasePredictor, Input, Path
from PIL import Image
@@ -28,29 +31,32 @@ def setup(self):
Config(
blip_pretrained_model_name_or_path="Salesforce/blip-image-captioning-large",
clip_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
- device="gpu", ))
+ device="gpu",
+ )
+ )
def predict(
- self,
- image: Path=Input(description="Input image"),
- clip_pretrained_model_name_or_path: str=Input(
- default="openai/clip-vit-large-patch14",
- choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2",
- ),
- blip_pretrained_model_name_or_path: str=Input(
- default="Salesforce/blip-image-captioning-large",
- choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- description="Choose Salesforce/blip-image-captioning-large", ),
- mode: str=Input(
- default="best",
- choices=["best", "classic", "fast"],
- description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).",
- ), ) -> str:
+ self,
+ image: Path = Input(description="Input image"),
+ clip_pretrained_model_name_or_path: str = Input(
+ default="openai/clip-vit-large-patch14",
+ choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2",
+ ),
+ blip_pretrained_model_name_or_path: str = Input(
+ default="Salesforce/blip-image-captioning-large",
+ choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ description="Choose Salesforce/blip-image-captioning-large",
+ ),
+ mode: str = Input(
+ default="best",
+ choices=["best", "classic", "fast"],
+ description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).",
+ ),
+ ) -> str:
"""Run a single prediction on the model"""
image = Image.open(str(image)).convert("RGB")
- self.switch_model(clip_pretrained_model_name_or_path,
- blip_pretrained_model_name_or_path)
+ self.switch_model(clip_pretrained_model_name_or_path, blip_pretrained_model_name_or_path)
if mode == "best":
return self.ci.interrogate(image)
elif mode == "classic":
@@ -59,16 +65,13 @@ def predict(
return self.ci.interrogate_fast(image)
def switch_model(
- self,
- clip_pretrained_model_name_or_path: str,
- blip_pretrained_model_name_or_path: str, ):
- if (clip_pretrained_model_name_or_path !=
- self.ci.config.clip_pretrained_model_name_or_path):
- self.ci.config.clip_pretrained_model_name_or_path = (
- clip_pretrained_model_name_or_path)
+ self,
+ clip_pretrained_model_name_or_path: str,
+ blip_pretrained_model_name_or_path: str,
+ ):
+ if clip_pretrained_model_name_or_path != self.ci.config.clip_pretrained_model_name_or_path:
+ self.ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path
self.ci.load_clip_model()
- if (blip_pretrained_model_name_or_path !=
- self.ci.config.blip_pretrained_model_name_or_path):
- self.ci.config.blip_pretrained_model_name_or_path = (
- blip_pretrained_model_name_or_path)
+ if blip_pretrained_model_name_or_path != self.ci.config.blip_pretrained_model_name_or_path:
+ self.ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path
self.ci.load_blip_model()
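switch_model above reloads the CLIP or BLIP checkpoint only when the requested name differs from the one already held in the config, so repeated predictions with unchanged settings stay cheap. A small sketch of that lazy-reload pattern (the class and the loader string below are hypothetical stand-ins, not part of the example code):

class LazyHolder:
    def __init__(self):
        self.name = None
        self.model = None

    def switch(self, name: str):
        if name != self.name:              # reload only when the identifier changed
            self.name = name
            self.model = f"loaded:{name}"  # stand-in for an expensive from_pretrained call
            print("reloaded", name)

holder = LazyHolder()
holder.switch("openai/clip-vit-large-patch14")  # triggers a load
holder.switch("openai/clip-vit-large-patch14")  # no-op
holder.switch("openai/clip-vit-base-patch32")   # triggers a load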
diff --git a/ppdiffusers/examples/clip_interrogator/run_cli.py b/ppdiffusers/examples/clip_interrogator/run_cli.py
index 081717fcf915d..c905195af03f8 100755
--- a/ppdiffusers/examples/clip_interrogator/run_cli.py
+++ b/ppdiffusers/examples/clip_interrogator/run_cli.py
@@ -21,9 +21,12 @@
import paddle
import requests
-from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config,
- Interrogator)
+from clip_interrogator import (
+ BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ Config,
+ Interrogator,
+)
from PIL import Image
@@ -44,18 +47,16 @@ def main():
"--clip",
default="openai/clip-vit-large-patch14",
choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- help="name of CLIP model to use", )
+ help="name of CLIP model to use",
+ )
parser.add_argument(
"-b",
"--blip",
default="Salesforce/blip-image-captioning-large",
choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- help="name of BLIP model to use", )
- parser.add_argument(
- "-d",
- "--device",
- default="auto",
- help="device to use (auto, gpu or cpu)")
+ help="name of BLIP model to use",
+ )
+ parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)")
parser.add_argument("-f", "--folder", help="path to folder of images")
parser.add_argument("-i", "--image", help="image file or url")
parser.add_argument(
@@ -63,7 +64,8 @@ def main():
"--mode",
default="best",
choices=["best", "classic", "fast"],
- help="best, classic, or fast", )
+ help="best, classic, or fast",
+ )
args = parser.parse_args()
if not args.folder and not args.image:
@@ -71,8 +73,7 @@ def main():
exit(1)
if args.folder is not None and args.image is not None:
- print(
- "Specify a folder or batch processing or a single image, not both")
+ print("Specify a folder or batch processing or a single image, not both")
exit(1)
# validate clip model name
@@ -98,16 +99,15 @@ def main():
# generate a nice prompt
config = Config(
clip_pretrained_model_name_or_path=args.clip,
- blip_pretrained_model_name_or_path=args.blip, )
+ blip_pretrained_model_name_or_path=args.blip,
+ )
ci = Interrogator(config)
# process single image
if args.image is not None:
image_path = args.image
- if str(image_path).startswith("http://") or str(image_path).startswith(
- "https://"):
- image = Image.open(requests.get(image_path, stream=True)
- .raw).convert("RGB")
+ if str(image_path).startswith("http://") or str(image_path).startswith("https://"):
+ image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB")
else:
image = Image.open(image_path).convert("RGB")
if not image:
@@ -121,10 +121,7 @@ def main():
print(f"The folder {args.folder} does not exist!")
exit(1)
- files = [
- f for f in os.listdir(args.folder)
- if f.endswith(".jpg") or f.endswith(".png")
- ]
+ files = [f for f in os.listdir(args.folder) if f.endswith(".jpg") or f.endswith(".png")]
prompts = []
for file in files:
image = Image.open(os.path.join(args.folder, file)).convert("RGB")
@@ -140,9 +137,7 @@ def main():
for file, prompt in zip(files, prompts):
w.writerow([file, prompt])
- print(
- f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!"
- )
+ print(f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!")
if __name__ == "__main__":
diff --git a/ppdiffusers/examples/clip_interrogator/run_gradio.py b/ppdiffusers/examples/clip_interrogator/run_gradio.py
index 435c7c46a265b..60c35b66fe030 100755
--- a/ppdiffusers/examples/clip_interrogator/run_gradio.py
+++ b/ppdiffusers/examples/clip_interrogator/run_gradio.py
@@ -19,9 +19,12 @@
import gradio as gr
import paddle
-from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config,
- Interrogator)
+from clip_interrogator import (
+ BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+ Config,
+ Interrogator,
+)
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -29,19 +32,18 @@
"--clip",
default="openai/clip-vit-large-patch14",
choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- help="name of CLIP model to use", )
+ help="name of CLIP model to use",
+)
parser.add_argument(
"-b",
"--blip",
default="Salesforce/blip-image-captioning-large",
choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
- help="name of BLIP model to use", )
-parser.add_argument(
- "-d", "--device", default="auto", help="device to use (auto, gpu or cpu)")
-parser.add_argument(
- "-s", "--share", action="store_true", help="Create a public link")
-parser.add_argument(
- "--server_name", default="0.0.0.0", type=str, help="server_name")
+ help="name of BLIP model to use",
+)
+parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)")
+parser.add_argument("-s", "--share", action="store_true", help="Create a public link")
+parser.add_argument("--server_name", default="0.0.0.0", type=str, help="server_name")
parser.add_argument("--server_port", default=8586, type=int, help="server_port")
args = parser.parse_args()
@@ -69,31 +71,29 @@
config = Config(
cache_path="cache",
clip_pretrained_model_name_or_path=args.clip,
- blip_pretrained_model_name_or_path=args.blip, )
+ blip_pretrained_model_name_or_path=args.blip,
+)
ci = Interrogator(config)
def inference(
- image,
- mode,
- clip_pretrained_model_name_or_path,
- blip_pretrained_model_name_or_path,
- blip_min_length,
- blip_max_length,
- blip_sample,
- blip_top_p,
- blip_repetition_penalty,
- blip_num_beams, ):
- if (clip_pretrained_model_name_or_path !=
- ci.config.clip_pretrained_model_name_or_path):
- ci.config.clip_pretrained_model_name_or_path = (
- clip_pretrained_model_name_or_path)
+ image,
+ mode,
+ clip_pretrained_model_name_or_path,
+ blip_pretrained_model_name_or_path,
+ blip_min_length,
+ blip_max_length,
+ blip_sample,
+ blip_top_p,
+ blip_repetition_penalty,
+ blip_num_beams,
+):
+ if clip_pretrained_model_name_or_path != ci.config.clip_pretrained_model_name_or_path:
+ ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path
ci.load_clip_model()
- if (blip_pretrained_model_name_or_path !=
- ci.config.blip_pretrained_model_name_or_path):
- ci.config.blip_pretrained_model_name_or_path = (
- blip_pretrained_model_name_or_path)
+ if blip_pretrained_model_name_or_path != ci.config.blip_pretrained_model_name_or_path:
+ ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path
ci.load_blip_model()
ci.config.blip_min_length = int(blip_min_length)
@@ -114,36 +114,25 @@ def inference(
inputs = [
gr.inputs.Image(type="pil"),
- gr.Radio(
- ["best", "classic", "fast"], label="Mode", value="fast"),
- gr.Dropdown(
- CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip,
- label="CLIP Model"),
- gr.Dropdown(
- BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip,
- label="BLIP Model"),
- gr.Number(
- value=8, label="Caption min Length"),
- gr.Number(
- value=32, label="Caption Max Length"),
- gr.Radio(
- ["True", "False"], value="False", label="Sample or not?"),
- gr.Number(
- value=0.9, label="TopP value, when Sample is true"),
- gr.Number(
- value=1.1, label="Repetition penalty value, when Sample is false"),
- gr.Number(
- value=64, label="Caption Num Beams, when Sample is false"),
+ gr.Radio(["best", "classic", "fast"], label="Mode", value="fast"),
+ gr.Dropdown(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip, label="CLIP Model"),
+ gr.Dropdown(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip, label="BLIP Model"),
+ gr.Number(value=8, label="Caption min Length"),
+ gr.Number(value=32, label="Caption Max Length"),
+ gr.Radio(["True", "False"], value="False", label="Sample or not?"),
+ gr.Number(value=0.9, label="TopP value, when Sample is true"),
+ gr.Number(value=1.1, label="Repetition penalty value, when Sample is false"),
+ gr.Number(value=64, label="Caption Num Beams, when Sample is false"),
+]
+outputs = [
+ gr.outputs.Textbox(label="Image Caption Output"),
]
-outputs = [gr.outputs.Textbox(label="Image Caption Output"), ]
io = gr.Interface(
inference,
inputs,
outputs,
title="🕵️♂️ Paddle CLIP Interrogator 🕵️♂️",
- allow_flagging=False, )
-io.launch(
- share=args.share,
- server_name=args.server_name,
- server_port=args.server_port)
+ allow_flagging=False,
+)
+io.launch(share=args.share, server_name=args.server_name, server_port=args.server_port)
diff --git a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py
index 3ef59efaf907f..f4495bba5b6f4 100644
--- a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py
+++ b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py
@@ -20,18 +20,30 @@
import paddle.nn.functional as F
import PIL
from einops import rearrange
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPModel,
- CLIPTextModel, CLIPTokenizer)
+from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPModel,
+ CLIPTextModel,
+ CLIPTokenizer,
+)
from tqdm import tqdm
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline,
- DPMSolverMultistepScheduler, LMSDiscreteScheduler,
- PNDMScheduler, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DiffusionPipeline,
+ DPMSolverMultistepScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.loaders import FromCkptMixin
-from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
+ StableDiffusionPipelineOutput,
+)
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.utils import PIL_INTERPOLATION, logging, randn_tensor
logger = logging.get_logger(__name__)
@@ -43,11 +55,7 @@ def preprocess(image, w, h):
elif isinstance(image, PIL.Image.Image):
image = [image]
if isinstance(image[0], PIL.Image.Image):
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[(None), :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[(None), :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -82,11 +90,12 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
def spherical_dist_loss(x, y):
x = F.normalize(x=x, axis=-1)
y = F.normalize(x=y, axis=-1)
- return (paddle.divide(
- (x - y).norm(axis=-1), paddle.to_tensor(
- 2, dtype=x.dtype)).asin().pow(y=paddle.to_tensor(
- 2, dtype=x.dtype)).multiply(y=paddle.to_tensor(
- 2, dtype=x.dtype)))
+ return (
+ paddle.divide((x - y).norm(axis=-1), paddle.to_tensor(2, dtype=x.dtype))
+ .asin()
+ .pow(y=paddle.to_tensor(2, dtype=x.dtype))
+ .multiply(y=paddle.to_tensor(2, dtype=x.dtype))
+ )
def set_requires_grad(model, value):
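The reformatted spherical_dist_loss above computes 2 * arcsin(||x - y|| / 2) ** 2 on L2-normalised features; for unit vectors, arcsin of half the chord length is half the angle between them, so the loss grows with their great-circle distance. A NumPy sketch (synthetic vectors) mirroring the formula:

import numpy as np

def spherical_dist_loss(x, y):
    x = x / np.linalg.norm(x, axis=-1, keepdims=True)
    y = y / np.linalg.norm(y, axis=-1, keepdims=True)
    return 2 * np.arcsin(np.linalg.norm(x - y, axis=-1) / 2) ** 2

a = np.array([[1.0, 0.0]])
b = np.array([[0.0, 1.0]])        # orthogonal, i.e. 90 degrees apart
print(spherical_dist_loss(a, b))  # 2 * (pi/4) ** 2 ~= 1.2337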
@@ -97,20 +106,25 @@ def set_requires_grad(model, value):
class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, FromCkptMixin):
# _optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- clip_model: CLIPModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler,
- DPMSolverMultistepScheduler, ],
- feature_extractor: CLIPFeatureExtractor,
- safety_checker: StableDiffusionSafetyChecker,
- blip_model=None,
- blip_processor=None,
- clip_interrogator=None,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ clip_model: CLIPModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ feature_extractor: CLIPFeatureExtractor,
+ safety_checker: StableDiffusionSafetyChecker,
+ blip_model=None,
+ blip_processor=None,
+ clip_interrogator=None,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -139,18 +153,21 @@ def __init__(
blip_model=blip_model,
blip_processor=blip_processor,
clip_interrogator=clip_interrogator,
- safety_checker=safety_checker, )
+ safety_checker=safety_checker,
+ )
self.feature_extractor_size = (
- feature_extractor.size if isinstance(feature_extractor.size, int)
- else feature_extractor.size["shortest_edge"])
+ feature_extractor.size
+ if isinstance(feature_extractor.size, int)
+ else feature_extractor.size["shortest_edge"]
+ )
self.normalize = paddle.vision.transforms.Normalize(
- mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+ mean=feature_extractor.image_mean, std=feature_extractor.image_std
+ )
set_requires_grad(self.text_encoder, False)
set_requires_grad(self.clip_model, False)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
if slice_size == "auto":
slice_size = self.unet.config.attention_head_dim // 2
self.unet.set_attention_slice(slice_size)
@@ -171,46 +188,35 @@ def unfreeze_unet(self):
set_requires_grad(self.unet, True)
def get_timesteps(self, num_inference_steps, strength):
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start:]
return timesteps, num_inference_steps - t_start
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- dtype,
- generator=None):
+ def prepare_latents(self, image, timestep, batch_size, dtype, generator=None):
if not isinstance(image, paddle.Tensor):
- raise ValueError(
- f"`image` has to be of type `torch.Tensor` but is {type(image)}")
+ raise ValueError(f"`image` has to be of type `torch.Tensor` but is {type(image)}")
image = image.cast(dtype)
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(x=init_latents, axis=0)
else:
init_latents = self.vae.encode(image).latent_dist.sample(generator)
init_latents = 0.18215 * init_latents
- init_latents = init_latents.repeat_interleave(
- repeats=batch_size, axis=0)
- noise = randn_tensor(
- init_latents.shape, generator=generator, dtype=dtype)
+ init_latents = init_latents.repeat_interleave(repeats=batch_size, axis=0)
+ noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype)
# get latents
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -223,63 +229,53 @@ def get_image_description(self, image):
else:
# with paddle.no_grad(), paddle.amp.auto_cast():
inputs = self.blip_processor(images=image, return_tensors="pd")
- inputs["pixel_values"] = inputs["pixel_values"].cast(
- self.blip_model.dtype)
+ inputs["pixel_values"] = inputs["pixel_values"].cast(self.blip_model.dtype)
# out = self.blip_model.generate(**inputs, decode_strategy="beam_search", num_beams=2, length_penalty=0, max_length=5)
out = self.blip_model.generate(**inputs)
- return self.blip_processor.decode(
- out[0][0], skip_special_tokens=True)
+ return self.blip_processor.decode(out[0][0], skip_special_tokens=True)
def get_clip_image_embeddings(self, image, batch_size):
clip_image_input = self.feature_extractor.preprocess(image)
clip_image_features = (
- paddle.to_tensor(data=clip_image_input["pixel_values"][0])
- .unsqueeze(axis=0).astype(dtype="float16"))
- image_embeddings_clip = self.clip_model.get_image_features(
- clip_image_features)
- image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(
- p=2, axis=-1, keepdim=True)
- image_embeddings_clip = image_embeddings_clip.repeat_interleave(
- repeats=batch_size, axis=0)
+ paddle.to_tensor(data=clip_image_input["pixel_values"][0]).unsqueeze(axis=0).astype(dtype="float16")
+ )
+ image_embeddings_clip = self.clip_model.get_image_features(clip_image_features)
+ image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True)
+ image_embeddings_clip = image_embeddings_clip.repeat_interleave(repeats=batch_size, axis=0)
return image_embeddings_clip
@paddle.enable_grad()
def cond_fn(
- self,
- latents,
- timestep,
- index,
- text_embeddings,
- noise_pred_original,
- original_image_embeddings_clip,
- clip_guidance_scale, ):
+ self,
+ latents,
+ timestep,
+ index,
+ text_embeddings,
+ noise_pred_original,
+ original_image_embeddings_clip,
+ clip_guidance_scale,
+ ):
out_0 = latents.detach()
out_0.stop_gradient = not True
latents = out_0
latent_model_input = self.scheduler.scale_model_input(latents, timestep)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, timestep,
- encoder_hidden_states=text_embeddings).sample
- if isinstance(
- self.scheduler,
- (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)):
+ noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
+ if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)):
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
beta_prod_t = 1 - alpha_prod_t
# compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_original_sample = (
- latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5
+ pred_original_sample = (latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5
fac = paddle.sqrt(x=beta_prod_t)
sample = pred_original_sample * fac + latents * (1 - fac)
elif isinstance(self.scheduler, LMSDiscreteScheduler):
sigma = self.scheduler.sigmas[index]
sample = latents - sigma * noise_pred
else:
- raise ValueError(
- f"scheduler type {type(self.scheduler)} not supported")
+ raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
        # Hardcode 0.18215 because stable-diffusion-2-base does not have self.vae.config.scaling_factor
sample = 1 / 0.18215 * sample
@@ -289,56 +285,48 @@ def cond_fn(
# image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image)
c_size = image.shape[0]
image = rearrange(image, "c t h w -> (c t) h w")
- image = paddle.vision.transforms.Resize(self.feature_extractor_size)(
- image)
+ image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image)
image = rearrange(image, "(c t) h w -> c t h w", c=c_size)
image = self.normalize(image)
image_embeddings_clip = self.clip_model.get_image_features(image)
- image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(
- p=2, axis=-1, keepdim=True)
- loss = (spherical_dist_loss(image_embeddings_clip,
- original_image_embeddings_clip).mean() *
- clip_guidance_scale)
+ image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True)
+ loss = spherical_dist_loss(image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale
grads = -paddle.autograd.grad(loss, latents)[0]
if isinstance(self.scheduler, LMSDiscreteScheduler):
latents = latents.detach() + grads * sigma**2
noise_pred = noise_pred_original
else:
- noise_pred = noise_pred_original - paddle.sqrt(
- x=beta_prod_t) * grads
+ noise_pred = noise_pred_original - paddle.sqrt(x=beta_prod_t) * grads
return noise_pred, latents
@paddle.no_grad()
def __call__(
- self,
- style_image: Union[paddle.Tensor, PIL.Image.Image],
- content_image: Union[paddle.Tensor, PIL.Image.Image],
- style_prompt: Optional[str]=None,
- content_prompt: Optional[str]=None,
- negative_prompt=None,
- height: Optional[int]=512,
- width: Optional[int]=512,
- noise_strength: float=0.6,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- batch_size: Optional[int]=1,
- eta: float=0.0,
- clip_guidance_scale: Optional[float]=100,
- generator: Optional[paddle.Generator]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- slerp_latent_style_strength: float=0.8,
- slerp_prompt_style_strength: float=0.1,
- slerp_clip_image_style_strength: float=0.1, ):
+ self,
+ style_image: Union[paddle.Tensor, PIL.Image.Image],
+ content_image: Union[paddle.Tensor, PIL.Image.Image],
+ style_prompt: Optional[str] = None,
+ content_prompt: Optional[str] = None,
+ negative_prompt=None,
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ noise_strength: float = 0.6,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ batch_size: Optional[int] = 1,
+ eta: float = 0.0,
+ clip_guidance_scale: Optional[float] = 100,
+ generator: Optional[paddle.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ slerp_latent_style_strength: float = 0.8,
+ slerp_prompt_style_strength: float = 0.1,
+ slerp_clip_image_style_strength: float = 0.1,
+ ):
if isinstance(generator, list) and len(generator) != batch_size:
- raise ValueError(
- f"You have passed {batch_size} batch_size, but only {len(generator)} generators."
- )
+ raise ValueError(f"You have passed {batch_size} batch_size, but only {len(generator)} generators.")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
    # generate prompts with the blip model if the prompt is None
if content_prompt is None:
@@ -353,35 +341,32 @@ def __call__(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
- content_text_embeddings = self.text_encoder(
- content_text_input.input_ids)[0]
+ return_tensors="pd",
+ )
+ content_text_embeddings = self.text_encoder(content_text_input.input_ids)[0]
style_text_input = self.tokenizer(
style_prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
style_text_embeddings = self.text_encoder(style_text_input.input_ids)[0]
- text_embeddings = slerp(slerp_prompt_style_strength,
- content_text_embeddings, style_text_embeddings)
+ text_embeddings = slerp(slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings)
# duplicate text embeddings for each generation per prompt
- text_embeddings = text_embeddings.repeat_interleave(
- repeats=batch_size, axis=0)
+ text_embeddings = text_embeddings.repeat_interleave(repeats=batch_size, axis=0)
# set timesteps
- accepts_offset = "offset" in set(
- inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+ accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
extra_set_kwargs = {}
if accepts_offset:
extra_set_kwargs["offset"] = 1
self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
# Some schedulers like PNDM have timesteps as arrays
# It's more optimized to move all timesteps to correct device beforehand
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- noise_strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, noise_strength)
latent_timestep = timesteps[:1].tile(repeat_times=[batch_size])
# Preprocess image
@@ -391,25 +376,25 @@ def __call__(
latent_timestep,
batch_size,
text_embeddings.dtype,
- generator, )
+ generator,
+ )
preprocessed_style_image = preprocess(style_image, width, height)
style_latents = self.prepare_latents(
preprocessed_style_image,
latent_timestep,
batch_size,
text_embeddings.dtype,
- generator, )
- latents = slerp(slerp_latent_style_strength, content_latents,
- style_latents)
+ generator,
+ )
+ latents = slerp(slerp_latent_style_strength, content_latents, style_latents)
if clip_guidance_scale > 0:
- content_clip_image_embedding = self.get_clip_image_embeddings(
- content_image, batch_size)
- style_clip_image_embedding = self.get_clip_image_embeddings(
- style_image, batch_size)
+ content_clip_image_embedding = self.get_clip_image_embeddings(content_image, batch_size)
+ style_clip_image_embedding = self.get_clip_image_embeddings(style_image, batch_size)
clip_image_embeddings = slerp(
slerp_clip_image_style_strength,
content_clip_image_embedding,
- style_clip_image_embedding, )
+ style_clip_image_embedding,
+ )
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -426,17 +411,16 @@ def __call__(
uncond_tokens,
padding="max_length",
max_length=max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
# duplicate unconditional embeddings for each generation per prompt
- uncond_embeddings = uncond_embeddings.repeat_interleave(
- repeats=batch_size, axis=0)
+ uncond_embeddings = uncond_embeddings.repeat_interleave(repeats=batch_size, axis=0)
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings = paddle.concat(
- x=[uncond_embeddings, text_embeddings])
+ text_embeddings = paddle.concat(x=[uncond_embeddings, text_embeddings])
# get the initial random noise unless the user supplied it
@@ -451,13 +435,10 @@ def __call__(
]
latents_dtype = text_embeddings.dtype
if latents is None:
- latents = paddle.randn(
- shape=latents_shape, generator=generator, dtype=latents_dtype)
+ latents = paddle.randn(shape=latents_shape, generator=generator, dtype=latents_dtype)
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
@@ -466,41 +447,34 @@ def __call__(
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
# with self.progress_bar(total=num_inference_steps):
for i, t in tqdm(enumerate(timesteps)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat(x=[latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform classifier free guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# perform clip guidance
if clip_guidance_scale > 0:
text_embeddings_for_guidance = (
- text_embeddings.chunk(chunks=2)[1]
- if do_classifier_free_guidance else text_embeddings)
+ text_embeddings.chunk(chunks=2)[1] if do_classifier_free_guidance else text_embeddings
+ )
noise_pred, latents = self.cond_fn(
latents,
t,
@@ -508,23 +482,21 @@ def __call__(
text_embeddings_for_guidance,
noise_pred,
clip_image_embeddings,
- clip_guidance_scale, )
+ clip_guidance_scale,
+ )
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
        # Hardcode 0.18215 because stable-diffusion-2-base does not have self.vae.config.scaling_factor
latents = 1 / 0.18215 * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clip(min=0, max=1)
image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy()
- image, has_nsfw_concept = self.run_safety_checker(image,
- text_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype)
if output_type == "pil":
image = self.numpy_to_pil(image)
if not return_dict:
return image, None
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
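
Note on the math used by cond_fn above (illustrative only, not part of the diff): each guided step forms a DDIM-style estimate of the clean latent from the current noise prediction, decodes and CLIP-embeds it, and penalizes the spherical distance to the reference embedding. A minimal NumPy sketch of those two helpers, assuming L2-normalizable embedding arrays:

import numpy as np

def spherical_dist_loss_ref(x, y):
    # Same quantity as the chained Paddle expression above:
    # 2 * arcsin(||x_hat - y_hat|| / 2) ** 2 for L2-normalized x, y,
    # i.e. half the squared geodesic angle between the two embeddings.
    x = x / np.linalg.norm(x, axis=-1, keepdims=True)
    y = y / np.linalg.norm(y, axis=-1, keepdims=True)
    return 2.0 * np.arcsin(np.linalg.norm(x - y, axis=-1) / 2.0) ** 2

def predicted_x0_ref(latents, noise_pred, alpha_prod_t):
    # "Predicted x_0" of formula (12) in https://arxiv.org/pdf/2010.02502.pdf,
    # as used in cond_fn: x0 = (x_t - sqrt(1 - a_t) * eps) / sqrt(a_t).
    beta_prod_t = 1.0 - alpha_prod_t
    return (latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5
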
diff --git a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py
index ee8e0cac04537..f23f5d60b2eee 100644
--- a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py
+++ b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py
@@ -20,14 +20,22 @@
from paddle import nn
from paddle.nn import functional as F
from paddle.vision import transforms
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPModel,
- CLIPTextModel, CLIPTokenizer)
-
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline,
- LMSDiscreteScheduler, PNDMScheduler,
- UNet2DConditionModel)
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
+from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPModel,
+ CLIPTextModel,
+ CLIPTokenizer,
+)
+
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DiffusionPipeline,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from ppdiffusers.utils import logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -46,13 +54,10 @@ def forward(self, pixel_values, num_cutouts):
min_size = min(sideX, sideY, self.cut_size)
cutouts = []
for _ in range(num_cutouts):
- size = int(
- paddle.rand((1, ))**self.cut_power * (max_size - min_size) +
- min_size)
- offsetx = int(paddle.randint(0, sideX - size + 1, (1, )))
- offsety = int(paddle.randint(0, sideY - size + 1, (1, )))
- cutout = pixel_values[:, :, offsety:offsety + size, offsetx:offsetx
- + size]
+ size = int(paddle.rand((1,)) ** self.cut_power * (max_size - min_size) + min_size)
+ offsetx = int(paddle.randint(0, sideX - size + 1, (1,)))
+ offsety = int(paddle.randint(0, sideY - size + 1, (1,)))
+ cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size]
cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
return paddle.concat(cutouts)
@@ -75,15 +80,15 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- clip_model: CLIPModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[PNDMScheduler, LMSDiscreteScheduler,
- DDIMScheduler],
- feature_extractor: CLIPFeatureExtractor, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ clip_model: CLIPModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
+ feature_extractor: CLIPFeatureExtractor,
+ ):
super().__init__()
self.register_modules(
vae=vae,
@@ -92,20 +97,21 @@ def __init__(
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
- feature_extractor=feature_extractor, )
-
- self.normalize = transforms.Normalize(
- mean=feature_extractor.image_mean, std=feature_extractor.image_std)
- self.cut_out_size = (feature_extractor.size
- if isinstance(feature_extractor.size, int) else
- feature_extractor.size["shortest_edge"])
+ feature_extractor=feature_extractor,
+ )
+
+ self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+ self.cut_out_size = (
+ feature_extractor.size
+ if isinstance(feature_extractor.size, int)
+ else feature_extractor.size["shortest_edge"]
+ )
self.make_cutouts = MakeCutouts(self.cut_out_size)
set_stop_gradient(self.text_encoder, True)
set_stop_gradient(self.clip_model, True)
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
if slice_size == "auto":
# half the attention head size is usually a good trade-off between
# speed and memory
@@ -128,16 +134,17 @@ def unfreeze_unet(self):
set_stop_gradient(self.unet, False)
def cond_fn(
- self,
- latents,
- timestep,
- index,
- text_embeddings,
- noise_pred_original,
- text_embeddings_clip,
- clip_guidance_scale,
- num_cutouts,
- use_cutouts=True, ):
+ self,
+ latents,
+ timestep,
+ index,
+ text_embeddings,
+ noise_pred_original,
+ text_embeddings_clip,
+ clip_guidance_scale,
+ num_cutouts,
+ use_cutouts=True,
+ ):
# https://github.com/PaddlePaddle/Paddle/issues/54306 in 2.5rc paddle.set_grad_enabled has bug
with paddle.set_grad_enabled(True):
latents = latents.detach()
@@ -146,24 +153,19 @@ def cond_fn(
if isinstance(self.scheduler, LMSDiscreteScheduler):
sigma = self.scheduler.sigmas[index]
# the model input needs to be scaled to match the continuous ODE formulation in K-LMS
- latent_model_input = latents / ((sigma**2 + 1)**0.5)
+ latent_model_input = latents / ((sigma**2 + 1) ** 0.5)
else:
latent_model_input = latents
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- timestep,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)):
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
beta_prod_t = 1 - alpha_prod_t
# compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_original_sample = (
- latents - beta_prod_t**
- (0.5) * noise_pred) / alpha_prod_t**(0.5)
+ pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
fac = paddle.sqrt(beta_prod_t)
sample = pred_original_sample * (fac) + latents * (1 - fac)
@@ -171,8 +173,7 @@ def cond_fn(
sigma = self.scheduler.sigmas[index]
sample = latents - sigma * noise_pred
else:
- raise ValueError(
- f"scheduler type {type(self.scheduler)} not supported")
+ raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
sample = 1 / 0.18215 * sample
image = self.vae.decode(sample).sample
@@ -182,23 +183,18 @@ def cond_fn(
image = self.make_cutouts(image, num_cutouts)
else:
resize_transform = transforms.Resize(self.cut_out_size)
- image = paddle.stack(
- [resize_transform(img) for img in image], axis=0)
+ image = paddle.stack([resize_transform(img) for img in image], axis=0)
image = self.normalize(image).astype(latents.dtype)
image_embeddings_clip = self.clip_model.get_image_features(image)
- image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(
- p=2, axis=-1, keepdim=True)
+ image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True)
if use_cutouts:
- dists = spherical_dist_loss(image_embeddings_clip,
- text_embeddings_clip)
+ dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
dists = dists.reshape([num_cutouts, sample.shape[0], -1])
loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
else:
- loss = (spherical_dist_loss(image_embeddings_clip,
- text_embeddings_clip).mean() *
- clip_guidance_scale)
+ loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale
grads = -paddle.autograd.grad(loss, latents)[0]
@@ -206,52 +202,49 @@ def cond_fn(
latents = latents.detach() + grads * (sigma**2)
noise_pred = noise_pred_original
else:
- noise_pred = noise_pred_original - paddle.sqrt(
- beta_prod_t) * grads
+ noise_pred = noise_pred_original - paddle.sqrt(beta_prod_t) * grads
return noise_pred, latents
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- height: Optional[int]=512,
- width: Optional[int]=512,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- clip_guidance_scale: Optional[float]=100,
- clip_prompt: Optional[Union[str, List[str]]]=None,
- num_cutouts: Optional[int]=4,
- use_cutouts: Optional[bool]=True,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ clip_guidance_scale: Optional[float] = 100,
+ clip_prompt: Optional[Union[str, List[str]]] = None,
+ num_cutouts: Optional[int] = 4,
+ use_cutouts: Optional[bool] = True,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
if isinstance(prompt, str):
batch_size = 1
elif isinstance(prompt, list):
batch_size = len(prompt)
else:
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# get prompt text embeddings
text_inputs = self.tokenizer(
@@ -259,26 +252,25 @@ def __call__(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
- removed_text = self.tokenizer.batch_decode(
- text_input_ids[:, self.tokenizer.model_max_length:])
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
attention_mask = paddle.ones_like(text_input_ids)
- text_embeddings = self.text_encoder(
- text_input_ids, attention_mask=attention_mask)[0]
+ text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0]
# duplicate text embeddings for each generation per prompt
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1])
- text_embeddings = text_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if clip_guidance_scale > 0:
if clip_prompt is not None:
@@ -287,19 +279,16 @@ def __call__(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
else:
clip_text_input_ids = text_inputs.input_ids
- text_embeddings_clip = self.clip_model.get_text_features(
- clip_text_input_ids)
- text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(
- p=2, axis=-1, keepdim=True)
+ text_embeddings_clip = self.clip_model.get_text_features(clip_text_input_ids)
+ text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, axis=-1, keepdim=True)
# duplicate text embeddings clip for each generation per prompt
bs_embed, _ = text_embeddings_clip.shape
- text_embeddings_clip = text_embeddings_clip.tile(
- [1, num_images_per_prompt])
- text_embeddings_clip = text_embeddings_clip.reshape(
- [bs_embed * num_images_per_prompt, -1])
+ text_embeddings_clip = text_embeddings_clip.tile([1, num_images_per_prompt])
+ text_embeddings_clip = text_embeddings_clip.reshape([bs_embed * num_images_per_prompt, -1])
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -313,14 +302,16 @@ def __call__(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -330,23 +321,20 @@ def __call__(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = paddle.ones_like(uncond_input.input_ids)
- uncond_embeddings = self.text_encoder(
- uncond_input.input_ids, attention_mask=attention_mask)[0]
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0]
# duplicate unconditional embeddings for each generation per prompt
seq_len = uncond_embeddings.shape[1]
- uncond_embeddings = uncond_embeddings.tile(
- [batch_size, num_images_per_prompt, 1])
- uncond_embeddings = uncond_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1])
+ uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
# get the initial random noise unless the user supplied it
@@ -360,13 +348,10 @@ def __call__(
width // 8,
]
if latents is None:
- latents = paddle.randn(
- latents_shape, generator=generator, dtype=text_embeddings.dtype)
+ latents = paddle.randn(latents_shape, generator=generator, dtype=text_embeddings.dtype)
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -382,41 +367,34 @@ def __call__(
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform classifier free guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# perform clip guidance
if clip_guidance_scale > 0:
- text_embeddings_for_guidance = (text_embeddings.chunk(2)[1]
- if do_classifier_free_guidance
- else text_embeddings)
+ text_embeddings_for_guidance = (
+ text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
+ )
noise_pred, latents = self.cond_fn(
latents,
t,
@@ -426,11 +404,11 @@ def __call__(
text_embeddings_clip,
clip_guidance_scale,
num_cutouts,
- use_cutouts, )
+ use_cutouts,
+ )
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -449,5 +427,4 @@ def __call__(
if not return_dict:
return (image, None)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=None)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
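
Note on the denoising loop in this file (illustrative only, not part of the diff): each step applies classifier-free guidance by mixing the unconditional and text-conditioned noise predictions, and, when clip_guidance_scale > 0, additionally shifts the prediction by the CLIP gradient from cond_fn; MakeCutouts feeds that gradient by taking random square crops (side lengths drawn via a power-law between min_size and max_size) and pooling each to the CLIP input size. A minimal sketch of the classifier-free guidance combination, written against NumPy arrays:

import numpy as np

def cfg_combine(noise_pred_uncond, noise_pred_text, guidance_scale):
    # Classifier-free guidance as used in the loop above:
    # eps_hat = eps_uncond + s * (eps_text - eps_uncond)
    #         = (1 - s) * eps_uncond + s * eps_text.
    # guidance_scale == 1 reproduces the plain conditional prediction.
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
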
diff --git a/ppdiffusers/examples/community/composable_stable_diffusion.py b/ppdiffusers/examples/community/composable_stable_diffusion.py
index f3ff012a945f0..74e7f3856fdb6 100644
--- a/ppdiffusers/examples/community/composable_stable_diffusion.py
+++ b/ppdiffusers/examples/community/composable_stable_diffusion.py
@@ -16,18 +16,16 @@
from typing import Callable, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
-from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
+from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ppdiffusers.utils import deprecate, logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -62,30 +60,26 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
@@ -107,10 +101,10 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
@@ -139,24 +133,25 @@ def disable_attention_slicing(self):
@paddle.no_grad()
def __call__(
- self,
- prompt: str,
- height: Optional[int]=512,
- width: Optional[int]=512,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: str=None,
- # num_images_per_prompt: Optional[int] = 1,
- eta: Optional[float]=0.0,
- seed: Optional[int]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- weights: Optional[str]="",
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- reduce_memory: Optional[bool]=True,
- **kwargs, ):
+ self,
+ prompt: str,
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: str = None,
+ # num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ seed: Optional[int] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ weights: Optional[str] = "",
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ reduce_memory: Optional[bool] = True,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -215,24 +210,20 @@ def __call__(
if isinstance(prompt, str):
batch_size = 1
else:
- raise ValueError(
- f"`prompt` has to be of type `str`but is {type(prompt)}")
+ raise ValueError(f"`prompt` has to be of type `str`but is {type(prompt)}")
if negative_prompt is not None and not isinstance(negative_prompt, str):
- raise ValueError(
- f"`negative_prompt` has to be of type `str`but is {type(prompt)}"
- )
+ raise ValueError(f"`negative_prompt` has to be of type `str`but is {type(prompt)}")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if "|" in prompt:
prompt = [x.strip() for x in prompt.split("|")]
@@ -244,19 +235,19 @@ def __call__(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
- removed_text = self.tokenizer.batch_decode(
- text_input_ids[:, self.tokenizer.model_max_length:])
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
attention_mask = paddle.ones_like(text_input_ids)
- text_embeddings = self.text_encoder(
- text_input_ids, attention_mask=attention_mask)[0]
+ text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0]
# duplicate text embeddings for each generation per prompt, using mps friendly method
# bs_embed, seq_len, _ = text_embeddings.shape
@@ -268,20 +259,17 @@ def __call__(
# specify weights for prompts (excluding the unconditional score)
print("using equal weights for all prompts...")
pos_weights = paddle.to_tensor(
- [1 / (text_embeddings.shape[0] - 1)] *
- (text_embeddings.shape[0] - 1)).reshape([-1, 1, 1, 1])
+ [1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1)
+ ).reshape([-1, 1, 1, 1])
neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1])
- mask = paddle.to_tensor(
- [False] + [True] * pos_weights.shape[0], dtype=paddle.bool)
+ mask = paddle.to_tensor([False] + [True] * pos_weights.shape[0], dtype=paddle.bool)
else:
# set prompt weight for each
num_prompts = len(prompt) if isinstance(prompt, list) else 1
weights = [float(w.strip()) for w in weights.split("|")]
if len(weights) < num_prompts:
weights.append(1.0)
- assert (
- len(weights) == text_embeddings.shape[0]
- ), "weights specified are not equal to the number of prompts"
+ assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts"
pos_weights = []
neg_weights = []
mask = [] # first one is unconditional score
@@ -296,8 +284,7 @@ def __call__(
pos_weights = paddle.to_tensor(pos_weights).reshape([-1, 1, 1, 1])
pos_weights = pos_weights / pos_weights.sum()
if neg_weights:
- neg_weights = paddle.to_tensor(neg_weights).reshape(
- [-1, 1, 1, 1])
+ neg_weights = paddle.to_tensor(neg_weights).reshape([-1, 1, 1, 1])
neg_weights = neg_weights / neg_weights.sum()
mask = paddle.to_tensor(mask, dtype=paddle.bool)
@@ -320,10 +307,10 @@ def __call__(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = paddle.ones_like(uncond_input.input_ids)
- uncond_embeddings = self.text_encoder(
- uncond_input.input_ids, attention_mask=attention_mask)[0]
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0]
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
# seq_len = uncond_embeddings.shape[1]
@@ -335,31 +322,25 @@ def __call__(
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
# update negative weights
neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1])
- mask = paddle.to_tensor(
- [False] + mask.tolist(), dtype=paddle.bool)
+ mask = paddle.to_tensor([False] + mask.tolist(), dtype=paddle.bool)
# get the initial random noise unless the user supplied it
# Unlike in other pipelines, latents need to be generated in the target device
# for 1-to-1 results reproducibility with the CompVis implementation.
# However this currently doesn't work in `mps`.
- latents_shape = [
- batch_size, self.unet.in_channels, height // 8, width // 8
- ]
+ latents_shape = [batch_size, self.unet.in_channels, height // 8, width // 8]
if latents is None:
if seed is not None:
paddle.seed(seed)
latents = paddle.randn(latents_shape, dtype=text_embeddings.dtype)
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -375,8 +356,7 @@ def __call__(
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
@@ -384,47 +364,34 @@ def __call__(
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
latent_model_input = (
- paddle.concat([latents] * text_embeddings.shape[0])
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ paddle.concat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents
+ )
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
if reduce_memory:
# reduce memory by predicting each score sequentially
noise_preds = []
# predict the noise residual
for latent_in, text_embedding_in in zip(
- latent_model_input.chunk(
- latent_model_input.shape[0], axis=0),
- text_embeddings.chunk(
- text_embeddings.shape[0], axis=0), ):
- noise_preds.append(
- self.unet(
- latent_in,
- t,
- encoder_hidden_states=text_embedding_in).sample)
+ latent_model_input.chunk(latent_model_input.shape[0], axis=0),
+ text_embeddings.chunk(text_embeddings.shape[0], axis=0),
+ ):
+ noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample)
noise_preds = paddle.concat(noise_preds, axis=0)
else:
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
mask_index = paddle.nonzero(mask).reshape([-1])
non_mask_index = paddle.nonzero(~mask).reshape([-1])
- noise_pred_uncond = (noise_preds[non_mask_index] *
- neg_weights).sum(axis=0, keepdim=True)
- noise_pred_text = (noise_preds[mask_index] * pos_weights).sum(
- axis=0, keepdim=True)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond = (noise_preds[non_mask_index] * neg_weights).sum(axis=0, keepdim=True)
+ noise_pred_text = (noise_preds[mask_index] * pos_weights).sum(axis=0, keepdim=True)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -441,12 +408,11 @@ def __call__(
# run safety checker
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.astype(
- text_embeddings.dtype), )
+ clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype),
+ )
else:
has_nsfw_concept = None
@@ -456,5 +422,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
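
Note on ComposableStableDiffusionPipeline (illustrative only, not part of the diff): the prompt string is split on "|" into sub-prompts, each is encoded separately, and their noise predictions are combined with per-prompt weights (positive weights are renormalized to sum to 1) before the usual guidance against the negative/unconditional score, i.e. eps_hat = eps_neg + s * (sum_i w_i * eps_i - eps_neg). A hypothetical call, assuming the pipeline has already been instantiated as pipe (the loading code is not shown in this diff):

# Weights are "|"-separated like the prompt; if omitted, equal weights are used.
result = pipe(
    prompt="a red sports car | a snowy mountain road",
    weights="0.7 | 0.3",
    guidance_scale=7.5,
    num_inference_steps=50,
    seed=1234,
)
image = result.images[0]
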
diff --git a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py
index 1a244474fba03..87cee7e93a914 100644
--- a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py
+++ b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py
@@ -33,15 +33,14 @@ def image_grid(imgs, rows, cols):
def create_clip_guided_pipeline(
- model_id="CompVis/stable-diffusion-v1-4",
- clip_model_id="openai/clip-vit-large-patch14",
- scheduler="plms", ):
- pipeline = StableDiffusionPipeline.from_pretrained(
- model_id, paddle_dtype=paddle.float16)
+ model_id="CompVis/stable-diffusion-v1-4",
+ clip_model_id="openai/clip-vit-large-patch14",
+ scheduler="plms",
+):
+ pipeline = StableDiffusionPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16)
if scheduler == "lms":
- scheduler = LMSDiscreteScheduler(
- beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+ scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
else:
scheduler = pipeline.scheduler
@@ -55,26 +54,28 @@ def create_clip_guided_pipeline(
text_encoder=pipeline.text_encoder,
scheduler=scheduler,
clip_model=clip_model,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
return guided_pipeline
def infer(
- prompt,
- clip_prompt,
- num_return_images=1,
- num_images_per_prompt=1,
- num_inference_steps=50,
- clip_guidance_scale=100,
- guidance_scale=7.5,
- guided_pipeline=None,
- negative_prompt="",
- use_cutouts=True,
- num_cutouts=4,
- seed=None,
- unfreeze_unet=True,
- unfreeze_vae=True, ):
+ prompt,
+ clip_prompt,
+ num_return_images=1,
+ num_images_per_prompt=1,
+ num_inference_steps=50,
+ clip_guidance_scale=100,
+ guidance_scale=7.5,
+ guided_pipeline=None,
+ negative_prompt="",
+ use_cutouts=True,
+ num_cutouts=4,
+ seed=None,
+ unfreeze_unet=True,
+ unfreeze_vae=True,
+):
clip_prompt = clip_prompt if clip_prompt.strip() != "" else None
if unfreeze_unet:
guided_pipeline.unfreeze_unet()
@@ -98,7 +99,8 @@ def infer(
num_cutouts=num_cutouts,
use_cutouts=use_cutouts,
seed=seed,
- num_images_per_prompt=num_images_per_prompt, ).images
+ num_images_per_prompt=num_images_per_prompt,
+ ).images
images.extend(image)
return image_grid(images, 1, len(images))
@@ -141,6 +143,7 @@ def infer(
num_cutouts=num_cutouts,
seed=seed,
unfreeze_unet=unfreeze_unet,
- unfreeze_vae=unfreeze_vae, )
+ unfreeze_vae=unfreeze_vae,
+ )
display(grid_image)
diff --git a/ppdiffusers/examples/community/interpolate_stable_diffusion.py b/ppdiffusers/examples/community/interpolate_stable_diffusion.py
index 82ed3fbc72ad5..d826aad5ac9fb 100644
--- a/ppdiffusers/examples/community/interpolate_stable_diffusion.py
+++ b/ppdiffusers/examples/community/interpolate_stable_diffusion.py
@@ -20,18 +20,16 @@
import numpy as np
import paddle
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
-from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
+from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ppdiffusers.utils import deprecate, logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -90,31 +88,27 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
@@ -136,10 +130,10 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
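
A minimal NumPy sketch of the idea behind attention slicing, assuming a toy scaled-dot-product attention and a made-up slice size: the computation is run over a few heads at a time rather than all at once, trading some speed for a smaller peak memory footprint.

import numpy as np

def toy_attention(q, k, v):
    # plain scaled dot-product attention over a (heads, seq, dim) slice
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(q.shape[-1])
    weights = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)
    return weights @ v

rng = np.random.default_rng(0)
q = k = v = rng.standard_normal((8, 16, 4))  # (heads, seq_len, head_dim)

slice_size = 2  # analogous to the `slice_size` argument above
out = np.concatenate(
    [toy_attention(q[i : i + slice_size], k[i : i + slice_size], v[i : i + slice_size])
     for i in range(0, q.shape[0], slice_size)]
)
assert out.shape == (8, 16, 4)
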
@@ -166,23 +160,24 @@ def disable_attention_slicing(self):
@paddle.no_grad()
def __call__(
- self,
- prompt: Optional[Union[str, List[str]]]=None,
- height: int=512,
- width: int=512,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- text_embeddings: Optional[paddle.Tensor]=None,
- **kwargs, ):
+ self,
+ prompt: Optional[Union[str, List[str]]] = None,
+ height: int = 512,
+ width: int = 512,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ text_embeddings: Optional[paddle.Tensor] = None,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
Args:
@@ -240,16 +235,15 @@ def __call__(
"""
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if text_embeddings is None:
if isinstance(prompt, str):
@@ -257,37 +251,33 @@ def __call__(
elif isinstance(prompt, list):
batch_size = len(prompt)
else:
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
# get prompt text embeddings
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
- removed_text = self.tokenizer.batch_decode(
- text_input_ids[:, self.tokenizer.model_max_length:])
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
print(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :
- self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
attention_mask = paddle.ones_like(text_input_ids)
- text_embeddings = self.text_encoder(
- text_input_ids, attention_mask=attention_mask)[0]
+ text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0]
else:
batch_size = text_embeddings.shape[0]
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1])
- text_embeddings = text_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
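
For illustration, a tiny NumPy sketch (made-up values) of how that guidance weight is applied further down in the denoising loop: the unconditional and conditional noise predictions are combined, and `guidance_scale = 1` reduces to the purely conditional prediction.

import numpy as np

noise_pred_uncond = np.array([0.10, -0.20, 0.05])  # stand-in for the uncond half of the batch
noise_pred_text = np.array([0.30, -0.10, 0.25])    # stand-in for the text-conditioned half
guidance_scale = 7.5                               # w in Imagen eq. (2)

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# with guidance_scale == 1 this collapses to noise_pred_text (no extra guidance)
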
@@ -301,14 +291,16 @@ def __call__(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -318,23 +310,20 @@ def __call__(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = paddle.ones_like(uncond_input.input_ids)
- uncond_embeddings = self.text_encoder(
- uncond_input.input_ids, attention_mask=attention_mask)[0]
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0]
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = uncond_embeddings.shape[1]
- uncond_embeddings = uncond_embeddings.tile(
- [batch_size, num_images_per_prompt, 1])
- uncond_embeddings = uncond_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1])
+ uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
# get the initial random noise unless the user supplied it
@@ -349,13 +338,10 @@ def __call__(
]
latents_dtype = text_embeddings.dtype
if latents is None:
- latents = paddle.randn(
- latents_shape, generator=generator, dtype=latents_dtype)
+ latents = paddle.randn(latents_shape, generator=generator, dtype=latents_dtype)
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
latents = latents
# set timesteps
@@ -372,33 +358,26 @@ def __call__(
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -413,12 +392,11 @@ def __call__(
image = image.transpose([0, 2, 3, 1]).astype("float32").numpy()
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.astype(
- text_embeddings.dtype), )
+ clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype),
+ )
else:
has_nsfw_concept = None
@@ -428,8 +406,7 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
def embed_text(self, text):
"""takes in text and turns it into text embeddings"""
@@ -438,7 +415,8 @@ def embed_text(self, text):
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
with paddle.no_grad():
embed = self.text_encoder(text_input.input_ids)[0]
return embed
@@ -448,21 +426,23 @@ def get_noise(self, seed, dtype=paddle.float32, height=512, width=512):
return paddle.randn(
(1, self.unet.in_channels, height // 8, width // 8),
generator=paddle.Generator().manual_seed(seed),
- dtype=dtype, )
+ dtype=dtype,
+ )
def walk(
- self,
- prompts: List[str],
- seeds: List[int],
- num_interpolation_steps: Optional[int]=6,
- output_dir: Optional[str]="./dreams",
- name: Optional[str]=None,
- batch_size: Optional[int]=1,
- height: Optional[int]=512,
- width: Optional[int]=512,
- guidance_scale: Optional[float]=7.5,
- num_inference_steps: Optional[int]=50,
- eta: Optional[float]=0.0, ) -> List[str]:
+ self,
+ prompts: List[str],
+ seeds: List[int],
+ num_interpolation_steps: Optional[int] = 6,
+ output_dir: Optional[str] = "./dreams",
+ name: Optional[str] = None,
+ batch_size: Optional[int] = 1,
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ guidance_scale: Optional[float] = 7.5,
+ num_inference_steps: Optional[int] = 50,
+ eta: Optional[float] = 0.0,
+ ) -> List[str]:
"""
Walks through a series of prompts and seeds, interpolating between them and saving the results to disk.
Args:
@@ -509,8 +489,7 @@ def walk(
frame_idx = 0
frame_filepaths = []
- for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:],
- seeds, seeds[1:]):
+ for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]):
# Embed Text
embed_a = self.embed_text(prompt_a)
embed_b = self.embed_text(prompt_b)
@@ -526,14 +505,10 @@ def walk(
noise = slerp(float(t), noise_a, noise_b)
embed = paddle.lerp(embed_a, embed_b, t)
- noise_batch = (noise if noise_batch is None else paddle.concat(
- [noise_batch, noise], axis=0))
- embeds_batch = (embed
- if embeds_batch is None else paddle.concat(
- [embeds_batch, embed], axis=0))
+ noise_batch = noise if noise_batch is None else paddle.concat([noise_batch, noise], axis=0)
+ embeds_batch = embed if embeds_batch is None else paddle.concat([embeds_batch, embed], axis=0)
- batch_is_ready = (embeds_batch.shape[0] == batch_size or
- i + 1 == T.shape[0])
+ batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0]
if batch_is_ready:
outputs = self(
latents=noise_batch,
@@ -542,12 +517,12 @@ def walk(
width=width,
guidance_scale=guidance_scale,
eta=eta,
- num_inference_steps=num_inference_steps, )
+ num_inference_steps=num_inference_steps,
+ )
noise_batch, embeds_batch = None, None
for image in outputs["images"]:
- frame_filepath = str(save_path /
- f"frame_{frame_idx:06d}.png")
+ frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png")
image.save(frame_filepath)
frame_filepaths.append(frame_filepath)
frame_idx += 1
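
The `walk` hunks above interpolate latent noise with spherical interpolation and prompt embeddings with `paddle.lerp`. The file's own `slerp` helper is not part of this diff; the following is a self-contained NumPy sketch of the standard slerp formula it is based on (threshold handling in the actual helper may differ).

import numpy as np

def slerp_sketch(t, v0, v1, dot_threshold=0.9995):
    # spherical interpolation between two flat vectors; falls back to lerp
    # when the endpoints are nearly colinear
    d = float(np.dot(v0 / np.linalg.norm(v0), v1 / np.linalg.norm(v1)))
    if abs(d) > dot_threshold:
        return (1.0 - t) * v0 + t * v1
    theta = np.arccos(d)
    return (np.sin((1.0 - t) * theta) * v0 + np.sin(t * theta) * v1) / np.sin(theta)

noise_a = np.random.default_rng(0).standard_normal(16)
noise_b = np.random.default_rng(1).standard_normal(16)
# six in-between noise vectors, analogous to num_interpolation_steps frames
frames = [slerp_sketch(t, noise_a, noise_b) for t in np.linspace(0.0, 1.0, 6)]
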
diff --git a/ppdiffusers/examples/community/lpw_stable_diffusion.py b/ppdiffusers/examples/community/lpw_stable_diffusion.py
index 6870f3e68508a..c52d942b0b5a4 100644
--- a/ppdiffusers/examples/community/lpw_stable_diffusion.py
+++ b/ppdiffusers/examples/community/lpw_stable_diffusion.py
@@ -19,17 +19,18 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from paddlemix.utils.tools import compare_version
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipelines.stable_diffusion import (
- StableDiffusionPipeline, StableDiffusionPipelineOutput)
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
-from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
+ StableDiffusionPipeline,
+ StableDiffusionPipelineOutput,
+)
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
+from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ppdiffusers.utils import logging
if compare_version(PIL.__version__, "9.1.0") >= 0:
@@ -55,7 +56,8 @@
[^\\()\[\]:]+|
:
""",
- re.X, )
+ re.X,
+)
def parse_prompt_attention(text):
@@ -144,9 +146,7 @@ def multiply_range(start_position, multiplier):
return res
-def get_prompts_with_weights(pipe: StableDiffusionPipeline,
- prompt: List[str],
- max_length: int):
+def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int):
r"""
Tokenize a list of prompts and return its tokens with weights of each token.
No padding, starting or ending token is included.
@@ -176,32 +176,20 @@ def get_prompts_with_weights(pipe: StableDiffusionPipeline,
tokens.append(text_token)
weights.append(text_weight)
if truncated:
- logger.warning(
- "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples"
- )
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
return tokens, weights
-def pad_tokens_and_weights(tokens,
- weights,
- max_length,
- bos,
- eos,
- pad,
- no_boseos_middle=True,
- chunk_length=77):
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
r"""
Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
"""
max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
- weights_length = (max_length if no_boseos_middle else
- max_embeddings_multiples * chunk_length)
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
for i in range(len(tokens)):
- tokens[i] = ([bos] + tokens[i] + [eos] + [pad] *
- (max_length - 2 - len(tokens[i])))
+ tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i]))
if no_boseos_middle:
- weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 -
- len(weights[i]))
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
else:
w = []
if len(weights[i]) == 0:
@@ -209,8 +197,7 @@ def pad_tokens_and_weights(tokens,
else:
for j in range(max_embeddings_multiples):
w.append(1.0) # weight for starting token in this chunk
- w += weights[i][j * (chunk_length - 2):min(
- len(weights[i]), (j + 1) * (chunk_length - 2))]
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
w.append(1.0) # weight for ending token in this chunk
w += [1.0] * (weights_length - len(w))
weights[i] = w[:]
@@ -219,10 +206,11 @@ def pad_tokens_and_weights(tokens,
def get_unweighted_text_embeddings(
- pipe: StableDiffusionPipeline,
- text_input: paddle.Tensor,
- chunk_length: int,
- no_boseos_middle: Optional[bool]=True, ):
+ pipe: StableDiffusionPipeline,
+ text_input: paddle.Tensor,
+ chunk_length: int,
+ no_boseos_middle: Optional[bool] = True,
+):
"""
When the length of tokens is a multiple of the capacity of the text encoder,
it should be split into chunks and sent to the text encoder individually.
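
A toy illustration of the chunk indexing used below, with a made-up chunk length (a real CLIP encoder uses 77): each chunk keeps two extra slots so the BOS/EOS positions can be re-filled before encoding.

chunk_length = 7                                   # pretend encoder capacity
token_ids = list(range(1, 18))                     # a padded 17-token "long prompt"
max_embeddings_multiples = (len(token_ids) - 2) // (chunk_length - 2)  # -> 3

chunks = [
    token_ids[i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2]
    for i in range(max_embeddings_multiples)
]
# each chunk has exactly chunk_length entries and overlaps its neighbour by 2 slots;
# in the pipeline every chunk is run through the text encoder separately and the
# per-chunk embeddings are concatenated back together afterwards
assert all(len(c) == chunk_length for c in chunks)
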
@@ -232,8 +220,7 @@ def get_unweighted_text_embeddings(
text_embeddings = []
for i in range(max_embeddings_multiples):
# extract the i-th chunk
- text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * (
- chunk_length - 2) + 2].clone()
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
# cover the head and the tail by the starting and the ending tokens
text_input_chunk[:, 0] = text_input[0, 0]
@@ -259,14 +246,15 @@ def get_unweighted_text_embeddings(
def get_weighted_text_embeddings(
- pipe: StableDiffusionPipeline,
- prompt: Union[str, List[str]],
- uncond_prompt: Optional[Union[str, List[str]]]=None,
- max_embeddings_multiples: Optional[int]=1,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- **kwargs, ):
+ pipe: StableDiffusionPipeline,
+ prompt: Union[str, List[str]],
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
+ max_embeddings_multiples: Optional[int] = 1,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ **kwargs,
+):
r"""
Prompts can be assigned with local weights using brackets. For example,
prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
@@ -290,24 +278,19 @@ def get_weighted_text_embeddings(
skip_weighting (`bool`, *optional*, defaults to `False`):
Skip the weighting. When the parsing is skipped, it is forced True.
"""
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
if isinstance(prompt, str):
prompt = [prompt]
if not skip_parsing:
- prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt,
- max_length - 2)
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
if uncond_prompt is not None:
if isinstance(uncond_prompt, str):
uncond_prompt = [uncond_prompt]
- uncond_tokens, uncond_weights = get_prompts_with_weights(
- pipe, uncond_prompt, max_length - 2)
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
else:
prompt_tokens = [
- token[1:-1]
- for token in pipe.tokenizer(
- prompt, max_length=max_length, truncation=True).input_ids
+ token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
]
prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
if uncond_prompt is not None:
@@ -315,33 +298,26 @@ def get_weighted_text_embeddings(
uncond_prompt = [uncond_prompt]
uncond_tokens = [
token[1:-1]
- for token in pipe.tokenizer(
- uncond_prompt, max_length=max_length, truncation=True)
- .input_ids
+ for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
]
uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
# round up the longest length of tokens to a multiple of (model_max_length - 2)
max_length = max([len(token) for token in prompt_tokens])
if uncond_prompt is not None:
- max_length = max(max_length,
- max([len(token) for token in uncond_tokens]))
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
max_embeddings_multiples = min(
max_embeddings_multiples,
- (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, )
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+ )
max_embeddings_multiples = max(1, max_embeddings_multiples)
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
# pad the length of tokens and weights
# support bert tokenizer
- bos = (pipe.tokenizer.bos_token_id
- if pipe.tokenizer.bos_token_id is not None else
- pipe.tokenizer.cls_token_id)
- eos = (pipe.tokenizer.eos_token_id
- if pipe.tokenizer.eos_token_id is not None else
- pipe.tokenizer.sep_token_id)
+ bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id
+ eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id
pad = pipe.tokenizer.pad_token_id
prompt_tokens, prompt_weights = pad_tokens_and_weights(
prompt_tokens,
@@ -351,7 +327,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64)
if uncond_prompt is not None:
uncond_tokens, uncond_weights = pad_tokens_and_weights(
@@ -362,7 +339,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64)
# get the embeddings
@@ -370,32 +348,28 @@ def get_weighted_text_embeddings(
pipe,
prompt_tokens,
pipe.tokenizer.model_max_length,
- no_boseos_middle=no_boseos_middle, )
- prompt_weights = paddle.to_tensor(
- prompt_weights, dtype=text_embeddings.dtype)
+ no_boseos_middle=no_boseos_middle,
+ )
+ prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype)
if uncond_prompt is not None:
uncond_embeddings = get_unweighted_text_embeddings(
pipe,
uncond_tokens,
pipe.tokenizer.model_max_length,
- no_boseos_middle=no_boseos_middle, )
- uncond_weights = paddle.to_tensor(
- uncond_weights, dtype=uncond_embeddings.dtype)
+ no_boseos_middle=no_boseos_middle,
+ )
+ uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype)
# assign weights to the prompts and normalize in the sense of mean
# TODO: should we normalize by chunk or in a whole (current implementation)?
if (not skip_parsing) and (not skip_weighting):
previous_mean = text_embeddings.mean(axis=[-2, -1])
text_embeddings *= prompt_weights.unsqueeze(-1)
- text_embeddings *= (
- (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1)
- .unsqueeze(-1))
+ text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
previous_mean = uncond_embeddings.mean(axis=[-2, -1])
uncond_embeddings *= uncond_weights.unsqueeze(-1)
- uncond_embeddings *= (
- (previous_mean / uncond_embeddings.mean(axis=[-2, -1]))
- .unsqueeze(-1).unsqueeze(-1))
+ uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
return text_embeddings, uncond_embeddings
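
A small NumPy sketch of the weighting step in the hunk above, with made-up embeddings: each token embedding is scaled by its prompt weight, then the result is rescaled so the overall mean matches the unweighted embeddings ("normalize in the sense of mean").

import numpy as np

text_embeddings = np.ones((1, 4, 3))               # (batch, tokens, dim), toy values
prompt_weights = np.array([[1.0, 1.1, 1.1, 1.0]])  # e.g. the "(very beautiful)" tokens emphasised

previous_mean = text_embeddings.mean(axis=(-2, -1))           # per-prompt mean before weighting
text_embeddings = text_embeddings * prompt_weights[..., None]  # scale each token embedding
text_embeddings *= (previous_mean / text_embeddings.mean(axis=(-2, -1)))[..., None, None]
# the emphasised tokens end up relatively larger while the overall mean is unchanged
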
@@ -416,8 +390,7 @@ def preprocess_mask(mask, scale_factor=8):
mask = mask.convert("L")
w, h = mask.size
w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
- mask = mask.resize(
- (w // scale_factor, h // scale_factor), resample=Resampling.NEAREST)
+ mask = mask.resize((w // scale_factor, h // scale_factor), resample=Resampling.NEAREST)
mask = np.array(mask).astype(np.float32) / 255.0
mask = np.tile(mask, (4, 1, 1))
mask = mask[None].transpose(0, 1, 2, 3) # what does this step do?
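
A rough NumPy sketch of what `preprocess_mask` produces, using naive stride-based downscaling as a stand-in for PIL's nearest resize: the mask is brought to latent resolution, copied across the four latent channels, and later applied as `init_latents_proper * mask + latents * (1 - mask)` in the denoising loop.

import numpy as np

mask = (np.random.default_rng(0).random((64, 64)) > 0.5).astype(np.float32)
latent_mask = mask[::8, ::8]                   # crude stand-in for the /8 nearest resize
latent_mask = np.tile(latent_mask, (4, 1, 1))  # one copy per latent channel
latent_mask = latent_mask[None]                # add the batch axis -> (1, 4, 8, 8)
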
@@ -454,16 +427,16 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- requires_safety_checker: Optional[bool]=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: Optional[bool] = True,
+ ):
super().__init__(
vae=vae,
text_encoder=text_encoder,
@@ -472,7 +445,8 @@ def __init__(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- requires_safety_checker=requires_safety_checker, )
+ requires_safety_checker=requires_safety_checker,
+ )
self.__init__additional__()
def __init__additional__(self):
@@ -480,10 +454,10 @@ def __init__additional__(self):
setattr(
self,
"vae_scale_factor",
- 2**(len(self.vae.config.block_out_channels) - 1), )
+ 2 ** (len(self.vae.config.block_out_channels) - 1),
+ )
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
@@ -510,34 +484,31 @@ def disable_attention_slicing(self):
def check_inputs(self, prompt, height, width, strength, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
def _encode_prompt(
- self,
- prompt: Union[str, List[str]],
- num_images_per_prompt: int,
- do_classifier_free_guidance: bool,
- negative_prompt: Union[str, List[str]],
- max_embeddings_multiples: Optional[int]=3,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ num_images_per_prompt: int,
+ do_classifier_free_guidance: bool,
+ negative_prompt: Union[str, List[str]],
+ max_embeddings_multiples: Optional[int] = 3,
+ **kwargs,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -564,28 +535,25 @@ def _encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
pipe=self,
prompt=prompt,
- uncond_prompt=negative_prompt
- if do_classifier_free_guidance else None,
+ uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
max_embeddings_multiples=max_embeddings_multiples,
- **kwargs, )
+ **kwargs,
+ )
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1])
- text_embeddings = text_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
seq_len = uncond_embeddings.shape[1]
- uncond_embeddings = uncond_embeddings.tile(
- [1, num_images_per_prompt, 1])
- uncond_embeddings = uncond_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1])
+ uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
return text_embeddings
@@ -602,29 +570,20 @@ def get_timesteps(self, num_inference_steps, strength, is_text2img):
timesteps = self.scheduler.timesteps[t_start:]
return timesteps, num_inference_steps - t_start
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- height,
- width,
- dtype,
- generator,
- latents=None):
+ def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None):
if image is None:
shape = (
batch_size,
self.unet.in_channels,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if latents is None:
latents = paddle.randn(shape, generator=generator, dtype=dtype)
else:
if latents.shape != shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
@@ -644,27 +603,28 @@ def prepare_latents(self,
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- negative_prompt: Optional[Union[str, List[str]]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=512,
- width: Optional[int]=512,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[int]=7.5,
- strength: Optional[int]=0.8,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[int]=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: Optional[bool]=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- is_cancelled_callback: Optional[Callable[[], bool]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[int] = 7.5,
+ strength: Optional[int] = 0.8,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[int] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: Optional[bool] = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
Args:
@@ -753,7 +713,8 @@ def __call__(
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
- max_embeddings_multiples, )
+ max_embeddings_multiples,
+ )
dtype = text_embeddings.dtype
# 4. Preprocess image and mask
@@ -765,17 +726,14 @@ def __call__(
mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
if mask_image is not None:
mask_image = mask_image.astype(dtype=dtype)
- mask = paddle.concat([mask_image] * batch_size *
- num_images_per_prompt)
+ mask = paddle.concat([mask_image] * batch_size * num_images_per_prompt)
else:
mask = None
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(
- num_inference_steps, strength, image is None)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
latents, init_latents_orig, noise = self.prepare_latents(
@@ -786,7 +744,8 @@ def __call__(
width,
dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -794,46 +753,37 @@ def __call__(
# 8. Denoising loop
for i, t in enumerate(self.progress_bar(timesteps)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
if mask is not None:
# masking
- init_latents_proper = self.scheduler.add_noise(
- init_latents_orig, noise, t)
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t)
latents = (init_latents_proper * mask) + (latents * (1 - mask))
# call the callback, if provided
if i % callback_steps == 0:
if callback is not None:
callback(i, t, latents)
- if is_cancelled_callback is not None and is_cancelled_callback(
- ):
+ if is_cancelled_callback is not None and is_cancelled_callback():
return None
# 9. Post-processing
image = self.decode_latents(latents)
# 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- text_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype)
# 11. Convert to PIL
if output_type == "pil":
@@ -842,28 +792,28 @@ def __call__(
if not return_dict:
return image, has_nsfw_concept
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
def text2img(
- self,
- prompt: Union[str, List[str]],
- negative_prompt: Optional[Union[str, List[str]]]=None,
- height: Optional[int]=512,
- width: Optional[int]=512,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[int]=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: Optional[bool]=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- is_cancelled_callback: Optional[Callable[[], bool]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[int] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: Optional[bool] = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function for text-to-image generation.
Args:
@@ -936,26 +886,28 @@ def text2img(
return_dict=return_dict,
callback=callback,
callback_steps=callback_steps,
- **kwargs, )
+ **kwargs,
+ )
def img2img(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- negative_prompt: Optional[Union[str, List[str]]]=None,
- strength: Optional[float]=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[paddle.Generator]=None,
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: Optional[bool]=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- is_cancelled_callback: Optional[Callable[[], bool]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ strength: Optional[float] = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: Optional[bool] = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function for image-to-image generation.
Args:
@@ -1029,27 +981,29 @@ def img2img(
callback=callback,
is_cancelled_callback=is_cancelled_callback,
callback_steps=callback_steps,
- **kwargs, )
+ **kwargs,
+ )
def inpaint(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- mask_image: Union[paddle.Tensor, PIL.Image.Image],
- negative_prompt: Optional[Union[str, List[str]]]=None,
- strength: Optional[float]=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[paddle.Generator]=None,
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: Optional[bool]=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- is_cancelled_callback: Optional[Callable[[], bool]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ mask_image: Union[paddle.Tensor, PIL.Image.Image],
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ strength: Optional[float] = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: Optional[bool] = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function for inpaint.
Args:
@@ -1124,4 +1078,5 @@ def inpaint(
return_dict=return_dict,
callback=callback,
callback_steps=callback_steps,
- **kwargs, )
+ **kwargs,
+ )
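
A hypothetical usage sketch for the pipeline above. The `custom_pipeline` loading path and the checkpoint name are assumptions for illustration, not something this diff establishes; the weighted-prompt syntax and the `max_embeddings_multiples` argument are the parts the file actually implements.

from ppdiffusers import DiffusionPipeline

# illustrative checkpoint name; any Stable Diffusion 1.x checkpoint layout would do
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="lpw_stable_diffusion",
)

image = pipe.text2img(
    "A (very beautiful) masterpiece, highly detailed",  # bracketed words get extra weight
    negative_prompt="lowres, blurry",
    max_embeddings_multiples=3,  # allow prompts up to 3 x 77 tokens
    num_inference_steps=50,
    guidance_scale=7.5,
).images[0]
image.save("lpw_text2img.png")
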
diff --git a/ppdiffusers/examples/community/mixture_tiling.py b/ppdiffusers/examples/community/mixture_tiling.py
index 62f8650648596..5ae0911810d10 100644
--- a/ppdiffusers/examples/community/mixture_tiling.py
+++ b/ppdiffusers/examples/community/mixture_tiling.py
@@ -23,17 +23,18 @@
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipeline_utils import DiffusionPipeline
from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
+from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ppdiffusers.utils import logging
try:
from ligo.segments import segment
- from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+ from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPTextModel,
+ CLIPTokenizer,
+ )
except ImportError:
- raise ImportError(
- "Please install paddlenlp and ligo-segments to use the mixture pipeline")
+ raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline")
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """
Examples:
@@ -61,8 +62,7 @@
"""
-def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height,
- tile_row_overlap, tile_col_overlap):
+def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap):
"""Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image
Returns a tuple with:
@@ -71,11 +71,9 @@ def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height,
- Starting coordinates of columns in pixel space
- Ending coordinates of columns in pixel space
"""
- px_row_init = 0 if tile_row == 0 else tile_row * (
- tile_height - tile_row_overlap)
+ px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap)
px_row_end = px_row_init + tile_height
- px_col_init = 0 if tile_col == 0 else tile_col * (
- tile_width - tile_col_overlap)
+ px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap)
px_col_end = px_col_init + tile_width
return px_row_init, px_row_end, px_col_init, px_col_end
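
A worked example of that mapping with the pipeline's default 512-pixel tiles and 256-pixel overlaps; dividing each coordinate by 8 gives the corresponding latent-space range used by `_tile2latent_indices`.

tile_width = tile_height = 512
tile_row_overlap = tile_col_overlap = 256

def tile2pixel(tile_row, tile_col):
    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap)
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap)
    return px_row_init, px_row_init + tile_height, px_col_init, px_col_init + tile_width

print(tile2pixel(0, 0))  # (0, 512, 0, 512)
print(tile2pixel(0, 1))  # (0, 512, 256, 768): shares a 256-pixel band with tile (0, 0)
print(tile2pixel(1, 1))  # (256, 768, 256, 768)
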
@@ -85,8 +83,7 @@ def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end):
return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8
-def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height,
- tile_row_overlap, tile_col_overlap):
+def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap):
"""Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image
Returns a tuple with:
@@ -96,15 +93,14 @@ def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height,
- Ending coordinates of columns in latent space
"""
px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices(
- tile_row, tile_col, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
- return _pixel2latent_indices(px_row_init, px_row_end, px_col_init,
- px_col_end)
+ tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
+ return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end)
-def _tile2latent_exclusive_indices(tile_row, tile_col, tile_width, tile_height,
- tile_row_overlap, tile_col_overlap, rows,
- columns):
+def _tile2latent_exclusive_indices(
+ tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns
+):
"""Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image
Returns a tuple with:
@@ -114,18 +110,17 @@ def _tile2latent_exclusive_indices(tile_row, tile_col, tile_width, tile_height,
- Ending coordinates of columns in latent space
"""
row_init, row_end, col_init, col_end = _tile2latent_indices(
- tile_row, tile_col, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
+ tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
row_segment = segment(row_init, row_end)
col_segment = segment(col_init, col_end)
# Iterate over the rest of tiles, clipping the region for the current tile
for row in range(rows):
for column in range(columns):
if row != tile_row and column != tile_col:
- (clip_row_init, clip_row_end, clip_col_init,
- clip_col_end) = _tile2latent_indices(
- row, column, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
+ (clip_row_init, clip_row_end, clip_col_init, clip_col_end) = _tile2latent_indices(
+ row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
row_segment = row_segment - segment(clip_row_init, clip_row_end)
col_segment = col_segment - segment(clip_col_init, clip_col_end)
# return row_init, row_end, col_init, col_end
@@ -151,17 +146,17 @@ def decode_latents(self, latents, cpu_vae=False):
return self.numpy_to_pil(image)
-class StableDiffusionTilingPipeline(DiffusionPipeline,
- StableDiffusionExtrasMixin):
+class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin):
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
super().__init__()
self.register_modules(
vae=vae,
@@ -170,7 +165,8 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
class SeedTilesMode(Enum):
"""Modes in which the latents of a particular tile can be re-seeded"""
@@ -180,22 +176,22 @@ class SeedTilesMode(Enum):
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[List[str]]],
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- eta: Optional[float]=0.0,
- seed: Optional[int]=None,
- tile_height: Optional[int]=512,
- tile_width: Optional[int]=512,
- tile_row_overlap: Optional[int]=256,
- tile_col_overlap: Optional[int]=256,
- guidance_scale_tiles: Optional[List[List[float]]]=None,
- seed_tiles: Optional[List[List[int]]]=None,
- seed_tiles_mode: Optional[Union[str, List[List[str]]]]="full",
- seed_reroll_regions: Optional[List[Tuple[int, int, int, int,
- int]]]=None,
- cpu_vae: Optional[bool]=False, ):
+ self,
+ prompt: Union[str, List[List[str]]],
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ eta: Optional[float] = 0.0,
+ seed: Optional[int] = None,
+ tile_height: Optional[int] = 512,
+ tile_width: Optional[int] = 512,
+ tile_row_overlap: Optional[int] = 256,
+ tile_col_overlap: Optional[int] = 256,
+ guidance_scale_tiles: Optional[List[List[float]]] = None,
+ seed_tiles: Optional[List[List[int]]] = None,
+ seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full",
+ seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None,
+ cpu_vae: Optional[bool] = False,
+ ):
"""
Function to run the diffusion pipeline with tiling support.
@@ -221,24 +217,18 @@ def __call__(
A PIL image with the generated image.
"""
- if not isinstance(prompt, list) or not all(
- isinstance(row, list) for row in prompt):
- raise ValueError(
- f"`prompt` has to be a list of lists but is {type(prompt)}")
+ if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt):
+ raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}")
grid_rows = len(prompt)
grid_cols = len(prompt[0])
if not all(len(row) == grid_cols for row in prompt):
- raise ValueError(
- "All prompt rows must have the same number of prompt columns")
+ raise ValueError("All prompt rows must have the same number of prompt columns")
if not isinstance(seed_tiles_mode, str) and (
- not isinstance(seed_tiles_mode, list) or
- not all(isinstance(row, list) for row in seed_tiles_mode)):
- raise ValueError(
- f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}"
- )
+ not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode)
+ ):
+ raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}")
if isinstance(seed_tiles_mode, str):
- seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))]
- for row in prompt]
+ seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt]
modes = [mode.value for mode in self.SeedTilesMode]
if any(mode not in modes for row in seed_tiles_mode for mode in row):
raise ValueError(f"Seed tiles mode must be one of {modes}")
@@ -247,11 +237,9 @@ def __call__(
batch_size = 1
# create original noisy latents using the timesteps
- height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap
- )
+ height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap)
width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap)
- latents_shape = (batch_size, self.unet.config.in_channels, height // 8,
- width // 8)
+ latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
generator = paddle.Generator().manual_seed(seed)
latents = paddle.randn(shape=latents_shape, generator=generator)
@@ -263,8 +251,8 @@ def __call__(
mode = seed_tiles_mode[row][col]
if mode == self.SeedTilesMode.FULL.value:
row_init, row_end, col_init, col_end = _tile2latent_indices(
- row, col, tile_width, tile_height,
- tile_row_overlap, tile_col_overlap)
+ row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
else:
row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices(
row,
@@ -274,29 +262,27 @@ def __call__(
tile_row_overlap,
tile_col_overlap,
grid_rows,
- grid_cols, )
- tile_generator = paddle.Generator().manual_seed(
- seed_tile)
- tile_shape = latents_shape[0], latents_shape[
- 1], row_end - row_init, col_end - col_init
- latents[:, :, row_init:row_end, col_init:
- col_end] = paddle.randn(
- shape=tile_shape, generator=tile_generator)
+ grid_cols,
+ )
+ tile_generator = paddle.Generator().manual_seed(seed_tile)
+ tile_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init
+ latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn(
+ shape=tile_shape, generator=tile_generator
+ )
# overwrite again for seed reroll regions
for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions:
row_init, row_end, col_init, col_end = _pixel2latent_indices(
- row_init, row_end, col_init,
- col_end) # to latent space coordinates
+ row_init, row_end, col_init, col_end
+ ) # to latent space coordinates
reroll_generator = paddle.Generator().manual_seed(seed_reroll)
- region_shape = latents_shape[0], latents_shape[
- 1], row_end - row_init, col_end - col_init
+ region_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init
latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn(
- shape=region_shape, generator=reroll_generator)
+ shape=region_shape, generator=reroll_generator
+ )
# Prepare scheduler
- accepts_offset = "offset" in set(
- inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+ accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
extra_set_kwargs = {}
if accepts_offset:
extra_set_kwargs["offset"] = 1
@@ -306,17 +292,20 @@ def __call__(
latents = latents * self.scheduler.sigmas[0]
# get prompts text embeddings
- text_input = [[
- self.tokenizer(
- col,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- truncation=True,
- return_tensors="pd", ) for col in row
- ] for row in prompt]
- text_embeddings = [[
- self.text_encoder(col.input_ids)[0] for col in row
- ] for row in text_input]
+ text_input = [
+ [
+ self.tokenizer(
+ col,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pd",
+ )
+ for col in row
+ ]
+ for row in prompt
+ ]
+ text_embeddings = [[self.text_encoder(col.input_ids)[0] for col in row] for row in text_input]
        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -328,32 +317,26 @@ def __call__(
for j in range(grid_cols):
max_length = text_input[i][j].input_ids.shape[-1]
uncond_input = self.tokenizer(
- [""] * batch_size,
- padding="max_length",
- max_length=max_length,
- return_tensors="pd")
- uncond_embeddings = self.text_encoder(
- uncond_input.input_ids)[0]
+ [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pd"
+ )
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings[i][j] = paddle.concat(
- x=[uncond_embeddings, text_embeddings[i][j]])
+ text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]])
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
        # Mask for tile weights strength
- tile_weights = self._gaussian_weights(tile_width, tile_height,
- batch_size)
+ tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)
# Diffusion timesteps
for i, t in tqdm(enumerate(self.scheduler.timesteps)):
@@ -363,33 +346,28 @@ def __call__(
noise_preds_row = []
for col in range(grid_cols):
px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices(
- row, col, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
- tile_latents = latents[:, :, px_row_init:px_row_end,
- px_col_init:px_col_end]
+ row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
+ tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end]
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat(x=[tile_latents] * 2)
- if do_classifier_free_guidance else
- tile_latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = (
+ paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents
+ )
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings[row][col])[
- "sample"]
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[
+ "sample"
+ ]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(
- chunks=2)
- guidance = (guidance_scale
- if guidance_scale_tiles is None or
- guidance_scale_tiles[row][col] is None else
- guidance_scale_tiles[row][col])
- noise_pred_tile = noise_pred_uncond + guidance * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
+ guidance = (
+ guidance_scale
+ if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None
+ else guidance_scale_tiles[row][col]
+ )
+ noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond)
noise_preds_row.append(noise_pred_tile)
noise_preds.append(noise_preds_row)
# Stitch noise predictions for all tiles
@@ -399,13 +377,12 @@ def __call__(
for row in range(grid_rows):
for col in range(grid_cols):
px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices(
- row, col, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
- noise_pred[:, :, px_row_init:px_row_end, px_col_init:
- px_col_end] += (noise_preds[row][col] *
- tile_weights)
- contributors[:, :, px_row_init:px_row_end, px_col_init:
- px_col_end] += tile_weights
+ row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
+ noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += (
+ noise_preds[row][col] * tile_weights
+ )
+ contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights
# Average overlapping areas with more than 1 contributor
noise_pred /= contributors
# compute the previous noisy sample x_t -> x_t-1
@@ -424,14 +401,16 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches):
latent_height = tile_height // 8
var = 0.01
midpoint = (latent_width - 1) / 2
- x_probs = [(exp(-(x - midpoint) * (x - midpoint) /
- (latent_width * latent_width) / (2 * var)) /
- sqrt(2 * pi * var)) for x in range(latent_width)]
+ x_probs = [
+ (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var))
+ for x in range(latent_width)
+ ]
midpoint = latent_height / 2
- y_probs = [(exp(-(y - midpoint) * (y - midpoint) /
- (latent_height * latent_height) / (2 * var)) /
- sqrt(2 * pi * var)) for y in range(latent_height)]
+ y_probs = [
+ (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var))
+ for y in range(latent_height)
+ ]
weights = np.outer(y_probs, x_probs)
return paddle.tile(
- x=paddle.to_tensor(data=weights),
- repeat_times=(nbatches, self.unet.config.in_channels, 1, 1))
+ x=paddle.to_tensor(data=weights), repeat_times=(nbatches, self.unet.config.in_channels, 1, 1)
+ )
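
A minimal standalone sketch of the tile blending that `_gaussian_weights` and the stitching loop above implement; the helper name and the toy 8x12 canvas are illustrative only, not part of this diff:

import numpy as np
from math import exp, pi, sqrt

def gaussian_tile_weights(latent_width, latent_height, var=0.01):
    # 1D Gaussian profiles along each latent axis, peaked at the tile center,
    # using the same formula as _gaussian_weights above.
    mid_x = (latent_width - 1) / 2
    x_probs = [exp(-((x - mid_x) ** 2) / (latent_width**2) / (2 * var)) / sqrt(2 * pi * var) for x in range(latent_width)]
    mid_y = latent_height / 2
    y_probs = [exp(-((y - mid_y) ** 2) / (latent_height**2) / (2 * var)) / sqrt(2 * pi * var) for y in range(latent_height)]
    # The outer product yields a 2D mask that down-weights tile borders.
    return np.outer(y_probs, x_probs)

# Toy stitching: two 8x8 tiles overlap by 4 columns on an 8x12 canvas.
weights = gaussian_tile_weights(8, 8)
canvas = np.zeros((8, 12))
contributors = np.zeros((8, 12))
tile_a, tile_b = np.full((8, 8), 1.0), np.full((8, 8), 3.0)
canvas[:, 0:8] += tile_a * weights
contributors[:, 0:8] += weights
canvas[:, 4:12] += tile_b * weights
contributors[:, 4:12] += weights
blended = canvas / contributors  # overlap columns fade smoothly from ~1.0 to ~3.0
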
diff --git a/ppdiffusers/examples/community/one_step_unet.py b/ppdiffusers/examples/community/one_step_unet.py
index 489cef26e01d8..5baffefdab061 100644
--- a/ppdiffusers/examples/community/one_step_unet.py
+++ b/ppdiffusers/examples/community/one_step_unet.py
@@ -24,15 +24,14 @@ def __init__(self, unet, scheduler):
self.register_modules(unet=unet, scheduler=scheduler)
def __call__(self):
- image = paddle.randn((1, self.unet.in_channels, self.unet.sample_size,
- self.unet.sample_size), )
+ image = paddle.randn(
+ (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+ )
timestep = 1
model_output = self.unet(image, timestep).sample
- scheduler_output = self.scheduler.step(model_output, timestep,
- image).prev_sample
+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
- result = (scheduler_output - scheduler_output +
- paddle.ones_like(scheduler_output))
+ result = scheduler_output - scheduler_output + paddle.ones_like(scheduler_output)
return result
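
For reference, the single-step pipeline above can be exercised directly with a tiny randomly initialised UNet; the model configuration and the choice of DDPMScheduler here are illustrative assumptions, not taken from this example:

import paddle
from ppdiffusers import DDPMScheduler, UNet2DModel

# A small random UNet keeps the single forward pass cheap to run.
unet = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "UpBlock2D"),
)
scheduler = DDPMScheduler()

image = paddle.randn((1, unet.config.in_channels, unet.config.sample_size, unet.config.sample_size))
timestep = 1
model_output = unet(image, timestep).sample
prev_sample = scheduler.step(model_output, timestep, image).prev_sample  # one denoising step
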
diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py
index 218ef8d7ab49c..b32b422bd47ae 100644
--- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py
+++ b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py
@@ -23,17 +23,17 @@
from ppdiffusers import DiffusionPipeline
from ppdiffusers.pipelines.fastdeploy_utils import (
- FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel)
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
+ FastDeployDiffusionPipelineMixin,
+ FastDeployRuntimeModel,
+)
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
from ppdiffusers.utils import logging, randn_tensor
logger = logging.get_logger(__name__)
-class FastStableDiffusionHiresFixPipeline(DiffusionPipeline,
- FastDeployDiffusionPipelineMixin):
+class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline for text-to-image generation with high resolution fixing(hires.fix) based on Stable Diffusion.
@@ -63,21 +63,20 @@ class FastStableDiffusionHiresFixPipeline(DiffusionPipeline,
feature_extractor ([`CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
- _optional_components = [
- "vae_encoder", "safety_checker", "feature_extractor"
- ]
+ _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -102,7 +101,8 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
@@ -111,7 +111,7 @@ def get_timesteps(self, denoising_steps, denoising_strength):
self.scheduler.set_timesteps(steps)
t_start = max(steps - denoising_steps, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
if hasattr(self.scheduler, "step_index_offset"):
self.scheduler.step_index_offset = t_start * self.scheduler.order
@@ -119,48 +119,45 @@ def get_timesteps(self, denoising_steps, denoising_strength):
return timesteps.cast("float32"), denoising_steps
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- hr_scale,
- hr_resize_height,
- hr_resize_width,
- denoising_strength,
- latent_scale_mode,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ hr_scale,
+ hr_resize_height,
+ hr_resize_width,
+ denoising_strength,
+ latent_scale_mode,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if hr_scale < 0:
- raise ValueError(
- "hr_scale shoule be greater that 0, but acceived {hr_scale}")
+ raise ValueError("hr_scale shoule be greater that 0, but acceived {hr_scale}")
if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0:
raise ValueError(
@@ -168,9 +165,7 @@ def check_inputs(
)
if denoising_strength > 1 or denoising_strength < 0:
- raise ValueError(
- f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}"
- )
+ raise ValueError(f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -188,14 +183,10 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
-
- def get_upscaled_width_and_height(self,
- width,
- height,
- hr_scale=2,
- hr_resize_width=0,
- hr_resize_height=0):
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0):
if hr_resize_width == 0 and hr_resize_height == 0:
hr_upscale_to_width = int(width * hr_scale)
hr_upscale_to_height = int(height * hr_scale)
@@ -221,36 +212,36 @@ def get_upscaled_width_and_height(self,
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=40,
- hires_ratio: Optional[float]=0.5,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- enable_hr: Optional[bool]=True,
- hr_scale: Optional[float]=2.0,
- hr_resize_width: Optional[int]=0,
- hr_resize_height: Optional[int]=0,
- denoising_strength: Optional[float]=0.7,
- latent_scale_mode: Optional[str]="nearest",
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 40,
+ hires_ratio: Optional[float] = 0.5,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ enable_hr: Optional[bool] = True,
+ hr_scale: Optional[float] = 2.0,
+ hr_resize_width: Optional[int] = 0,
+ hr_resize_height: Optional[int] = 0,
+ denoising_strength: Optional[float] = 0.7,
+ latent_scale_mode: Optional[str] = "nearest",
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -347,7 +338,8 @@ def __call__(
latent_scale_mode,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
# 2. Define call parameters
@@ -373,7 +365,8 @@ def __call__(
height=height,
batch_size=batch_size,
num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
@@ -385,7 +378,8 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 4. Prepare timesteps
if enable_hr:
@@ -401,18 +395,17 @@ def __call__(
# 5. Prepare latent variables
if generator is None:
generator_state = paddle.get_cuda_rng_state()
- paddle.Generator().states_["initial_generator"] = copy.deepcopy(
- generator_state)
+ paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state)
else:
- paddle.Generator().states_["initial_generator"] = copy.deepcopy(
- paddle.Generator().states_[generator])
+ paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator])
latents = self.prepare_latents(
batch_size * num_images_per_prompt,
height,
width,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -423,34 +416,29 @@ def __call__(
with self.progress_bar(total=sample_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
if do_controlnet:
unet_inputs["controlnet_cond"] = control_image
- unet_inputs[
- "controlnet_conditioning_scale"] = control_conditioning_scale
+ unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
@@ -462,15 +450,13 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -483,19 +469,16 @@ def __call__(
# 8. determine the upscaled width and height for upscaled images
truncate_width = 0
truncate_height = 0
- (
- hr_upscale_to_width,
- hr_upscale_to_height, ) = self.get_upscaled_width_and_height(
- width,
- height,
- hr_scale=hr_scale,
- hr_resize_width=hr_resize_width,
- hr_resize_height=hr_resize_height, )
+ (hr_upscale_to_width, hr_upscale_to_height,) = self.get_upscaled_width_and_height(
+ width,
+ height,
+ hr_scale=hr_scale,
+ hr_resize_width=hr_resize_width,
+ hr_resize_height=hr_resize_height,
+ )
if hr_resize_width != 0 and hr_resize_height != 0:
- truncate_width = (hr_upscale_to_width - hr_resize_width
- ) // self.vae_scale_factor
- truncate_height = (hr_upscale_to_height - hr_resize_height
- ) // self.vae_scale_factor
+ truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor
+ truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor
        # 9. special case: do nothing if upscaling is not necessary
if hr_upscale_to_width == width and hr_upscale_to_height == height:
@@ -504,77 +487,69 @@ def __call__(
if enable_hr:
if do_controlnet:
- (
- control_image,
- control_conditioning_scale,
- ) = self.prepare_controlnet_cond(
+ (control_image, control_conditioning_scale,) = self.prepare_controlnet_cond(
controlnet_cond=controlnet_cond,
controlnet_conditioning_scale=controlnet_conditioning_scale,
width=hr_upscale_to_width,
height=hr_upscale_to_height,
batch_size=batch_size,
num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
# 10. prepare init latents
- timesteps, hr_steps = self.get_timesteps(hr_steps,
- denoising_strength)
+ timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength)
init_timestep = timesteps[:1].tile([latents.shape[0]])
latents = F.interpolate(
latents,
size=(
hr_upscale_to_height // self.vae_scale_factor,
- hr_upscale_to_width // self.vae_scale_factor, ),
- mode=latent_scale_mode, )
- latents = latents[:, :, truncate_height // 2:latents.shape[2] - (
- truncate_height + 1) // 2, truncate_width // 2:latents.shape[3]
- - (truncate_width + 1) // 2, ]
-
- noise = randn_tensor(
- latents.shape,
- dtype=latents.dtype,
- generator="initial_generator")
+ hr_upscale_to_width // self.vae_scale_factor,
+ ),
+ mode=latent_scale_mode,
+ )
+ latents = latents[
+ :,
+ :,
+ truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2,
+ truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2,
+ ]
+
+ noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator")
latents = self.scheduler.add_noise(latents, noise, init_timestep)
# 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
- extra_step_kwargs = self.prepare_extra_step_kwargs(
- "initial_generator", eta)
+ extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta)
# 12. denoising on hires.fix steps
num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order
with self.progress_bar(total=hr_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else
- latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
if do_controlnet:
unet_inputs["controlnet_cond"] = control_image
- unet_inputs[
- "controlnet_conditioning_scale"] = control_conditioning_scale
+ unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
@@ -586,16 +561,14 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -606,7 +579,8 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
image, has_nsfw_concept = self.run_safety_checker(image)
else:
image = latents
@@ -617,11 +591,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
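
The guidance step that appears in both denoising loops above reduces to one linear combination per pixel; a minimal numeric sketch (the tensor values are made up for illustration):

import paddle

guidance_scale = 7.5
# Pretend UNet output for a batch stacking [unconditional, text-conditioned] predictions.
noise_pred_unet = paddle.to_tensor([[0.2], [0.6]])
noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
# Push the prediction away from the unconditional estimate, toward the prompt-conditioned one.
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.item())  # 0.2 + 7.5 * (0.6 - 0.2) = 3.2 (up to float rounding)
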
diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py
index bf9bbf48e6e90..2fb5aa69a20ee 100644
--- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py
+++ b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py
@@ -24,10 +24,12 @@
# from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipeline_utils import DiffusionPipeline
from ppdiffusers.pipelines.fastdeploy_utils import (
- FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel)
+ FastDeployDiffusionPipelineMixin,
+ FastDeployRuntimeModel,
+)
+
# from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
+from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ppdiffusers.utils import logging
try:
@@ -35,13 +37,11 @@
from paddlenlp.transformers import CLIPFeatureExtractor # CLIPTextModel,
from paddlenlp.transformers import CLIPTokenizer
except ImportError:
- raise ImportError(
- "Please install paddlenlp and ligo-segments to use the mixture pipeline")
+ raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline")
logger = logging.get_logger(__name__)
-def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height,
- tile_row_overlap, tile_col_overlap):
+def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap):
"""Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image
Returns a tuple with:
@@ -50,11 +50,9 @@ def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height,
- Starting coordinates of columns in pixel space
- Ending coordinates of columns in pixel space
"""
- px_row_init = 0 if tile_row == 0 else tile_row * (
- tile_height - tile_row_overlap)
+ px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap)
px_row_end = px_row_init + tile_height
- px_col_init = 0 if tile_col == 0 else tile_col * (
- tile_width - tile_col_overlap)
+ px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap)
px_col_end = px_col_init + tile_width
return px_row_init, px_row_end, px_col_init, px_col_end
@@ -64,8 +62,7 @@ def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end):
return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8
-def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height,
- tile_row_overlap, tile_col_overlap):
+def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap):
"""Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image
Returns a tuple with:
@@ -75,21 +72,21 @@ def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height,
- Ending coordinates of columns in latent space
"""
px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices(
- tile_row, tile_col, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
- return _pixel2latent_indices(px_row_init, px_row_end, px_col_init,
- px_col_end)
+ tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
+ return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end)
def _tile2latent_exclusive_indices(
- tile_row,
- tile_col,
- tile_width,
- tile_height,
- tile_row_overlap,
- tile_col_overlap,
- rows,
- columns, ):
+ tile_row,
+ tile_col,
+ tile_width,
+ tile_height,
+ tile_row_overlap,
+ tile_col_overlap,
+ rows,
+ columns,
+):
"""Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image
Returns a tuple with:
@@ -99,25 +96,22 @@ def _tile2latent_exclusive_indices(
- Ending coordinates of columns in latent space
"""
row_init, row_end, col_init, col_end = _tile2latent_indices(
- tile_row, tile_col, tile_width, tile_height, tile_row_overlap,
- tile_col_overlap)
+ tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap
+ )
row_segment = segment(row_init, row_end)
col_segment = segment(col_init, col_end)
# Iterate over the rest of tiles, clipping the region for the current tile
for row in range(rows):
for column in range(columns):
if row != tile_row and column != tile_col:
- (
- clip_row_init,
- clip_row_end,
- clip_col_init,
- clip_col_end, ) = _tile2latent_indices(
- row,
- column,
- tile_width,
- tile_height,
- tile_row_overlap,
- tile_col_overlap, )
+ (clip_row_init, clip_row_end, clip_col_init, clip_col_end,) = _tile2latent_indices(
+ row,
+ column,
+ tile_width,
+ tile_height,
+ tile_row_overlap,
+ tile_col_overlap,
+ )
row_segment = row_segment - segment(clip_row_init, clip_row_end)
col_segment = col_segment - segment(clip_col_init, clip_col_end)
# return row_init, row_end, col_init, col_end
@@ -127,10 +121,7 @@ def _tile2latent_exclusive_indices(
class StableDiffusionExtrasMixin:
"""Mixin providing additional convenience method to Stable Diffusion pipelines"""
- def _decode_vae_latents(self,
- latents: paddle.Tensor,
- infer_op=None,
- **kwargs):
+ def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs):
latents_shape = latents.shape
output_shape = [
latents_shape[0],
@@ -143,7 +134,8 @@ def _decode_vae_latents(self,
images_vae = self.vae_decoder(
latent_sample=latents,
infer_op=infer_op,
- output_shape=output_shape, )[0]
+ output_shape=output_shape,
+ )[0]
return images_vae
@@ -163,19 +155,20 @@ def decode_latents(self, latents, cpu_vae=False):
return self.numpy_to_pil(image)
-class FastDeployStableDiffusionTilingPipeline(DiffusionPipeline,
- StableDiffusionExtrasMixin,
- FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionTilingPipeline(
+ DiffusionPipeline, StableDiffusionExtrasMixin, FastDeployDiffusionPipelineMixin
+):
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler],
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPFeatureExtractor, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler],
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
super().__init__()
self.register_modules(
vae_encoder=vae_encoder,
@@ -185,7 +178,8 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.post_init()
class SeedTilesMode(Enum):
@@ -196,24 +190,24 @@ class SeedTilesMode(Enum):
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[List[str]]],
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- eta: Optional[float]=0.0,
- seed: Optional[int]=None,
- tile_height: Optional[int]=512,
- tile_width: Optional[int]=512,
- tile_row_overlap: Optional[int]=256,
- tile_col_overlap: Optional[int]=256,
- guidance_scale_tiles: Optional[List[List[float]]]=None,
- seed_tiles: Optional[List[List[int]]]=None,
- seed_tiles_mode: Optional[Union[str, List[List[str]]]]="full",
- seed_reroll_regions: Optional[List[Tuple[int, int, int, int,
- int]]]=None,
- # parse_prompt_type: Optional[str] = "lpw",
- # max_embeddings_multiples: Optional[int] = 3,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[List[str]]],
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ eta: Optional[float] = 0.0,
+ seed: Optional[int] = None,
+ tile_height: Optional[int] = 512,
+ tile_width: Optional[int] = 512,
+ tile_row_overlap: Optional[int] = 256,
+ tile_col_overlap: Optional[int] = 256,
+ guidance_scale_tiles: Optional[List[List[float]]] = None,
+ seed_tiles: Optional[List[List[int]]] = None,
+ seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full",
+ seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None,
+ # parse_prompt_type: Optional[str] = "lpw",
+ # max_embeddings_multiples: Optional[int] = 3,
+ infer_op_dict: Dict[str, str] = None,
+ ):
"""
Function to run the diffusion pipeline with tiling support.
@@ -244,24 +238,18 @@ def __call__(
"""
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
- if not isinstance(prompt, list) or not all(
- isinstance(row, list) for row in prompt):
- raise ValueError(
- f"`prompt` has to be a list of lists but is {type(prompt)}")
+ if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt):
+ raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}")
grid_rows = len(prompt)
grid_cols = len(prompt[0])
if not all(len(row) == grid_cols for row in prompt):
- raise ValueError(
- "All prompt rows must have the same number of prompt columns")
+ raise ValueError("All prompt rows must have the same number of prompt columns")
if not isinstance(seed_tiles_mode, str) and (
- not isinstance(seed_tiles_mode, list) or
- not all(isinstance(row, list) for row in seed_tiles_mode)):
- raise ValueError(
- f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}"
- )
+ not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode)
+ ):
+ raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}")
if isinstance(seed_tiles_mode, str):
- seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))]
- for row in prompt]
+ seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt]
modes = [mode.value for mode in self.SeedTilesMode]
if any(mode not in modes for row in seed_tiles_mode for mode in row):
raise ValueError(f"Seed tiles mode must be one of {modes}")
@@ -270,14 +258,14 @@ def __call__(
batch_size = 1
# create original noisy latents using the timesteps
- height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap
- )
+ height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap)
width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap)
latents_shape = (
batch_size,
self.vae_decoder_num_latent_channels,
height // 8,
- width // 8, )
+ width // 8,
+ )
generator = paddle.Generator().manual_seed(seed)
latents = paddle.randn(shape=latents_shape, generator=generator)
@@ -295,49 +283,48 @@ def __call__(
tile_width,
tile_height,
tile_row_overlap,
- tile_col_overlap, )
+ tile_col_overlap,
+ )
else:
- (
- row_init,
- row_end,
- col_init,
- col_end, ) = _tile2latent_exclusive_indices(
- row,
- col,
- tile_width,
- tile_height,
- tile_row_overlap,
- tile_col_overlap,
- grid_rows,
- grid_cols, )
- tile_generator = paddle.Generator().manual_seed(
- seed_tile)
+ (row_init, row_end, col_init, col_end,) = _tile2latent_exclusive_indices(
+ row,
+ col,
+ tile_width,
+ tile_height,
+ tile_row_overlap,
+ tile_col_overlap,
+ grid_rows,
+ grid_cols,
+ )
+ tile_generator = paddle.Generator().manual_seed(seed_tile)
tile_shape = (
latents_shape[0],
latents_shape[1],
row_end - row_init,
- col_end - col_init, )
- latents[:, :, row_init:row_end, col_init:
- col_end] = paddle.randn(
- shape=tile_shape, generator=tile_generator)
+ col_end - col_init,
+ )
+ latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn(
+ shape=tile_shape, generator=tile_generator
+ )
# overwrite again for seed reroll regions
for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions:
row_init, row_end, col_init, col_end = _pixel2latent_indices(
- row_init, row_end, col_init,
- col_end) # to latent space coordinates
+ row_init, row_end, col_init, col_end
+ ) # to latent space coordinates
reroll_generator = paddle.Generator().manual_seed(seed_reroll)
region_shape = (
latents_shape[0],
latents_shape[1],
row_end - row_init,
- col_end - col_init, )
+ col_end - col_init,
+ )
latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn(
- shape=region_shape, generator=reroll_generator)
+ shape=region_shape, generator=reroll_generator
+ )
# Prepare scheduler
- accepts_offset = "offset" in set(
- inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+ accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
extra_set_kwargs = {}
if accepts_offset:
extra_set_kwargs["offset"] = 1
@@ -347,18 +334,22 @@ def __call__(
latents = latents * self.scheduler.sigmas[0]
# get prompts text embeddings
- text_input = [[
- self.tokenizer(
- col,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- truncation=True,
- return_tensors="pd", ) for col in row
- ] for row in prompt]
- text_embeddings = [[
- self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0]
- for col in row
- ] for row in text_input]
+ text_input = [
+ [
+ self.tokenizer(
+ col,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pd",
+ )
+ for col in row
+ ]
+ for row in prompt
+ ]
+ text_embeddings = [
+ [self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0] for col in row] for row in text_input
+ ]
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -373,29 +364,26 @@ def __call__(
[""] * batch_size,
padding="max_length",
max_length=max_length,
- return_tensors="pd", )
- uncond_embeddings = self.text_encoder(
- input_ids=uncond_input.input_ids.astype(np.int64))[0]
+ return_tensors="pd",
+ )
+ uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int64))[0]
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings[i][j] = paddle.concat(
- x=[uncond_embeddings, text_embeddings[i][j]])
+ text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]])
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
    # Mask for tile weights strength
- tile_weights = self._gaussian_weights(tile_width, tile_height,
- batch_size)
+ tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)
# Diffusion timesteps
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
@@ -406,48 +394,42 @@ def __call__(
for row in range(grid_rows):
noise_preds_row = []
for col in range(grid_cols):
- (
- px_row_init,
- px_row_end,
- px_col_init,
- px_col_end, ) = _tile2latent_indices(
- row,
- col,
- tile_width,
- tile_height,
- tile_row_overlap,
- tile_col_overlap, )
- tile_latents = latents[:, :, px_row_init:px_row_end,
- px_col_init:px_col_end]
+ (px_row_init, px_row_end, px_col_init, px_col_end,) = _tile2latent_indices(
+ row,
+ col,
+ tile_width,
+ tile_height,
+ tile_row_overlap,
+ tile_col_overlap,
+ )
+ tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end]
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat(x=[tile_latents] * 2)
- if do_classifier_free_guidance else
- tile_latents)
+ latent_model_input = (
+ paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents
+ )
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=text_embeddings[row][col],
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
noise_pred = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(
- chunks=2)
- guidance = (guidance_scale
- if guidance_scale_tiles is None or
- guidance_scale_tiles[row][col] is None else
- guidance_scale_tiles[row][col])
- noise_pred_tile = noise_pred_uncond + guidance * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
+ guidance = (
+ guidance_scale
+ if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None
+ else guidance_scale_tiles[row][col]
+ )
+ noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond)
noise_preds_row.append(noise_pred_tile)
noise_preds.append(noise_preds_row)
# Stitch noise predictions for all tiles
@@ -456,22 +438,18 @@ def __call__(
# Add each tile contribution to overall latents
for row in range(grid_rows):
for col in range(grid_cols):
- (
- px_row_init,
- px_row_end,
- px_col_init,
- px_col_end, ) = _tile2latent_indices(
- row,
- col,
- tile_width,
- tile_height,
- tile_row_overlap,
- tile_col_overlap, )
- noise_pred[:, :, px_row_init:px_row_end, px_col_init:
- px_col_end] += (noise_preds[row][col] *
- tile_weights)
- contributors[:, :, px_row_init:px_row_end, px_col_init:
- px_col_end] += tile_weights
+ (px_row_init, px_row_end, px_col_init, px_col_end,) = _tile2latent_indices(
+ row,
+ col,
+ tile_width,
+ tile_height,
+ tile_row_overlap,
+ tile_col_overlap,
+ )
+ noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += (
+ noise_preds[row][col] * tile_weights
+ )
+ contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights
# Average overlapping areas with more than 1 contributor
noise_pred /= contributors
# compute the previous noisy sample x_t -> x_t-1
@@ -481,10 +459,10 @@ def __call__(
t,
latents,
step_index=i,
- return_pred_original_sample=False, ).prev_sample
+ return_pred_original_sample=False,
+ ).prev_sample
else:
- latents = self.scheduler.step(noise_pred, t,
- latents).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
if i == len(self.scheduler.timesteps) - 1:
# sync for accuracy it/s measure
paddle.device.cuda.synchronize()
@@ -505,13 +483,15 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches):
latent_height = tile_height // 8
var = 0.01
midpoint = (latent_width - 1) / 2
- x_probs = [(exp(-(x - midpoint) * (x - midpoint) /
- (latent_width * latent_width) / (2 * var)) /
- sqrt(2 * pi * var)) for x in range(latent_width)]
+ x_probs = [
+ (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var))
+ for x in range(latent_width)
+ ]
midpoint = latent_height / 2
- y_probs = [(exp(-(y - midpoint) * (y - midpoint) /
- (latent_height * latent_height) / (2 * var)) /
- sqrt(2 * pi * var)) for y in range(latent_height)]
+ y_probs = [
+ (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var))
+ for y in range(latent_height)
+ ]
weights = np.outer(y_probs, x_probs)
return paddle.tile(
x=paddle.to_tensor(data=weights),
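
The `_tile2pixel_indices`/`_pixel2latent_indices` helpers used throughout the tiling pipeline reduce to simple stride arithmetic; a standalone sketch with the default 512-pixel tiles and 256-pixel overlap (function names here are illustrative):

def tile2pixel(tile_row, tile_col, tile_width=512, tile_height=512, row_overlap=256, col_overlap=256):
    # Each additional row/column starts one (tile size - overlap) stride further in.
    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - row_overlap)
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - col_overlap)
    return px_row_init, px_row_init + tile_height, px_col_init, px_col_init + tile_width

def pixel2latent(px_row_init, px_row_end, px_col_init, px_col_end):
    # Stable Diffusion latents are 8x smaller than pixels along each spatial axis.
    return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8

# Tile (1, 2) covers pixel rows 256..768 and columns 512..1024,
# i.e. latent rows 32..96 and columns 64..128.
print(pixel2latent(*tile2pixel(1, 2)))  # (32, 96, 64, 128)
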
diff --git a/ppdiffusers/examples/community/reference_only.py b/ppdiffusers/examples/community/reference_only.py
index 7f3035e62a6ea..816ee95647862 100644
--- a/ppdiffusers/examples/community/reference_only.py
+++ b/ppdiffusers/examples/community/reference_only.py
@@ -20,24 +20,32 @@
import paddle
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from PIL import Image
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.models.cross_attention import CrossAttention
from ppdiffusers.models.transformer_2d import Transformer2DModelOutput
-from ppdiffusers.models.unet_2d_blocks import (ResnetBlock2D,
- Transformer2DModel, Upsample2D)
+from ppdiffusers.models.unet_2d_blocks import (
+ ResnetBlock2D,
+ Transformer2DModel,
+ Upsample2D,
+)
from ppdiffusers.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
-from ppdiffusers.utils import (PIL_INTERPOLATION, check_min_version, deprecate,
- logging, randn_tensor, replace_example_docstring)
+from ppdiffusers.utils import (
+ PIL_INTERPOLATION,
+ check_min_version,
+ deprecate,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
check_min_version("0.14.1")
@@ -70,18 +78,14 @@
def stable_var(x, axis=None, unbiased=True, keepdim=False, name=None):
dtype = x.dtype
u = paddle.mean(x, axis=axis, keepdim=True, name=name)
- n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast(
- paddle.numel(u), paddle.int64)
+ n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast(paddle.numel(u), paddle.int64)
n = n.astype(dtype)
if unbiased:
one_const = paddle.ones([], x.dtype)
n = paddle.where(n > one_const, n - 1.0, one_const)
n = n**0.5
n.stop_gradient = True
- out = paddle.sum(paddle.pow((x - u) / n, 2),
- axis=axis,
- keepdim=keepdim,
- name=name)
+ out = paddle.sum(paddle.pow((x - u) / n, 2), axis=axis, keepdim=keepdim, name=name)
return out
@@ -94,11 +98,12 @@ def var_mean(x, axis=-1, keepdim=True, unbiased=True, correction=None):
def self_attn_forward(
- self,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+):
attn_output = None
if getattr(self, "enable_attn", False):
@@ -114,31 +119,34 @@ def self_attn_forward(
hidden_states=image_hidden_states,
encoder_hidden_states=image_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
latent_self_attn1_uc = self.processor(
self,
latent_hidden_states,
encoder_hidden_states=paddle.concat(
- [latent_hidden_states] + image_hidden_states.split(
- [chunk_num] *
- (image_hidden_states.shape[0] // chunk_num)),
- axis=1, ),
+ [latent_hidden_states]
+ + image_hidden_states.split([chunk_num] * (image_hidden_states.shape[0] // chunk_num)),
+ axis=1,
+ ),
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
if do_classifier_free_guidance and self.current_style_fidelity > 1e-5:
latent_self_attn1_c = latent_self_attn1_uc.clone()
latent_self_attn1_c[self.current_uc_indices] = self.processor(
self,
hidden_states=latent_hidden_states[self.current_uc_indices],
- encoder_hidden_states=latent_hidden_states[
- self.current_uc_indices],
+ encoder_hidden_states=latent_hidden_states[self.current_uc_indices],
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
latent_self_attn1 = (
- self.current_style_fidelity * latent_self_attn1_c +
- (1.0 - self.current_style_fidelity) * latent_self_attn1_uc)
+ self.current_style_fidelity * latent_self_attn1_c
+ + (1.0 - self.current_style_fidelity) * latent_self_attn1_uc
+ )
else:
latent_self_attn1 = latent_self_attn1_uc
@@ -150,25 +158,28 @@ def self_attn_forward(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
return attn_output
def transformer_2d_model_forward(
- self,
- hidden_states,
- encoder_hidden_states=None,
- timestep=None,
- class_labels=None,
- cross_attention_kwargs=None,
- return_dict: bool=True, ):
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ timestep=None,
+ class_labels=None,
+ cross_attention_kwargs=None,
+ return_dict: bool = True,
+):
x = self.original_forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
timestep=timestep,
class_labels=class_labels,
cross_attention_kwargs=cross_attention_kwargs,
- return_dict=return_dict, )[0]
+ return_dict=return_dict,
+ )[0]
output = None
if getattr(self, "enable_gn", False):
if self.gn_auto_machine_weight > self.gn_weight:
@@ -177,26 +188,20 @@ def transformer_2d_model_forward(
latent_hidden_states = x[:chunk_num] # uc, c
image_hidden_states = x[chunk_num:] # uc, c
- image_var, image_mean = var_mean(
- image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
- var, mean = var_mean(
- latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
- std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5
+ image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
+ var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
+ std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5
div_num = image_hidden_states.shape[0] // chunk_num
mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num
var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num
- std_acc = paddle.maximum(var_acc,
- paddle.zeros_like(var_acc) + EPS)**0.5
+ std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5
y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc
if do_classifier_free_guidance and self.current_style_fidelity > 1e-5:
y_c = y_uc.clone()
- y_c[self.current_uc_indices] = latent_hidden_states[
- self.current_uc_indices]
- latent_hidden_states = (
- self.current_style_fidelity * y_c +
- (1.0 - self.current_style_fidelity) * y_uc)
+ y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices]
+ latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc
else:
latent_hidden_states = y_uc
output = paddle.concat([latent_hidden_states, image_hidden_states])
@@ -204,7 +209,7 @@ def transformer_2d_model_forward(
if output is None:
output = x
if not return_dict:
- return (output, )
+ return (output,)
return Transformer2DModelOutput(sample=output)
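
The hook above re-normalises the latent features with statistics accumulated from the reference image; a minimal numpy sketch of that statistic swap (shapes and the fixed style_fidelity value are illustrative):

import numpy as np

EPS = 1e-6
latent = np.random.randn(1, 4, 8, 8).astype("float32")     # features of the image being generated
reference = np.random.randn(1, 4, 8, 8).astype("float32")  # features of the reference image

# Per-channel mean/std over the spatial axes, as var_mean(..., axis=(2, 3)) does above.
mean = latent.mean(axis=(2, 3), keepdims=True)
std = np.maximum(latent.var(axis=(2, 3), keepdims=True), EPS) ** 0.5
ref_mean = reference.mean(axis=(2, 3), keepdims=True)
ref_std = np.maximum(reference.var(axis=(2, 3), keepdims=True), EPS) ** 0.5

# Whiten the latent features, then rescale and shift them to the reference statistics.
y_uc = ((latent - mean) / std) * ref_std + ref_mean

# For the unconditional part of the batch, current_style_fidelity blends the
# re-normalised features back toward the untouched ones.
style_fidelity = 0.5
y_blended = style_fidelity * latent + (1.0 - style_fidelity) * y_uc
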
@@ -219,26 +224,20 @@ def resnet_block_2d_forward(self, input_tensor, temb):
latent_hidden_states = x[:chunk_num] # uc, c
image_hidden_states = x[chunk_num:] # uc, c
- image_var, image_mean = var_mean(
- image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
- var, mean = var_mean(
- latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
- std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5
+ image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
+ var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
+ std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5
div_num = image_hidden_states.shape[0] // chunk_num
mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num
var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num
- std_acc = paddle.maximum(var_acc,
- paddle.zeros_like(var_acc) + EPS)**0.5
+ std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5
y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc
if do_classifier_free_guidance and self.current_style_fidelity > 1e-5:
y_c = y_uc.clone()
- y_c[self.current_uc_indices] = latent_hidden_states[
- self.current_uc_indices]
- latent_hidden_states = (
- self.current_style_fidelity * y_c +
- (1.0 - self.current_style_fidelity) * y_uc)
+ y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices]
+ latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc
else:
latent_hidden_states = y_uc
output = paddle.concat([latent_hidden_states, image_hidden_states])
@@ -259,26 +258,20 @@ def upsample_2d_forward(self, hidden_states, output_size=None):
latent_hidden_states = x[:chunk_num] # uc, c
image_hidden_states = x[chunk_num:] # uc, c
- image_var, image_mean = var_mean(
- image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
- var, mean = var_mean(
- latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
- std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5
+ image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
+ var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False)
+ std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5
div_num = image_hidden_states.shape[0] // chunk_num
mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num
var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num
- std_acc = paddle.maximum(var_acc,
- paddle.zeros_like(var_acc) + EPS)**0.5
+ std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5
y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc
if do_classifier_free_guidance and self.current_style_fidelity > 1e-5:
y_c = y_uc.clone()
- y_c[self.current_uc_indices] = latent_hidden_states[
- self.current_uc_indices]
- latent_hidden_states = (
- self.current_style_fidelity * y_c +
- (1.0 - self.current_style_fidelity) * y_uc)
+ y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices]
+ latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc
else:
latent_hidden_states = y_uc
output = paddle.concat([latent_hidden_states, image_hidden_states])
@@ -316,26 +309,16 @@ def preprocess(image, resize_mode, width, height):
if isinstance(image, paddle.Tensor):
return image
elif isinstance(image, PIL.Image.Image):
- image = resize_image(
- resize_mode=resize_mode, im=image, width=width, height=height)
+ image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height)
image = [image]
if isinstance(image[0], PIL.Image.Image):
- image = [
- resize_image(
- resize_mode=resize_mode, im=im, width=width, height=height)
- for im in image
- ]
+ image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image]
w, h = image[0].size
- w, h = map(lambda x: x - x % 8,
- (w, h)) # resize to integer multiple of 8
+ w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -382,8 +365,7 @@ def resize(im, w, h):
resized = resize(im, src_w, src_h)
res = Image.new("RGB", (width, height))
- res.paste(
- resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+ res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
else:
ratio = width / height
@@ -394,31 +376,22 @@ def resize(im, w, h):
resized = resize(im, src_w, src_h)
res = Image.new("RGB", (width, height))
- res.paste(
- resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+ res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
if ratio < src_ratio:
fill_height = height // 2 - src_h // 2
+ res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
res.paste(
- resized.resize(
- (width, fill_height), box=(0, 0, width, 0)),
- box=(0, 0))
- res.paste(
- resized.resize(
- (width, fill_height),
- box=(0, resized.height, width, resized.height)),
- box=(0, fill_height + src_h), )
+ resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
+ box=(0, fill_height + src_h),
+ )
elif ratio > src_ratio:
fill_width = width // 2 - src_w // 2
+ res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
res.paste(
- resized.resize(
- (fill_width, height), box=(0, 0, 0, height)),
- box=(0, 0))
- res.paste(
- resized.resize(
- (fill_width, height),
- box=(resized.width, 0, resized.width, height)),
- box=(fill_width + src_w, 0), )
+ resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
+ box=(fill_width + src_w, 0),
+ )
return res
@@ -454,37 +427,33 @@ class ReferenceOnlyPipeline(DiffusionPipeline):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -492,11 +461,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -517,12 +482,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -533,12 +496,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
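
As an aside on the condensed check above: the nested `version.parse(version.parse(...).base_version)` strips pre-release and post-release suffixes before comparing, e.g. (hypothetical version value):

from packaging import version

v = version.parse(version.parse("0.14.0.post1").base_version)   # Version('0.14.0')
print(v < version.parse("0.9.0.dev0"))                          # False -> no sample_size deprecation
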
@@ -550,21 +510,23 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.attn_modules = None
self.gn_modules = None
def set_reference_only(
- self,
- attention_auto_machine_weight=1.0,
- gn_auto_machine_weight=1.0,
- current_style_fidelity=0.5,
- enable_attn=True,
- enable_gn=True,
- do_classifier_free_guidance=True, ):
+ self,
+ attention_auto_machine_weight=1.0,
+ gn_auto_machine_weight=1.0,
+ current_style_fidelity=0.5,
+ enable_attn=True,
+ enable_gn=True,
+ do_classifier_free_guidance=True,
+ ):
assert 0.0 <= attention_auto_machine_weight <= 1.0
assert 0.0 <= gn_auto_machine_weight <= 2.0
assert 0.0 <= current_style_fidelity <= 1.0
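
One number worth keeping in mind when reading the reformatted `vae_scale_factor` line above: for the stock SD 1.x VAE, `block_out_channels` has four entries, so the factor is 8 and a 512x512 input maps to a 64x64 latent. A quick check with the usual config values (assumed here, not taken from the patch):

block_out_channels = [128, 256, 512, 512]               # typical SD 1.x VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # = 8
assert 512 // vae_scale_factor == 64
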
@@ -574,18 +536,14 @@ def set_reference_only(
module.enable_attn = enable_attn
module.attention_auto_machine_weight = attention_auto_machine_weight
module.current_style_fidelity = current_style_fidelity
- module.current_uc_indices = [
- 0
- ] if do_classifier_free_guidance else []
+ module.current_uc_indices = [0] if do_classifier_free_guidance else []
if self.gn_modules is not None:
for module in self.gn_modules:
module.enable_gn = enable_gn
module.gn_auto_machine_weight = gn_auto_machine_weight
module.current_style_fidelity = current_style_fidelity
- module.current_uc_indices = [
- 0
- ] if do_classifier_free_guidance else []
+ module.current_uc_indices = [0] if do_classifier_free_guidance else []
# init attn_modules
if self.attn_modules is None:
@@ -599,75 +557,54 @@ def set_reference_only(
hidden_size = self.unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(
- reversed(self.unet.config.block_out_channels))[block_id]
+ hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = self.unet.config.block_out_channels[block_id]
self_attn_processors_keys.append([name, hidden_size])
# sorted by (-hidden_size, name),down -> mid -> up.
- for i, (name, _) in enumerate(
- sorted(
- self_attn_processors_keys,
- key=lambda x: (-x[1], x[0]))):
+ for i, (name, _) in enumerate(sorted(self_attn_processors_keys, key=lambda x: (-x[1], x[0]))):
module = self.unet.get_sublayer(name)
- module.attn_weight = float(i) / float(
- len(self_attn_processors_keys))
+ module.attn_weight = float(i) / float(len(self_attn_processors_keys))
module.enable_attn = enable_attn
module.attention_auto_machine_weight = attention_auto_machine_weight
module.current_style_fidelity = current_style_fidelity
- module.current_uc_indices = [
- 0
- ] if do_classifier_free_guidance else []
+ module.current_uc_indices = [0] if do_classifier_free_guidance else []
attn_modules.append(module)
self.attn_modules = attn_modules
# init gn_modules
if self.gn_modules is None:
- gn_modules = [self.unet.mid_block.attentions[-1], ]
- self.unet.mid_block.attentions[
- -1].gn_weight = 0.0 # mid 0.0
+ gn_modules = [
+ self.unet.mid_block.attentions[-1],
+ ]
+ self.unet.mid_block.attentions[-1].gn_weight = 0.0 # mid 0.0
input_block_names = [
- ("down_blocks.1.resnets.0",
- "down_blocks.1.attentions.0"), # 4 2.0
- ("down_blocks.1.resnets.1",
- "down_blocks.1.attentions.1"), # 5 1.66
- ("down_blocks.2.resnets.0",
- "down_blocks.2.attentions.0"), # 7 1.33
- ("down_blocks.2.resnets.1",
- "down_blocks.2.attentions.1"), # 8 1.0
- ("down_blocks.3.resnets.0",
- ), # 10 0.66
- ("down_blocks.3.resnets.1",
- ), # 11 0.33
+ ("down_blocks.1.resnets.0", "down_blocks.1.attentions.0"), # 4 2.0
+ ("down_blocks.1.resnets.1", "down_blocks.1.attentions.1"), # 5 1.66
+ ("down_blocks.2.resnets.0", "down_blocks.2.attentions.0"), # 7 1.33
+ ("down_blocks.2.resnets.1", "down_blocks.2.attentions.1"), # 8 1.0
+ ("down_blocks.3.resnets.0",), # 10 0.66
+ ("down_blocks.3.resnets.1",), # 11 0.33
]
for w, block_names in enumerate(input_block_names):
module = self.unet.get_sublayer(block_names[-1])
- module.gn_weight = 1.0 - float(w) / float(
- len(input_block_names))
+ module.gn_weight = 1.0 - float(w) / float(len(input_block_names))
gn_modules.append(module)
output_block_names = [
- ("up_blocks.0.resnets.0",
- ), # 0 0.0
- ("up_blocks.0.resnets.1",
- ), # 1 0.25
- ("up_blocks.0.resnets.2",
- "up_blocks.0.upsamplers.0"), # 2 0.5
- ("up_blocks.1.resnets.0",
- "up_blocks.1.attentions.0"), # 3 0.75
- ("up_blocks.1.resnets.1",
- "up_blocks.1.attentions.1"), # 4 1.0
- ("up_blocks.1.resnets.2",
- "up_blocks.1.attentions.2"), # 5 1.25
- ("up_blocks.2.resnets.0",
- "up_blocks.2.attentions.0"), # 6 1.5
- ("up_blocks.2.resnets.1",
- "up_blocks.2.attentions.1"), # 7 1.75
+ ("up_blocks.0.resnets.0",), # 0 0.0
+ ("up_blocks.0.resnets.1",), # 1 0.25
+ ("up_blocks.0.resnets.2", "up_blocks.0.upsamplers.0"), # 2 0.5
+ ("up_blocks.1.resnets.0", "up_blocks.1.attentions.0"), # 3 0.75
+ ("up_blocks.1.resnets.1", "up_blocks.1.attentions.1"), # 4 1.0
+ ("up_blocks.1.resnets.2", "up_blocks.1.attentions.2"), # 5 1.25
+ ("up_blocks.2.resnets.0", "up_blocks.2.attentions.0"), # 6 1.5
+ ("up_blocks.2.resnets.1", "up_blocks.2.attentions.1"), # 7 1.75
]
for w, block_names in enumerate(output_block_names):
module = self.unet.get_sublayer(block_names[-1])
@@ -679,20 +616,19 @@ def set_reference_only(
module.enable_gn = enable_gn
module.gn_auto_machine_weight = gn_auto_machine_weight
module.current_style_fidelity = current_style_fidelity
- module.current_uc_indices = [
- 0
- ] if do_classifier_free_guidance else []
+ module.current_uc_indices = [0] if do_classifier_free_guidance else []
self.gn_modules = gn_modules
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -728,29 +664,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -758,8 +696,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -769,14 +706,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -786,46 +725,42 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
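
A note on the `paddle.concat([negative_prompt_embeds, prompt_embeds])` line in the hunk above (behaviour unchanged, just easy to miss after the reflow): the unconditional embeddings go first so that `chunk(2)` on the noise prediction later yields the unconditional half as element 0. A rough sketch with placeholder shapes, not from the patch:

import paddle

negative_prompt_embeds = paddle.zeros([1, 77, 768])   # placeholder "uncond" embeddings
prompt_embeds = paddle.ones([1, 77, 768])             # placeholder "cond" embeddings
prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])   # rows: [uncond, cond]

noise_pred = paddle.concat([paddle.zeros([1, 4, 64, 64]), paddle.ones([1, 4, 64, 64])])
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)  # element 0 pairs with the uncond embeddings
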
@@ -844,53 +779,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -903,17 +834,19 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
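
The `prepare_extra_step_kwargs` helper earlier in this hunk forwards `eta` and `generator` only when the scheduler's `step()` signature accepts them; the mechanism is ordinary `inspect` and works for any callable. A standalone illustration with a made-up step function:

import inspect

def step(model_output, timestep, sample, generator=None):   # hypothetical scheduler.step
    pass

accepts_eta = "eta" in inspect.signature(step).parameters              # False -> eta is dropped
accepts_generator = "generator" in inspect.signature(step).parameters  # True -> generator is forwarded
extra_step_kwargs = {"generator": None} if accepts_generator else {}
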
@@ -934,12 +867,13 @@ def prepare_latents(
return latents
def prepare_image_latents(
- self,
- image,
- batch_size,
- dtype,
- generator=None,
- do_classifier_free_guidance=False, ):
+ self,
+ image,
+ batch_size,
+ dtype,
+ generator=None,
+ do_classifier_free_guidance=False,
+ ):
if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
@@ -948,8 +882,7 @@ def prepare_image_latents(
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
@@ -965,33 +898,32 @@ def prepare_image_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[PIL.Image.Image, List[PIL.Image.Image],
- paddle.Tensor]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- # reference
- control_name: str="reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn"
- attention_auto_machine_weight: float=1.0,
- gn_auto_machine_weight: float=1.0,
- current_style_fidelity: float=0.5,
- resize_mode: int=-1, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ # reference
+ control_name: str = "reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn"
+ attention_auto_machine_weight: float = 1.0,
+ gn_auto_machine_weight: float = 1.0,
+ current_style_fidelity: float = 0.5,
+ resize_mode: int = -1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -1079,7 +1011,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -1101,7 +1034,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -1118,55 +1052,57 @@ def __call__(
width,
dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. reference_only
- enable_attn = ("only" in control_name or "attn" in control_name and
- image is not None and attention_auto_machine_weight > 0)
- enable_gn = ("adain" in control_name and image is not None and
- gn_auto_machine_weight > 0)
+ enable_attn = (
+ "only" in control_name
+ or "attn" in control_name
+ and image is not None
+ and attention_auto_machine_weight > 0
+ )
+ enable_gn = "adain" in control_name and image is not None and gn_auto_machine_weight > 0
self.set_reference_only(
attention_auto_machine_weight,
gn_auto_machine_weight,
current_style_fidelity,
enable_attn,
enable_gn,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
if enable_attn or enable_gn:
image = preprocess(image, resize_mode, width, height)
image_latents = self.prepare_image_latents(
- image, batch_size, dtype, generator,
- do_classifier_free_guidance)
+ image, batch_size, dtype, generator, do_classifier_free_guidance
+ )
prompt_embeds = prompt_embeds.tile([1 + image.shape[0], 1, 1])
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
if enable_attn or enable_gn:
- image_noise = randn_tensor(
- image_latents.shape, generator=generator, dtype=dtype)
+ image_noise = randn_tensor(image_latents.shape, generator=generator, dtype=dtype)
image_latent_model_input = self.scheduler.scale_model_input(
- self.scheduler.add_noise(image_latents, image_noise, t),
- t)
+ self.scheduler.add_noise(image_latents, image_noise, t), t
+ )
chunk_num = 2 if do_classifier_free_guidance else 1
noise_pred = self.unet(
- paddle.concat([
- latent_model_input,
- image_latent_model_input.cast(
- latent_model_input.dtype),
- ]),
+ paddle.concat(
+ [
+ latent_model_input,
+ image_latent_model_input.cast(latent_model_input.dtype),
+ ]
+ ),
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
@@ -1176,22 +1112,19 @@ def __call__(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
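
The guidance line inside the loop above is untouched in substance; spelled out, it is the usual classifier-free guidance extrapolation, e.g. with scalar stand-ins:

# noise_pred = uncond + guidance_scale * (text - uncond)
uncond, text, guidance_scale = 0.2, 0.5, 7.5
noise_pred = uncond + guidance_scale * (text - uncond)   # 2.45, pushed past the conditional prediction
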
@@ -1204,8 +1137,7 @@ def __call__(
image = self.decode_latents(latents)
# 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 11. Convert to PIL
image = self.numpy_to_pil(image)
@@ -1214,11 +1146,9 @@ def __call__(
image = self.decode_latents(latents)
# 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py
index 5d0dc0e26b395..25e821228b061 100644
--- a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py
+++ b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py
@@ -19,23 +19,27 @@
import numpy as np
import paddle
import PIL.Image
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ppdiffusers.image_processor import VaeImageProcessor
from ppdiffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
-from ppdiffusers.models import (AutoencoderKL, ControlNetModel,
- UNet2DConditionModel)
+from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
from ppdiffusers.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import \
- MultiControlNetModel
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import (
+ MultiControlNetModel,
+)
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
-from ppdiffusers.utils import (check_min_version, deprecate, logging,
- randn_tensor, replace_example_docstring)
+from ppdiffusers.utils import (
+ check_min_version,
+ deprecate,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
check_min_version("0.16.1")
@@ -88,8 +92,7 @@
"""
-class StableDiffusionControlNetImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
@@ -126,17 +129,22 @@ class StableDiffusionControlNetImg2ImgPipeline(
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[
- ControlNetModel], MultiControlNetModel, ],
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[
+ ControlNetModel,
+ List[ControlNetModel],
+ Tuple[ControlNetModel],
+ MultiControlNetModel,
+ ],
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -166,25 +174,27 @@ def __init__(
controlnet=controlnet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(
- vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.control_image_processor = VaeImageProcessor(
vae_scale_factor=self.vae_scale_factor,
do_convert_rgb=True,
- do_normalize=False, )
+ do_normalize=False,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- lora_scale: Optional[float]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ lora_scale: Optional[float] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -231,32 +241,36 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- config = (self.text_encoder.config
- if isinstance(self.text_encoder.config, dict) else
- self.text_encoder.config.to_dict())
- if (config.get("use_attention_mask", None) is not None and
- config["use_attention_mask"]):
+ config = (
+ self.text_encoder.config
+ if isinstance(self.text_encoder.config, dict)
+ else self.text_encoder.config.to_dict()
+ )
+ if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
@@ -264,33 +278,32 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif prompt is not None and type(prompt) is not type(
- negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: procecss multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -298,39 +311,38 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- config = (self.text_encoder.config
- if isinstance(self.text_encoder.config, dict) else
- self.text_encoder.config.to_dict())
- if (config.get("use_attention_mask", None) is not None and
- config["use_attention_mask"]):
+ config = (
+ self.text_encoder.config
+ if isinstance(self.text_encoder.config, dict)
+ else self.text_encoder.config.to_dict()
+ )
+ if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- dtype=self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -339,16 +351,13 @@ def run_safety_checker(self, image, dtype):
has_nsfw_concept = None
else:
if paddle.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(
- image, output_type="pil")
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
- feature_extractor_input = self.image_processor.numpy_to_pil(
- image)
- safety_checker_input = self.feature_extractor(
- feature_extractor_input, return_tensors="pd")
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
return image, has_nsfw_concept
def prepare_extra_step_kwargs(self, generator, eta):
@@ -357,48 +366,46 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None,
- controlnet_conditioning_scale=1.0, ):
+ self,
+ prompt,
+ image,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -411,7 +418,8 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# `prompt` needs more sophisticated handling when there are multiple
# conditionings.
@@ -426,15 +434,12 @@ def check_inputs(
self.check_image(image, prompt, prompt_embeds)
elif isinstance(self.controlnet, MultiControlNetModel):
if not isinstance(image, list):
- raise TypeError(
- "For multiple controlnets: `image` must be type `list`")
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
# When `image` is a nested list:
# (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
elif any(isinstance(i, list) for i in image):
- raise ValueError(
- "A single batch of multiple conditionings are supported at the moment."
- )
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
elif len(image) != len(self.controlnet.nets):
raise ValueError(
f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
@@ -448,22 +453,18 @@ def check_inputs(
# Check `controlnet_conditioning_scale`
if isinstance(self.controlnet, ControlNetModel):
if not isinstance(controlnet_conditioning_scale, float):
- raise TypeError(
- "For single controlnet: `controlnet_conditioning_scale` must be type `float`."
- )
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
elif isinstance(self.controlnet, MultiControlNetModel):
if isinstance(controlnet_conditioning_scale, list):
- if any(
- isinstance(i, list)
- for i in controlnet_conditioning_scale):
- raise ValueError(
- "A single batch of multiple conditionings are supported at the moment."
- )
- elif isinstance(controlnet_conditioning_scale, list) and len(
- controlnet_conditioning_scale) != len(self.controlnet.nets):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
raise ValueError(
"For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
- " the same length as the number of controlnets")
+ " the same length as the number of controlnets"
+ )
else:
assert False
@@ -471,16 +472,18 @@ def check_image(self, image, prompt, prompt_embeds):
image_is_pil = isinstance(image, PIL.Image.Image)
image_is_tensor = isinstance(image, paddle.Tensor)
image_is_np = isinstance(image, np.ndarray)
- image_is_pil_list = isinstance(image, list) and isinstance(
- image[0], PIL.Image.Image)
- image_is_tensor_list = isinstance(image, list) and isinstance(
- image[0], paddle.Tensor)
- image_is_np_list = isinstance(image, list) and isinstance(image[0],
- np.ndarray)
-
- if (not image_is_pil and not image_is_tensor and not image_is_np and
- not image_is_pil_list and not image_is_tensor_list and
- not image_is_np_list):
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor)
+ image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+ if (
+ not image_is_pil
+ and not image_is_tensor
+ and not image_is_np
+ and not image_is_pil_list
+ and not image_is_tensor_list
+ and not image_is_np_list
+ ):
raise TypeError(
f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
)
@@ -503,17 +506,17 @@ def check_image(self, image, prompt, prompt_embeds):
)
def prepare_control_image(
- self,
- image,
- width,
- height,
- batch_size,
- num_images_per_prompt,
- dtype,
- do_classifier_free_guidance=False,
- guess_mode=False, ):
- image = self.control_image_processor.preprocess(
- image, height=height, width=width).cast(dtype=paddle.float32)
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ image = self.control_image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32)
image_batch_size = image.shape[0]
if image_batch_size == 1:
@@ -533,21 +536,14 @@ def prepare_control_image(
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
@@ -569,18 +565,15 @@ def prepare_latents(self,
elif isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(
- generator[i]) for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
- init_latents = self.vae.encode(image).latent_dist.sample(
- generator)
+ init_latents = self.vae.encode(image).latent_dist.sample(generator)
init_latents = self.vae.config.scaling_factor * init_latents
- if (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] == 0):
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
@@ -592,12 +585,11 @@ def prepare_latents(self,
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_image_per_prompt = batch_size // init_latents.shape[0]
- init_latents = paddle.concat(
- [init_latents] * additional_image_per_prompt, axis=0)
- elif (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] != 0):
+ init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0)
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
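
Before the continuation of `prepare_latents` below, one note on the `get_timesteps` reflow a few lines up: the arithmetic driven by `strength` is compact enough to lose on a quick read. A worked example, assuming a first-order scheduler (`scheduler.order == 1`):

num_inference_steps, strength, order = 50, 0.8, 1
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)   # 40
t_start = max(num_inference_steps - init_timestep, 0)                           # 10
# timesteps = scheduler.timesteps[t_start * order:] -> only the last 40 of 50 steps run,
# starting from image latents noised to timesteps[0]
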
@@ -616,33 +608,44 @@ def prepare_latents(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray, List[
- paddle.Tensor], List[PIL.Image.Image], List[np.ndarray], ]=None,
- control_image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray,
- List[paddle.Tensor], List[
- PIL.Image.Image], List[np.ndarray], ]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_conditioning_scale: Union[float, List[float]]=0.8,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[
+ paddle.Tensor,
+ PIL.Image.Image,
+ np.ndarray,
+ List[paddle.Tensor],
+ List[PIL.Image.Image],
+ List[np.ndarray],
+ ] = None,
+ control_image: Union[
+ paddle.Tensor,
+ PIL.Image.Image,
+ np.ndarray,
+ List[paddle.Tensor],
+ List[PIL.Image.Image],
+ List[np.ndarray],
+ ] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -740,7 +743,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- controlnet_conditioning_scale, )
+ controlnet_conditioning_scale,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -757,20 +761,20 @@ def __call__(
controlnet = self.controlnet
- if isinstance(controlnet, MultiControlNetModel) and isinstance(
- controlnet_conditioning_scale, float):
- controlnet_conditioning_scale = [controlnet_conditioning_scale
- ] * len(controlnet.nets)
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
global_pool_conditions = (
controlnet.config.global_pool_conditions
- if isinstance(controlnet, ControlNetModel) else
- controlnet.nets[0].config.global_pool_conditions)
+ if isinstance(controlnet, ControlNetModel)
+ else controlnet.nets[0].config.global_pool_conditions
+ )
guess_mode = guess_mode or global_pool_conditions
# 3. Encode input prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
@@ -778,10 +782,10 @@ def __call__(
negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- lora_scale=text_encoder_lora_scale, )
+ lora_scale=text_encoder_lora_scale,
+ )
# 4. Prepare image
- image = self.image_processor.preprocess(image).cast(
- dtype=paddle.float32)
+ image = self.image_processor.preprocess(image).cast(dtype=paddle.float32)
# 5. Prepare controlnet_conditioning_image
if isinstance(controlnet, ControlNetModel):
@@ -793,7 +797,8 @@ def __call__(
num_images_per_prompt=num_images_per_prompt,
dtype=controlnet.dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
elif isinstance(controlnet, MultiControlNetModel):
control_images = []
@@ -806,7 +811,8 @@ def __call__(
num_images_per_prompt=num_images_per_prompt,
dtype=controlnet.dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
control_images.append(control_image_)
@@ -815,11 +821,11 @@ def __call__(
assert False
# 5. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, )
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ self.scheduler.set_timesteps(
+ num_inference_steps,
+ )
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
latents = self.prepare_latents(
@@ -828,28 +834,25 @@ def __call__(
batch_size,
num_images_per_prompt,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# controlnet(s) inference
if guess_mode and do_classifier_free_guidance:
# Infer ControlNet only for the conditional batch.
control_model_input = latents
- control_model_input = self.scheduler.scale_model_input(
- control_model_input, t)
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
else:
control_model_input = latent_model_input
@@ -862,20 +865,17 @@ def __call__(
controlnet_cond=control_image,
conditioning_scale=controlnet_conditioning_scale,
guess_mode=guess_mode,
- return_dict=False, )
+ return_dict=False,
+ )
if guess_mode and do_classifier_free_guidance:
# Infered ControlNet only for the conditional batch.
# To apply the output of ControlNet to both the unconditional and conditional batches,
# add 0 to the unconditional batch to keep it unchanged.
- down_block_res_samples = [
- paddle.concat([paddle.zeros_like(d), d])
- for d in down_block_res_samples
- ]
- mid_block_res_sample = paddle.concat([
- paddle.zeros_like(mid_block_res_sample),
- mid_block_res_sample
- ])
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = paddle.concat(
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
+ )
# predict the noise residual
noise_pred = self.unet(
@@ -885,35 +885,26 @@ def __call__(
cross_attention_kwargs=cross_attention_kwargs,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- **extra_step_kwargs,
- return_dict=False)[0]
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
- image = self.vae.decode(
- latents / self.vae.config.scaling_factor, return_dict=False)[0]
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
else:
image = latents
has_nsfw_concept = None
@@ -923,11 +914,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
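
One behavioural detail from the guess-mode hunk earlier in this file that the reflow makes easy to skim past: when `guess_mode` is combined with classifier-free guidance, ControlNet is run only on the conditional half of the batch, and zeros are prepended so the residuals leave the unconditional half unchanged when the UNet adds them. A minimal sketch with a placeholder residual:

import paddle

d = paddle.ones([1, 320, 64, 64])                    # hypothetical ControlNet residual (cond batch only)
padded = paddle.concat([paddle.zeros_like(d), d])    # shape [2, 320, 64, 64]
# row 0 (uncond) receives +0, row 1 (cond) receives the real residual inside the UNet
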
diff --git a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py
index 73eae51ab8e43..420f7c4ee7053 100644
--- a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py
+++ b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py
@@ -19,18 +19,21 @@
import paddle
from packaging import version
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ppdiffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
from ppdiffusers.configuration_utils import FrozenDict
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
-from ppdiffusers.utils import (deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ppdiffusers.utils import (
+ deprecate,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
logger = logging.get_logger(__name__)
@@ -80,37 +83,33 @@ class StableDiffusionHiresFixPipeline(DiffusionPipeline):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -118,11 +117,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -143,12 +138,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -159,12 +152,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -176,18 +166,20 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -223,29 +215,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -253,8 +247,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -264,14 +257,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -281,36 +276,33 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -319,7 +311,7 @@ def get_timesteps(self, denoising_steps, denoising_strength):
self.scheduler.set_timesteps(steps)
t_start = max(steps - denoising_steps, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
if hasattr(self.scheduler, "step_index_offset"):
self.scheduler.step_index_offset = t_start * self.scheduler.order
@@ -328,11 +320,10 @@ def get_timesteps(self, denoising_steps, denoising_strength):
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -351,62 +342,57 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- hr_scale,
- hr_resize_height,
- hr_resize_width,
- denoising_strength,
- latent_scale_mode,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ hr_scale,
+ hr_resize_height,
+ hr_resize_width,
+ denoising_strength,
+ latent_scale_mode,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if hr_scale < 0:
- raise ValueError(
- "hr_scale shoule be greater that 0, but acceived {hr_scale}")
+            raise ValueError(f"hr_scale should be greater than 0, but received {hr_scale}")
if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0:
raise ValueError(
@@ -414,9 +400,7 @@ def check_inputs(
)
if denoising_strength > 1 or denoising_strength < 0:
- raise ValueError(
- f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}"
- )
+            raise ValueError(f"denoising_strength should be set between 0 and 1, but received {denoising_strength}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -434,17 +418,19 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -464,12 +450,7 @@ def prepare_latents(
latents = latents * self.scheduler.init_noise_sigma
return latents
- def get_upscaled_width_and_height(self,
- width,
- height,
- hr_scale=2,
- hr_resize_width=0,
- hr_resize_height=0):
+ def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0):
if hr_resize_width == 0 and hr_resize_height == 0:
hr_upscale_to_width = int(width * hr_scale)
hr_upscale_to_height = int(height * hr_scale)
@@ -496,32 +477,32 @@ def get_upscaled_width_and_height(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=40,
- hires_ratio: Optional[float]=0.5,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- enable_hr: Optional[bool]=True,
- hr_scale: Optional[float]=2.0,
- hr_resize_width: Optional[int]=0,
- hr_resize_height: Optional[int]=0,
- denoising_strength: Optional[float]=0.7,
- latent_scale_mode: Optional[str]="nearest", ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 40,
+ hires_ratio: Optional[float] = 0.5,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ enable_hr: Optional[bool] = True,
+ hr_scale: Optional[float] = 2.0,
+ hr_resize_width: Optional[int] = 0,
+ hr_resize_height: Optional[int] = 0,
+ denoising_strength: Optional[float] = 0.7,
+ latent_scale_mode: Optional[str] = "nearest",
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -622,7 +603,8 @@ def __call__(
latent_scale_mode,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -644,7 +626,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
if enable_hr:
@@ -660,11 +643,9 @@ def __call__(
# 5. Prepare latent variables
if generator is None:
generator_state = paddle.get_cuda_rng_state()
- paddle.Generator().states_["initial_generator"] = copy.deepcopy(
- generator_state)
+ paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state)
else:
- paddle.Generator().states_["initial_generator"] = copy.deepcopy(
- paddle.Generator().states_[generator])
+ paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator])
num_channels_latents = self.unet.in_channels
latents = self.prepare_latents(
@@ -674,7 +655,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -684,32 +666,27 @@ def __call__(
with self.progress_bar(total=sample_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -719,85 +696,74 @@ def __call__(
# 8. determine the upscaled width and height for upscaled images
truncate_width = 0
truncate_height = 0
- (
- self.hr_upscale_to_width,
- self.hr_upscale_to_height,
- ) = self.get_upscaled_width_and_height(
+        (self.hr_upscale_to_width, self.hr_upscale_to_height) = self.get_upscaled_width_and_height(
width,
height,
hr_scale=hr_scale,
hr_resize_width=hr_resize_width,
- hr_resize_height=hr_resize_height, )
+ hr_resize_height=hr_resize_height,
+ )
if hr_resize_width != 0 and hr_resize_height != 0:
- truncate_width = (self.hr_upscale_to_width - hr_resize_width
- ) // self.vae_scale_factor
- truncate_height = (self.hr_upscale_to_height - hr_resize_height
- ) // self.vae_scale_factor
+ truncate_width = (self.hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor
+ truncate_height = (self.hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor
# 9. special case: do nothing if upscaling is not nesscessary
- if (self.hr_upscale_to_width == width and
- self.hr_upscale_to_height == height):
+ if self.hr_upscale_to_width == width and self.hr_upscale_to_height == height:
enable_hr = False
denoising_strength = None
if enable_hr:
# 10. prepare init latents
- timesteps, hr_steps = self.get_timesteps(hr_steps,
- denoising_strength)
+ timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength)
init_timestep = timesteps[:1].tile([latents.shape[0]])
latents = paddle.nn.functional.interpolate(
latents,
size=(
self.hr_upscale_to_height // self.vae_scale_factor,
- self.hr_upscale_to_width // self.vae_scale_factor, ),
- mode=latent_scale_mode, )
- latents = latents[:, :, truncate_height // 2:latents.shape[2] - (
- truncate_height + 1) // 2, truncate_width // 2:latents.shape[3]
- - (truncate_width + 1) // 2, ]
-
- noise = randn_tensor(
- latents.shape,
- dtype=latents.dtype,
- generator="initial_generator")
+ self.hr_upscale_to_width // self.vae_scale_factor,
+ ),
+ mode=latent_scale_mode,
+ )
+ latents = latents[
+ :,
+ :,
+ truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2,
+ truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2,
+ ]
+
+ noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator")
latents = self.scheduler.add_noise(latents, noise, init_timestep)
# 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
- extra_step_kwargs = self.prepare_extra_step_kwargs(
- "initial_generator", eta)
+ extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta)
# 12. denoising on hires.fix steps
num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order
with self.progress_bar(total=hr_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else
- latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -808,16 +774,13 @@ def __call__(
has_nsfw_concept = None
elif output_type == "pil":
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
image = self.numpy_to_pil(image)
else:
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
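
For orientation, the hires-fix pipeline reformatted above denoises once at the base resolution, resizes the latents with paddle.nn.functional.interpolate, re-noises them with scheduler.add_noise, and then runs a second denoising loop at the upscaled size. A minimal, self-contained sketch of just the latent upscaling step follows; the batch size, base resolution and scale factor are illustrative assumptions, not values read from the pipeline.

import paddle

# SD v1.x VAEs downsample by 8x, i.e. 2 ** (len(block_out_channels) - 1).
vae_scale_factor = 8
base_h = base_w = 512
hr_scale = 2.0

# Latents from the first (low-resolution) denoising pass: [batch, channels, h/8, w/8].
latents = paddle.randn([1, 4, base_h // vae_scale_factor, base_w // vae_scale_factor])

# Resize the latents to the hires target before re-noising and denoising again.
hires_latents = paddle.nn.functional.interpolate(
    latents,
    size=(int(base_h * hr_scale) // vae_scale_factor, int(base_w * hr_scale) // vae_scale_factor),
    mode="nearest",
)
print(hires_latents.shape)  # [1, 4, 128, 128]
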
diff --git a/ppdiffusers/examples/community/stable_diffusion_mega.py b/ppdiffusers/examples/community/stable_diffusion_mega.py
index ba2adb2a179ec..71ff024d88b08 100644
--- a/ppdiffusers/examples/community/stable_diffusion_mega.py
+++ b/ppdiffusers/examples/community/stable_diffusion_mega.py
@@ -21,30 +21,44 @@
import paddle
import PIL
import PIL.Image
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ppdiffusers import (
- AutoencoderKL, ControlNetModel, DDIMScheduler, DDPMScheduler,
- DEISMultistepScheduler, DiffusionPipeline, DPMSolverMultistepScheduler,
- DPMSolverSinglestepScheduler, EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler, HeunDiscreteScheduler,
- KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler,
- LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel,
- UniPCMultistepScheduler)
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DiffusionPipeline,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+ UniPCMultistepScheduler,
+)
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.image_processor import VaeImageProcessor
-from ppdiffusers.loaders import (FromCkptMixin, LoraLoaderMixin,
- TextualInversionLoaderMixin)
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
+from ppdiffusers.loaders import (
+ FromCkptMixin,
+ LoraLoaderMixin,
+ TextualInversionLoaderMixin,
+)
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from ppdiffusers.pipelines.stable_diffusion.pipeline_cycle_diffusion import (
- compute_noise, posterior_sample)
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+ compute_noise,
+ posterior_sample,
+)
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
-from ppdiffusers.utils import (PIL_INTERPOLATION, deprecate, logging,
- randn_tensor)
+from ppdiffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -66,7 +80,8 @@
[^\\()\[\]:]+|
:
""",
- re.X, )
+ re.X,
+)
def parse_prompt_attention(text):
@@ -185,32 +200,20 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
tokens.append(text_token)
weights.append(text_weight)
if truncated:
- logger.warning(
- "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples"
- )
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
return tokens, weights
-def pad_tokens_and_weights(tokens,
- weights,
- max_length,
- bos,
- eos,
- pad,
- no_boseos_middle=True,
- chunk_length=77):
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
r"""
Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
"""
max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
- weights_length = (max_length if no_boseos_middle else
- max_embeddings_multiples * chunk_length)
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
for i in range(len(tokens)):
- tokens[i] = ([bos] + tokens[i] + [eos] + [pad] *
- (max_length - 2 - len(tokens[i])))
+ tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i]))
if no_boseos_middle:
- weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 -
- len(weights[i]))
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
else:
w = []
if len(weights[i]) == 0:
@@ -218,8 +221,7 @@ def pad_tokens_and_weights(tokens,
else:
for j in range(max_embeddings_multiples):
w.append(1.0) # weight for starting token in this chunk
- w += weights[i][j * (chunk_length - 2):min(
- len(weights[i]), (j + 1) * (chunk_length - 2))]
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
w.append(1.0) # weight for ending token in this chunk
w += [1.0] * (weights_length - len(w))
weights[i] = w[:]
@@ -228,10 +230,11 @@ def pad_tokens_and_weights(tokens,
def get_unweighted_text_embeddings(
- pipe,
- text_input: paddle.Tensor,
- chunk_length: int,
- no_boseos_middle: Optional[bool]=True, ):
+ pipe,
+ text_input: paddle.Tensor,
+ chunk_length: int,
+ no_boseos_middle: Optional[bool] = True,
+):
"""
When the length of tokens is a multiple of the capacity of the text encoder,
it should be split into chunks and sent to the text encoder individually.
@@ -241,8 +244,7 @@ def get_unweighted_text_embeddings(
text_embeddings = []
for i in range(max_embeddings_multiples):
# extract the i-th chunk
- text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * (
- chunk_length - 2) + 2].clone()
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
# cover the head and the tail by the starting and the ending tokens
text_input_chunk[:, 0] = text_input[0, 0]
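
The chunk slicing above lets prompts longer than the CLIP window be encoded piecewise: each chunk reuses chunk_length - 2 content tokens and is re-wrapped with the BOS/EOS tokens before being sent to the text encoder. A small worked example of the arithmetic (plain Python, values chosen to match the 77-token default):

# Chunking arithmetic used by get_unweighted_text_embeddings (illustrative values).
model_max_length = 77  # CLIP text-encoder window, including BOS/EOS
max_embeddings_multiples = 3
max_length = (model_max_length - 2) * max_embeddings_multiples + 2  # 227
for i in range(max_embeddings_multiples):
    start = i * (model_max_length - 2)
    end = (i + 1) * (model_max_length - 2) + 2
    print(f"chunk {i}: tokens[{start}:{end}]")  # [0:77], [75:152], [150:227]
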
@@ -268,14 +270,15 @@ def get_unweighted_text_embeddings(
def get_weighted_text_embeddings(
- pipe,
- prompt: Union[str, List[str]],
- uncond_prompt: Optional[Union[str, List[str]]]=None,
- max_embeddings_multiples: Optional[int]=1,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- **kwargs, ):
+ pipe,
+ prompt: Union[str, List[str]],
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
+ max_embeddings_multiples: Optional[int] = 1,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ **kwargs,
+):
r"""
Prompts can be assigned with local weights using brackets. For example,
prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
@@ -299,24 +302,19 @@ def get_weighted_text_embeddings(
skip_weighting (`bool`, *optional*, defaults to `False`):
Skip the weighting. When the parsing is skipped, it is forced True.
"""
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
if isinstance(prompt, str):
prompt = [prompt]
if not skip_parsing:
- prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt,
- max_length - 2)
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
if uncond_prompt is not None:
if isinstance(uncond_prompt, str):
uncond_prompt = [uncond_prompt]
- uncond_tokens, uncond_weights = get_prompts_with_weights(
- pipe, uncond_prompt, max_length - 2)
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
else:
prompt_tokens = [
- token[1:-1]
- for token in pipe.tokenizer(
- prompt, max_length=max_length, truncation=True).input_ids
+ token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
]
prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
if uncond_prompt is not None:
@@ -324,33 +322,26 @@ def get_weighted_text_embeddings(
uncond_prompt = [uncond_prompt]
uncond_tokens = [
token[1:-1]
- for token in pipe.tokenizer(
- uncond_prompt, max_length=max_length, truncation=True)
- .input_ids
+ for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
]
uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
# round up the longest length of tokens to a multiple of (model_max_length - 2)
max_length = max([len(token) for token in prompt_tokens])
if uncond_prompt is not None:
- max_length = max(max_length,
- max([len(token) for token in uncond_tokens]))
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
max_embeddings_multiples = min(
max_embeddings_multiples,
- (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, )
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+ )
max_embeddings_multiples = max(1, max_embeddings_multiples)
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
# pad the length of tokens and weights
# support bert tokenizer
- bos = (pipe.tokenizer.bos_token_id
- if pipe.tokenizer.bos_token_id is not None else
- pipe.tokenizer.cls_token_id)
- eos = (pipe.tokenizer.eos_token_id
- if pipe.tokenizer.eos_token_id is not None else
- pipe.tokenizer.sep_token_id)
+ bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id
+ eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id
pad = pipe.tokenizer.pad_token_id
prompt_tokens, prompt_weights = pad_tokens_and_weights(
prompt_tokens,
@@ -360,7 +351,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64)
if uncond_prompt is not None:
uncond_tokens, uncond_weights = pad_tokens_and_weights(
@@ -371,7 +363,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64)
# get the embeddings
@@ -379,43 +372,35 @@ def get_weighted_text_embeddings(
pipe,
prompt_tokens,
pipe.tokenizer.model_max_length,
- no_boseos_middle=no_boseos_middle, )
- prompt_weights = paddle.to_tensor(
- prompt_weights, dtype=text_embeddings.dtype)
+ no_boseos_middle=no_boseos_middle,
+ )
+ prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype)
if uncond_prompt is not None:
uncond_embeddings = get_unweighted_text_embeddings(
pipe,
uncond_tokens,
pipe.tokenizer.model_max_length,
- no_boseos_middle=no_boseos_middle, )
- uncond_weights = paddle.to_tensor(
- uncond_weights, dtype=uncond_embeddings.dtype)
+ no_boseos_middle=no_boseos_middle,
+ )
+ uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype)
# assign weights to the prompts and normalize in the sense of mean
# TODO: should we normalize by chunk or in a whole (current implementation)?
if (not skip_parsing) and (not skip_weighting):
previous_mean = text_embeddings.mean(axis=[-2, -1])
text_embeddings *= prompt_weights.unsqueeze(-1)
- text_embeddings *= (
- (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1)
- .unsqueeze(-1))
+ text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
previous_mean = uncond_embeddings.mean(axis=[-2, -1])
uncond_embeddings *= uncond_weights.unsqueeze(-1)
- uncond_embeddings *= (
- (previous_mean / uncond_embeddings.mean(axis=[-2, -1]))
- .unsqueeze(-1).unsqueeze(-1))
+ uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
return text_embeddings, uncond_embeddings
return text_embeddings, None
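
The bracket syntax described in the docstring above only marks which spans of the prompt receive extra weight; get_weighted_text_embeddings then scales the corresponding token embeddings and renormalizes their mean, as shown in the hunk above. A simplified, hypothetical illustration of the parsing idea (simple_prompt_weights is not the pipeline's parse_prompt_attention, which also handles nesting, escapes and explicit numeric weights):

import re

def simple_prompt_weights(text, boost=1.1):
    # Split on single-level (...) groups and give bracketed spans the boost weight.
    pieces = []
    for chunk in re.split(r"(\([^()]*\))", text):
        if not chunk:
            continue
        if chunk.startswith("(") and chunk.endswith(")"):
            pieces.append([chunk[1:-1], boost])
        else:
            pieces.append([chunk, 1.0])
    return pieces

print(simple_prompt_weights("A (very beautiful) masterpiece"))
# [['A ', 1.0], ['very beautiful', 1.1], [' masterpiece', 1.0]]
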
-def prepare_mask_and_masked_image(image,
- mask,
- height=None,
- width=None,
- return_image: bool=False):
+def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -452,14 +437,11 @@ def prepare_mask_and_masked_image(image,
if isinstance(image, paddle.Tensor):
if not isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not"
- )
+            raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)}) is not")
# Batch single image
if image.ndim == 3:
- assert (image.shape[0] == 3
- ), "Image outside a batch should be of shape (3, H, W)"
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
image = image.unsqueeze(0)
# Batch and add channel dim for single mask
@@ -476,12 +458,9 @@ def prepare_mask_and_masked_image(image,
else:
mask = mask.unsqueeze(1)
- assert (image.ndim == 4 and
- mask.ndim == 4), "Image and Mask must have 4 dimensions"
- assert (image.shape[-2:] == mask.shape[-2:]
- ), "Image and Mask must have the same spatial dimensions"
- assert (image.shape[0] == mask.shape[0]
- ), "Image and Mask must have the same batch size"
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
# Check image is in [-1, 1]
if image.min() < -1 or image.max() > 1:
@@ -498,8 +477,7 @@ def prepare_mask_and_masked_image(image,
# Image as float32
image = image.cast(dtype=paddle.float32)
elif isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
+        raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)}) is not")
else:
# preprocess image
if isinstance(image, (PIL.Image.Image, np.ndarray)):
@@ -510,13 +488,8 @@ def prepare_mask_and_masked_image(image,
w, h = image[0].size
else:
w, h = width, height
- w, h = (x - x % 8
- for x in (w, h)) # resize to integer multiple of 8
- image = [
- i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"])
- for i in image
- ]
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image]
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
@@ -535,14 +508,9 @@ def prepare_mask_and_masked_image(image,
w, h = mask[0].size
else:
w, h = width, height
- w, h = (x - x % 8
- for x in (w, h)) # resize to integer multiple of 8
- mask = [
- i.resize(
- (w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask
- ]
- mask = np.concatenate(
- [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask]
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
@@ -563,55 +531,45 @@ def prepare_mask_and_masked_image(image,
class CommonMixIn:
@property
def components(self) -> Dict[str, Any]:
- return {
- k: getattr(self, k)
- for k in self.config.keys() if not k.startswith("_")
- }
+ return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
def change_scheduler(self, scheduler_type="ddim"):
scheduler_type = scheduler_type.lower()
if scheduler_type == "pndm":
- scheduler = PNDMScheduler.from_config(
- self.orginal_scheduler_config, skip_prk_steps=True)
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
elif scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "heun":
- scheduler = HeunDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "euler":
- scheduler = EulerDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "dpm-multi":
- scheduler = DPMSolverMultistepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "dpm-single":
- scheduler = DPMSolverSinglestepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "kdpm2-ancestral":
- scheduler = KDPM2AncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "kdpm2":
- scheduler = KDPM2DiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "unipc-multi":
- scheduler = UniPCMultistepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "ddim":
scheduler = DDIMScheduler.from_config(
self.orginal_scheduler_config,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
elif scheduler_type == "ddpm":
- scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config,
- )
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
elif scheduler_type == "deis-multi":
scheduler = DEISMultistepScheduler.from_config(
- self.orginal_scheduler_config, )
+ self.orginal_scheduler_config,
+ )
else:
raise ValueError(
f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!"
@@ -623,11 +581,10 @@ def get_timesteps(self, num_inference_steps, strength=1.0):
return self.scheduler.timesteps, num_inference_steps
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
num_inference_steps = num_inference_steps - t_start
# check that number of inference steps is not < 1 - as this doesn't make sense
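
get_timesteps in CommonMixIn applies the usual strength-based truncation shown in the hunk above: only the tail of the schedule is actually denoised when strength < 1. A worked example of the arithmetic (plain Python, values chosen for illustration):

num_inference_steps = 50
strength = 0.7
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 35
t_start = max(num_inference_steps - init_timestep, 0)  # 15
# With a first-order scheduler (scheduler.order == 1) the loop runs timesteps[15:],
# i.e. 35 of the 50 scheduled steps.
print(init_timestep, t_start)
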
@@ -640,26 +597,26 @@ def get_timesteps(self, num_inference_steps, strength=1.0):
return timesteps, num_inference_steps
def prepare_controlnet_cond(
- self,
- controlnet_cond,
- controlnet_conditioning_scale,
- width,
- height,
- batch_size,
- num_images_per_prompt,
- dtype,
- do_classifier_free_guidance=False,
- guess_mode=False, ):
+ self,
+ controlnet_cond,
+ controlnet_conditioning_scale,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
control_image = self.control_image_processor.preprocess(
controlnet_cond,
height=height,
- width=width, )
+ width=width,
+ )
if isinstance(controlnet_conditioning_scale, (float, int)):
- controlnet_conditioning_scale = paddle.to_tensor(
- [controlnet_conditioning_scale] * 13, dtype=dtype)
+ controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=dtype)
elif isinstance(controlnet_conditioning_scale, (list, tuple)):
- controlnet_conditioning_scale = paddle.to_tensor(
- controlnet_conditioning_scale, dtype=dtype)
+ controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=dtype)
else:
raise ValueError(
f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}"
@@ -678,40 +635,40 @@ def prepare_controlnet_cond(
return control_image, controlnet_conditioning_scale
def check_inputs(
- self,
- prompt,
- height=512,
- width=512,
- callback_steps=1,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None,
- strength=1.0, ):
+ self,
+ prompt,
+ height=512,
+ width=512,
+ callback_steps=1,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ strength=1.0,
+ ):
if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
raise ValueError(
f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
)
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -724,24 +681,25 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
def prepare_latents(
- self,
- batch_size,
- height,
- width,
- generator,
- dtype=None,
- latents=None,
- image=None,
- timestep=None,
- is_strength_max=True,
- return_noise=False,
- return_image_latents=False, ):
+ self,
+ batch_size,
+ height,
+ width,
+ generator,
+ dtype=None,
+ latents=None,
+ image=None,
+ timestep=None,
+ is_strength_max=True,
+ return_noise=False,
+ return_image_latents=False,
+ ):
shape = [
batch_size,
self.vae.config.latent_channels,
@@ -762,53 +720,50 @@ def prepare_latents(
if return_image_latents or (latents is None and not is_strength_max):
image = image.cast(dtype=dtype)
- image_latents = self._encode_vae_image(
- image, batch_size=batch_size, generator=generator)
+ image_latents = self._encode_vae_image(image, batch_size=batch_size, generator=generator)
if latents is None:
noise = randn_tensor(shape, generator=generator, dtype=dtype)
# if strength is 1. then initialise the latents to noise, else initial to image + noise
- latents = (noise if is_strength_max else
- self.scheduler.add_noise(image_latents, noise, timestep))
+ latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
# if pure noise then scale the initial latents by the Scheduler's init sigma
- latents = (latents * self.scheduler.init_noise_sigma
- if is_strength_max else latents)
+ latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
else:
noise = latents
if str(noise.dtype).replace("paddle.", "") != dtype:
noise = noise.cast(dtype)
latents = noise * self.scheduler.init_noise_sigma
- outputs = (latents, )
+ outputs = (latents,)
if return_noise:
- outputs += (noise, )
+ outputs += (noise,)
if return_image_latents:
- outputs += (image_latents, )
+ outputs += (image_latents,)
if len(outputs) == 1:
outputs = latents
return outputs
def prepare_mask_latents(
- self,
- mask,
- masked_image,
- batch_size,
- height,
- width,
- generator,
- dtype,
- do_classifier_free_guidance=False,
- return_masked_image_latents=True, ):
+ self,
+ mask,
+ masked_image,
+ batch_size,
+ height,
+ width,
+ generator,
+ dtype,
+ do_classifier_free_guidance=False,
+ return_masked_image_latents=True,
+ ):
# resize the mask to latents shape as we concatenate the mask to the latents
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
# and half precision
mask = paddle.nn.functional.interpolate(
- mask,
- size=(height // self.vae_scale_factor,
- width // self.vae_scale_factor))
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+ )
mask = mask.cast(dtype=dtype)
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
@@ -821,14 +776,12 @@ def prepare_mask_latents(
)
mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1])
- mask = paddle.concat([mask] *
- 2) if do_classifier_free_guidance else mask
+ mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask
if not return_masked_image_latents:
return mask
masked_image = masked_image.cast(dtype=dtype)
- masked_image_latents = self._encode_vae_image(
- masked_image, batch_size=batch_size, generator=generator)
+ masked_image_latents = self._encode_vae_image(masked_image, batch_size=batch_size, generator=generator)
if masked_image_latents.shape[0] < batch_size:
if not batch_size % masked_image_latents.shape[0] == 0:
raise ValueError(
@@ -836,31 +789,24 @@ def prepare_mask_latents(
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
- masked_image_latents = masked_image_latents.tile(
- [batch_size // masked_image_latents.shape[0], 1, 1, 1])
+ masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1])
- masked_image_latents = (paddle.concat([masked_image_latents] * 2)
- if do_classifier_free_guidance else
- masked_image_latents)
+ masked_image_latents = (
+ paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
# aligning device to prevent device errors when concating it with the latent model input
masked_image_latents = masked_image_latents.cast(dtype=dtype)
return mask, masked_image_latents
def is_scheduler_support_step_index(self):
- kwargs_keys = set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys())
return "kwargs" in kwargs_keys or "step_index" in kwargs_keys
- def _encode_vae_image(self,
- image: paddle.Tensor,
- batch_size=1,
- generator=None,
- **kwargs):
+ def _encode_vae_image(self, image: paddle.Tensor, batch_size=1, generator=None, **kwargs):
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
@@ -868,21 +814,24 @@ def _encode_vae_image(self,
return self.vae.config.scaling_factor * init_latents
def _decode_vae_latents(self, latents: paddle.Tensor, **kwargs):
- images_vae = self.vae.decode(latents, )[0]
+ images_vae = self.vae.decode(
+ latents,
+ )[0]
return images_vae
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- lora_scale: Optional[float]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- **kwargs, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ lora_scale: Optional[float] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ **kwargs,
+ ):
if parse_prompt_type == "lpw":
return self._encode_prompt_lpw(
prompt,
@@ -893,7 +842,8 @@ def _encode_prompt(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- **kwargs, )
+ **kwargs,
+ )
elif parse_prompt_type == "raw":
return self._encode_prompt_raw(
prompt,
@@ -902,22 +852,23 @@ def _encode_prompt(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- lora_scale=lora_scale, )
+ lora_scale=lora_scale,
+ )
elif parse_prompt_type == "webui":
- raise NotImplementedError(
- "`parse_prompt_type=webui` is not implemented yet.")
+ raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.")
def _encode_prompt_lpw(
- self,
- prompt: Union[str, List[str]],
- num_images_per_prompt: int,
- do_classifier_free_guidance: bool,
- negative_prompt: Union[str, List[str]],
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- lora_scale: Optional[float]=None,
- max_embeddings_multiples: Optional[int]=3,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ num_images_per_prompt: int,
+ do_classifier_free_guidance: bool,
+ negative_prompt: Union[str, List[str]],
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ lora_scale: Optional[float] = None,
+ max_embeddings_multiples: Optional[int] = 3,
+ **kwargs,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -953,66 +904,63 @@ def _encode_prompt_lpw(
if do_classifier_free_guidance:
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif prompt is not None and type(prompt) is not type(
- negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: procecss multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings(
pipe=self,
prompt=prompt,
uncond_prompt=uncond_tokens,
max_embeddings_multiples=max_embeddings_multiples,
- **kwargs, )
+ **kwargs,
+ )
prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- dtype=self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def _encode_prompt_raw(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- lora_scale: Optional[float]=None,
- **kwargs, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ lora_scale: Optional[float] = None,
+ **kwargs,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -1059,32 +1007,36 @@ def _encode_prompt_raw(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- config = (self.text_encoder.config
- if isinstance(self.text_encoder.config, dict) else
- self.text_encoder.config.to_dict())
- if (config.get("use_attention_mask", None) is not None and
- config["use_attention_mask"]):
+ config = (
+ self.text_encoder.config
+ if isinstance(self.text_encoder.config, dict)
+ else self.text_encoder.config.to_dict()
+ )
+ if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
@@ -1092,33 +1044,32 @@ def _encode_prompt_raw(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif prompt is not None and type(prompt) is not type(
- negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -1126,39 +1077,38 @@ def _encode_prompt_raw(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- config = (self.text_encoder.config
- if isinstance(self.text_encoder.config, dict) else
- self.text_encoder.config.to_dict())
- if (config.get("use_attention_mask", None) is not None and
- config["use_attention_mask"]):
+ config = (
+ self.text_encoder.config
+ if isinstance(self.text_encoder.config, dict)
+ else self.text_encoder.config.to_dict()
+ )
+ if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- dtype=self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
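The comments above describe the classifier-free guidance batching trick: the unconditional and conditional embeddings are concatenated so a single UNet forward pass covers both, and the prediction is later split back apart. A minimal sketch with made-up tensor shapes (the shapes and values are illustrative only, not taken from the pipeline):

```python
import paddle

# Illustrative shapes: 2 prompts, 77 tokens, 768-dim text embeddings.
negative_prompt_embeds = paddle.zeros([2, 77, 768])
prompt_embeds = paddle.ones([2, 77, 768])

# One batched input: [unconditional, conditional] along the batch axis,
# so a single UNet forward pass serves both halves.
batched_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
assert batched_embeds.shape == [4, 77, 768]

# After the UNet, the prediction is split back and recombined with the scale,
# exactly as the denoising loops below do with `noise_pred_unet.chunk(2)`.
noise_pred_unet = paddle.randn([4, 4, 64, 64])  # stand-in for the UNet output
noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
guidance_scale = 7.5
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```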
@@ -1167,16 +1117,13 @@ def run_safety_checker(self, image, dtype):
has_nsfw_concept = None
else:
if paddle.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(
- image, output_type="pil")
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
- feature_extractor_input = self.image_processor.numpy_to_pil(
- image)
- safety_checker_input = self.feature_extractor(
- feature_extractor_input, return_tensors="pd")
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
return image, has_nsfw_concept
def prepare_extra_step_kwargs(self, generator, eta):
@@ -1185,26 +1132,25 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
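`prepare_extra_step_kwargs` uses `inspect.signature` to feed `eta` and `generator` only to schedulers whose `step()` actually accepts them. A minimal, self-contained sketch of that feature-detection pattern, using a hypothetical scheduler rather than a real ppdiffusers class:

```python
import inspect

class DummyScheduler:
    # Hypothetical scheduler whose step() accepts `eta` but not `generator`.
    def step(self, model_output, timestep, sample, eta=0.0, return_dict=True):
        return sample

scheduler = DummyScheduler()
accepted = set(inspect.signature(scheduler.step).parameters.keys())

extra_step_kwargs = {}
if "eta" in accepted:
    extra_step_kwargs["eta"] = 0.0
if "generator" in accepted:
    extra_step_kwargs["generator"] = None

print(extra_step_kwargs)  # {'eta': 0.0} -- `generator` is silently dropped
```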
class StableDiffusionMegaPipeline(
- DiffusionPipeline,
- CommonMixIn,
- FromCkptMixin,
- LoraLoaderMixin,
- TextualInversionLoaderMixin, ):
+ DiffusionPipeline,
+ CommonMixIn,
+ FromCkptMixin,
+ LoraLoaderMixin,
+ TextualInversionLoaderMixin,
+):
r"""
All-in-one ("mega") pipeline using Stable Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
@@ -1239,37 +1185,33 @@ def __call__(self, *args, **kwargs):
return self.text2img(*args, **kwargs)
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- controlnet: ControlNetModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: ControlNetModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -1277,11 +1219,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -1310,15 +1248,16 @@ def __init__(
controlnet=controlnet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(
- vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.control_image_processor = VaeImageProcessor(
vae_scale_factor=self.vae_scale_factor,
do_convert_rgb=True,
- do_normalize=False, )
+ do_normalize=False,
+ )
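The `vae_scale_factor` computed just above determines how pixel dimensions map to latent dimensions. A small arithmetic sketch, assuming the common Stable Diffusion v1.5 VAE configuration (four `block_out_channels` entries; the concrete values are an assumption, not read from this diff):

```python
# Scale-factor arithmetic, assuming a typical SD v1.5 VAE config.
block_out_channels = (320, 640, 1280, 1280)  # assumed config values
vae_scale_factor = 2 ** (len(block_out_channels) - 1)
assert vae_scale_factor == 8

# A 512x512 RGB image therefore maps to a 64x64 latent grid, which is why
# height/width must be divisible by the scale factor in the input checks.
height, width = 512, 512
latent_height, latent_width = height // vae_scale_factor, width // vae_scale_factor
assert (latent_height, latent_width) == (64, 64)
```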
self.supported_scheduler = [
"pndm",
"lms",
@@ -1340,19 +1279,20 @@ def __init__(
@paddle.no_grad()
def do_unet(
- self,
- do_controlnet,
- latents,
- latent_model_input,
- t,
- i,
- prompt_embeds,
- control_image,
- control_conditioning_scale,
- cross_attention_kwargs,
- guess_mode,
- do_classifier_free_guidance,
- is_scheduler_support_step_index=False, ):
+ self,
+ do_controlnet,
+ latents,
+ latent_model_input,
+ t,
+ i,
+ prompt_embeds,
+ control_image,
+ control_conditioning_scale,
+ cross_attention_kwargs,
+ guess_mode,
+ do_classifier_free_guidance,
+ is_scheduler_support_step_index=False,
+ ):
if not do_controlnet:
# predict the noise residual
noise_pred_unet = self.unet(
@@ -1360,18 +1300,17 @@ def do_unet(
timestep=t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
else:
# controlnet inference
if guess_mode and do_classifier_free_guidance:
# Infer ControlNet only for the conditional batch.
control_model_input = latents
if is_scheduler_support_step_index:
- control_model_input = self.scheduler.scale_model_input(
- control_model_input, t, step_index=i)
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t, step_index=i)
else:
- control_model_input = self.scheduler.scale_model_input(
- control_model_input, t)
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
else:
control_model_input = latent_model_input
@@ -1384,20 +1323,15 @@ def do_unet(
controlnet_cond=control_image,
conditioning_scale=control_conditioning_scale,
guess_mode=guess_mode,
- return_dict=False, )
+ return_dict=False,
+ )
if guess_mode and do_classifier_free_guidance:
# Inferred ControlNet only for the conditional batch.
# To apply the output of ControlNet to both the unconditional and conditional batches,
# add 0 to the unconditional batch to keep it unchanged.
- down_block_res_samples = [
- paddle.concat([paddle.zeros_like(d), d])
- for d in down_block_res_samples
- ]
- mid_block_res_sample = paddle.concat([
- paddle.zeros_like(mid_block_res_sample),
- mid_block_res_sample
- ])
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = paddle.concat([paddle.zeros_like(mid_block_res_sample), mid_block_res_sample])
# predict the noise residual
noise_pred_unet = self.unet(
@@ -1407,35 +1341,36 @@ def do_unet(
cross_attention_kwargs=cross_attention_kwargs,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
return noise_pred_unet
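In guess mode the ControlNet runs only on the conditional half of the batch, so its residuals are front-padded with zeros before being added to the full (unconditional + conditional) UNet batch, leaving the unconditional half untouched. A toy sketch with an illustrative residual shape:

```python
import paddle

# Toy illustration of the guess-mode padding above.
cond_residual = paddle.ones([1, 320, 64, 64])  # residual for the conditional batch only
padded = paddle.concat([paddle.zeros_like(cond_residual), cond_residual])
assert padded.shape == [2, 320, 64, 64]
# Row 0 (unconditional) receives zeros and is left unchanged by the addition;
# row 1 (conditional) receives the actual ControlNet residual.
```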
@paddle.no_grad()
def text2img(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -1535,7 +1470,8 @@ def text2img(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -1551,12 +1487,13 @@ def text2img(
do_classifier_free_guidance = guidance_scale > 1.0
guess_mode = guess_mode or (
- self.controlnet.config.global_pool_conditions
- if self.controlnet is not None else False)
+ self.controlnet.config.global_pool_conditions if self.controlnet is not None else False
+ )
# 3. Encode input prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
@@ -1566,7 +1503,8 @@ def text2img(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
dtype = prompt_embeds.dtype
# do_controlnet
@@ -1583,7 +1521,8 @@ def text2img(
dtype=dtype,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
else:
control_image = None
control_conditioning_scale = None
@@ -1598,27 +1537,24 @@ def text2img(
width,
generator=generator,
dtype=dtype,
- latents=latents, )
+ latents=latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
noise_pred_unet = self.do_unet(
do_controlnet,
@@ -1637,10 +1573,8 @@ def text2img(
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
@@ -1651,22 +1585,19 @@ def text2img(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
- image = self._decode_vae_latents(latents /
- self.vae.config.scaling_factor)
+ image = self._decode_vae_latents(latents / self.vae.config.scaling_factor)
image, has_nsfw_concept = self.run_safety_checker(image, dtype)
else:
image = latents
@@ -1677,43 +1608,41 @@ def text2img(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@paddle.no_grad()
def img2img(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -1828,10 +1757,10 @@ def img2img(
controlnet_conditioning_scale=controlnet_conditioning_scale,
guess_mode=guess_mode,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
# 0. Preprocess image
- init_image = self.image_processor.preprocess(
- image, height=height, width=width)
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
height, width = init_image.shape[-2:]
# 1. Check inputs. Raise error if not correct
@@ -1843,7 +1772,8 @@ def img2img(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -1857,12 +1787,13 @@ def img2img(
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
guess_mode = guess_mode or (
- self.controlnet.config.global_pool_conditions
- if self.controlnet is not None else False)
+ self.controlnet.config.global_pool_conditions if self.controlnet is not None else False
+ )
# 3. Encode input prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
@@ -1872,7 +1803,8 @@ def img2img(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
dtype = prompt_embeds.dtype
# do_controlnet
@@ -1889,19 +1821,18 @@ def img2img(
dtype=dtype,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
else:
control_image = None
control_conditioning_scale = None
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 5. Prepare latent variables
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
latents = self.prepare_latents(
@@ -1913,21 +1844,19 @@ def img2img(
latents=latents,
image=init_image,
timestep=latent_timestep,
- is_strength_max=is_strength_max, )
+ is_strength_max=is_strength_max,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
noise_pred_unet = self.do_unet(
do_controlnet,
@@ -1940,35 +1869,26 @@ def img2img(
control_conditioning_scale,
cross_attention_kwargs,
guess_mode,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- **extra_step_kwargs,
- return_dict=False)[0]
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
latents = latents.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
- image = self._decode_vae_latents(latents /
- self.vae.config.scaling_factor)
+ image = self._decode_vae_latents(latents / self.vae.config.scaling_factor)
image, has_nsfw_concept = self.run_safety_checker(image, dtype)
else:
image = latents
@@ -1979,45 +1899,43 @@ def img2img(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
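The `strength` argument controls how far into the noise schedule img2img starts: the comment above notes the initial noise timestep sits at roughly `strength` of the way through. A sketch of the usual strength-to-timestep arithmetic (the `get_timesteps` helper itself is not shown in this diff, so this mirrors the standard diffusers-style formula rather than quoting it):

```python
# Assumed diffusers-style strength handling: only the tail of the schedule runs,
# so the init image is noised to an intermediate timestep, not pure noise.
num_inference_steps = 50
strength = 0.7

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)

assert init_timestep == 35 and t_start == 15
# i.e. the loop runs the last 35 of 50 steps; strength == 1.0 starts from pure
# noise (is_strength_max above), strength == 0.5 from the halfway point.
```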
@paddle.no_grad()
def inpaint_legacy(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: int=None,
- width: int=None,
- strength: float=1.0,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- add_predicted_noise: Optional[bool]=False,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: int = None,
+ width: int = None,
+ strength: float = 1.0,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -2122,7 +2040,8 @@ def inpaint_legacy(
mask_image,
height,
width,
- return_image=True, )
+ return_image=True,
+ )
height, width = init_image.shape[-2:]
# 1. Check inputs
@@ -2134,7 +2053,8 @@ def inpaint_legacy(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -2149,12 +2069,13 @@ def inpaint_legacy(
do_classifier_free_guidance = guidance_scale > 1.0
guess_mode = guess_mode or (
- self.controlnet.config.global_pool_conditions
- if self.controlnet is not None else False)
+ self.controlnet.config.global_pool_conditions if self.controlnet is not None else False
+ )
# 3. Encode input prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
@@ -2164,7 +2085,8 @@ def inpaint_legacy(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
dtype = prompt_embeds.dtype
# do_controlnet
@@ -2181,18 +2103,17 @@ def inpaint_legacy(
dtype=dtype,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
else:
control_image = None
control_conditioning_scale = None
# 4. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
@@ -2208,7 +2129,8 @@ def inpaint_legacy(
timestep=latent_timestep,
is_strength_max=is_strength_max,
return_noise=True,
- return_image_latents=True, )
+ return_image_latents=True,
+ )
# 6. Prepare mask latent variables
mask = self.prepare_mask_latents(
@@ -2220,26 +2142,24 @@ def inpaint_legacy(
dtype=dtype,
generator=generator,
do_classifier_free_guidance=do_classifier_free_guidance,
- return_masked_image_latents=False, )
+ return_masked_image_latents=False,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
if do_classifier_free_guidance:
- init_mask = mask[:mask.shape[0] // 2]
+ init_mask = mask[: mask.shape[0] // 2]
else:
init_mask = mask
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
noise_pred_unet = self.do_unet(
do_controlnet,
@@ -2252,51 +2172,39 @@ def inpaint_legacy(
control_conditioning_scale,
cross_attention_kwargs,
guess_mode,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- **extra_step_kwargs,
- return_dict=False)[0]
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
if i < len(timesteps) - 1:
# masking
if add_predicted_noise:
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise_pred_uncond, t)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t)
else:
# https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc
noise_timestep = timesteps[i + 1]
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise, noise_timestep)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep)
else:
init_latents_proper = image_latents
- latents = (1 - init_mask
- ) * init_latents_proper + init_mask * latents
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
latents = latents.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
- image = self._decode_vae_latents(latents /
- self.vae.config.scaling_factor)
+ image = self._decode_vae_latents(latents / self.vae.config.scaling_factor)
image, has_nsfw_concept = self.run_safety_checker(image, dtype)
else:
image = latents
@@ -2307,45 +2215,43 @@ def inpaint_legacy(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
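The legacy-inpaint loop above blends latents after every scheduler step: outside the mask it keeps the re-noised original image latents, inside the mask it keeps the freshly denoised latents. A tiny numeric sketch of that blend with hypothetical 1x2 values:

```python
import paddle

# Numeric sketch of `(1 - init_mask) * init_latents_proper + init_mask * latents`.
init_mask = paddle.to_tensor([[0.0, 1.0]])               # hypothetical mask: repaint the 2nd element
init_latents_proper = paddle.to_tensor([[10.0, 10.0]])   # re-noised original image latents
latents = paddle.to_tensor([[3.0, 3.0]])                 # current denoised latents

blended = (1 - init_mask) * init_latents_proper + init_mask * latents
print(blended.numpy())  # [[10.  3.]] -> unmasked value kept, masked value repainted
```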
@paddle.no_grad()
def inpaint(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: int=None,
- width: int=None,
- strength: float=1.0,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- add_predicted_noise: Optional[bool]=False,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: int = None,
+ width: int = None,
+ strength: float = 1.0,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -2452,7 +2358,8 @@ def inpaint(
mask_image,
height,
width,
- return_image=True, )
+ return_image=True,
+ )
height, width = init_image.shape[-2:]
# 1. Check inputs
@@ -2464,7 +2371,8 @@ def inpaint(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -2480,12 +2388,13 @@ def inpaint(
do_classifier_free_guidance = guidance_scale > 1.0
guess_mode = guess_mode or (
- self.controlnet.config.global_pool_conditions
- if self.controlnet is not None else False)
+ self.controlnet.config.global_pool_conditions if self.controlnet is not None else False
+ )
# 3. Encode input prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
@@ -2495,16 +2404,15 @@ def inpaint(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
dtype = prompt_embeds.dtype
# 4. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
@@ -2524,7 +2432,8 @@ def inpaint(
timestep=latent_timestep,
is_strength_max=is_strength_max,
return_noise=True,
- return_image_latents=return_image_latents, )
+ return_image_latents=return_image_latents,
+ )
if return_image_latents:
latents, noise, image_latents = latents_outputs
@@ -2541,29 +2450,27 @@ def inpaint(
dtype=dtype,
generator=generator,
do_classifier_free_guidance=do_classifier_free_guidance,
- return_masked_image_latents=True, )
+ return_masked_image_latents=True,
+ )
# 7. Check that sizes of mask, masked image and latents match
if num_channels_unet == 9:
# default case for runwayml/stable-diffusion-inpainting
num_channels_mask = mask.shape[1]
num_channels_masked_image = masked_image_latents.shape[1]
- if (num_channels_latents + num_channels_mask +
- num_channels_masked_image != self.unet.config.in_channels):
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
raise ValueError(
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
- " `pipeline.unet` or your `mask_image` or `image` input.")
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
elif num_channels_unet != 4:
- raise ValueError(
- f"The unet should have either 4 or 9 input channels, not {num_channels_unet}."
- )
+ raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.")
# do_controlnet
- do_controlnet = (controlnet_cond is not None and
- self.controlnet is not None and is_legacy)
+ do_controlnet = controlnet_cond is not None and self.controlnet is not None and is_legacy
if not do_controlnet:
guess_mode = False
if do_controlnet:
@@ -2576,7 +2483,8 @@ def inpaint(
num_images_per_prompt=num_images_per_prompt,
dtype=dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
else:
control_image = None
control_conditioning_scale = None
@@ -2584,26 +2492,21 @@ def inpaint(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
if do_classifier_free_guidance:
- init_mask = mask[:mask.shape[0] // 2]
+ init_mask = mask[: mask.shape[0] // 2]
else:
init_mask = mask
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
if not is_legacy:
# concat latents, mask, masked_image_latents in the channel dimension
- latent_model_input = paddle.concat(
- [latent_model_input, mask, masked_image_latents],
- axis=1)
+ latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1)
noise_pred_unet = self.do_unet(
do_controlnet,
@@ -2616,51 +2519,39 @@ def inpaint(
control_conditioning_scale,
cross_attention_kwargs,
guess_mode,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- **extra_step_kwargs,
- return_dict=False)[0]
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
if is_legacy:
if i < len(timesteps) - 1:
# masking
if add_predicted_noise:
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise_pred_uncond, t)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t)
else:
# https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc
noise_timestep = timesteps[i + 1]
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise, noise_timestep)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep)
else:
init_latents_proper = image_latents
- latents = (1 - init_mask
- ) * init_latents_proper + init_mask * latents
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
latents = latents.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
- image = self._decode_vae_latents(latents /
- self.vae.config.scaling_factor)
+ image = self._decode_vae_latents(latents / self.vae.config.scaling_factor)
image, has_nsfw_concept = self.run_safety_checker(image, dtype)
else:
image = latents
@@ -2671,57 +2562,54 @@ def inpaint(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
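The channel check in the non-legacy branch above simply verifies that latents, mask, and masked-image latents add up to the UNet's expected input channels. A quick bookkeeping sketch, assuming the standard 9-channel inpainting UNet mentioned in the comment (runwayml/stable-diffusion-inpainting):

```python
# Channel bookkeeping behind the `num_channels_unet == 9` branch above.
num_channels_latents = 4        # VAE latent channels
num_channels_mask = 1           # downsampled binary mask
num_channels_masked_image = 4   # latents of the masked input image
unet_in_channels = 9            # assumed inpainting-UNet input channels

assert num_channels_latents + num_channels_mask + num_channels_masked_image == unet_in_channels
# A plain 4-channel UNet instead takes the legacy path, where masking is done by
# blending latents after each scheduler step rather than via extra input channels.
```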
def check_inputs_hires_fix(
- self,
- prompt,
- height,
- width,
- callback_steps,
- hr_scale,
- hr_resize_height,
- hr_resize_width,
- denoising_strength,
- latent_scale_mode,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ hr_scale,
+ hr_resize_height,
+ hr_resize_width,
+ denoising_strength,
+ latent_scale_mode,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
raise ValueError(
f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
)
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if hr_scale < 0:
- raise ValueError(
- "hr_scale shoule be greater that 0, but acceived {hr_scale}")
+ raise ValueError("hr_scale shoule be greater that 0, but acceived {hr_scale}")
if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0:
raise ValueError(
@@ -2729,9 +2617,7 @@ def check_inputs_hires_fix(
)
if denoising_strength > 1 or denoising_strength < 0:
- raise ValueError(
- f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}"
- )
+ raise ValueError(f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -2749,14 +2635,10 @@ def check_inputs_hires_fix(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
-
- def get_upscaled_width_and_height(self,
- width,
- height,
- hr_scale=2,
- hr_resize_width=0,
- hr_resize_height=0):
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0):
if hr_resize_width == 0 and hr_resize_height == 0:
hr_upscale_to_width = int(width * hr_scale)
hr_upscale_to_height = int(height * hr_scale)
@@ -2783,42 +2665,42 @@ def get_upscaled_width_and_height(self,
def get_hires_fix_timesteps(self, denoising_steps, denoising_strength):
steps = int(denoising_steps / min(denoising_strength, 0.999))
self.scheduler.set_timesteps(steps)
- timesteps = self.scheduler.timesteps[steps - denoising_steps:]
+ timesteps = self.scheduler.timesteps[steps - denoising_steps :]
return timesteps, denoising_steps
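`get_hires_fix_timesteps` sizes the schedule so that only its low-noise tail is run during the second, upscaled pass. A worked example of the arithmetic shown above, with illustrative values:

```python
# Worked example of the hires-fix timestep arithmetic.
denoising_steps = 20
denoising_strength = 0.7

steps = int(denoising_steps / min(denoising_strength, 0.999))
assert steps == 28          # schedule length handed to scheduler.set_timesteps
skipped = steps - denoising_steps
assert skipped == 8         # the first 8 (noisiest) timesteps are skipped
# so `timesteps[steps - denoising_steps:]` keeps the last 20 entries of a 28-step schedule.
```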
@paddle.no_grad()
def hires_fix(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=40,
- hires_ratio: Optional[float]=0.5,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- enable_hr: Optional[bool]=True,
- hr_scale: Optional[float]=2.0,
- hr_resize_width: Optional[int]=0,
- hr_resize_height: Optional[int]=0,
- denoising_strength: Optional[float]=0.7,
- latent_scale_mode: Optional[str]="nearest",
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 40,
+ hires_ratio: Optional[float] = 0.5,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ enable_hr: Optional[bool] = True,
+ hr_scale: Optional[float] = 2.0,
+ hr_resize_width: Optional[int] = 0,
+ hr_resize_height: Optional[int] = 0,
+ denoising_strength: Optional[float] = 0.7,
+ latent_scale_mode: Optional[str] = "nearest",
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -2942,7 +2824,8 @@ def hires_fix(
latent_scale_mode,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -2958,12 +2841,13 @@ def hires_fix(
do_classifier_free_guidance = guidance_scale > 1.0
guess_mode = guess_mode or (
- self.controlnet.config.global_pool_conditions
- if self.controlnet is not None else False)
+ self.controlnet.config.global_pool_conditions if self.controlnet is not None else False
+ )
# 3. Encode input prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
@@ -2973,7 +2857,8 @@ def hires_fix(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
dtype = prompt_embeds.dtype
# do_controlnet
@@ -2990,7 +2875,8 @@ def hires_fix(
dtype=dtype,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
else:
control_image = None
control_conditioning_scale = None
@@ -3009,11 +2895,9 @@ def hires_fix(
# 5. Prepare latent variables
if generator is None:
generator_state = paddle.get_cuda_rng_state()
- paddle.Generator().states_["initial_generator"] = copy.deepcopy(
- generator_state)
+ paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state)
else:
- paddle.Generator().states_["initial_generator"] = copy.deepcopy(
- paddle.Generator().states_[generator])
+ paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator])
latents = self.prepare_latents(
batch_size * num_images_per_prompt,
@@ -3021,7 +2905,8 @@ def hires_fix(
width,
generator=generator,
dtype=dtype,
- latents=latents, )
+ latents=latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -3032,10 +2917,8 @@ def hires_fix(
with self.progress_bar(total=sample_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
noise_pred_unet = self.do_unet(
do_controlnet,
@@ -3048,25 +2931,21 @@ def hires_fix(
control_conditioning_scale,
cross_attention_kwargs,
guess_mode,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
- scheduler_output = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
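
The guidance branch in this loop is standard classifier-free guidance: the final prediction extrapolates from the unconditional output toward the text-conditioned one by guidance_scale. A self-contained sketch with placeholder tensors:

    import paddle

    guidance_scale = 7.5
    noise_pred_uncond = paddle.randn([1, 4, 64, 64])   # placeholder UNet outputs
    noise_pred_text = paddle.randn([1, 4, 64, 64])
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
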
@@ -3076,19 +2955,16 @@ def hires_fix(
# 8. determine the upscaled width and height for upscaled images
truncate_width = 0
truncate_height = 0
- (
- hr_upscale_to_width,
- hr_upscale_to_height, ) = self.get_upscaled_width_and_height(
- width,
- height,
- hr_scale=hr_scale,
- hr_resize_width=hr_resize_width,
- hr_resize_height=hr_resize_height, )
+ (hr_upscale_to_width, hr_upscale_to_height,) = self.get_upscaled_width_and_height(
+ width,
+ height,
+ hr_scale=hr_scale,
+ hr_resize_width=hr_resize_width,
+ hr_resize_height=hr_resize_height,
+ )
if hr_resize_width != 0 and hr_resize_height != 0:
- truncate_width = (hr_upscale_to_width - hr_resize_width
- ) // self.vae_scale_factor
- truncate_height = (hr_upscale_to_height - hr_resize_height
- ) // self.vae_scale_factor
+ truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor
+ truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor
         # 9. special case: do nothing if upscaling is not necessary

if hr_upscale_to_width == width and hr_upscale_to_height == height:
@@ -3097,10 +2973,7 @@ def hires_fix(
if enable_hr:
if do_controlnet:
- (
- control_image,
- control_conditioning_scale,
- ) = self.prepare_controlnet_cond(
+ (control_image, control_conditioning_scale,) = self.prepare_controlnet_cond(
controlnet_cond=controlnet_cond,
controlnet_conditioning_scale=controlnet_conditioning_scale,
width=hr_upscale_to_width,
@@ -3109,45 +2982,43 @@ def hires_fix(
num_images_per_prompt=num_images_per_prompt,
dtype=dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
else:
control_image = None
control_conditioning_scale = None
# 10. prepare init latents
- timesteps, hr_steps = self.get_hires_fix_timesteps(
- hr_steps, denoising_strength)
+ timesteps, hr_steps = self.get_hires_fix_timesteps(hr_steps, denoising_strength)
init_timestep = timesteps[:1].tile([latents.shape[0]])
latents = paddle.nn.functional.interpolate(
latents,
size=(
hr_upscale_to_height // self.vae_scale_factor,
- hr_upscale_to_width // self.vae_scale_factor, ),
- mode=latent_scale_mode, )
- latents = latents[:, :, truncate_height // 2:latents.shape[2] - (
- truncate_height + 1) // 2, truncate_width // 2:latents.shape[3]
- - (truncate_width + 1) // 2, ]
-
- noise = randn_tensor(
- latents.shape,
- dtype=latents.dtype,
- generator="initial_generator")
+ hr_upscale_to_width // self.vae_scale_factor,
+ ),
+ mode=latent_scale_mode,
+ )
+ latents = latents[
+ :,
+ :,
+ truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2,
+ truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2,
+ ]
+
+ noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator")
latents = self.scheduler.add_noise(latents, noise, init_timestep)
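
Step 10 above resizes the low-resolution latents to the hires target, trims them symmetrically when explicit hr_resize dimensions force an overshoot, and re-noises them at the first hires timestep. A small worked example of the truncation arithmetic, assuming the upscale helper preserves aspect ratio, so a 512x512 image with a 768x512 resize request rounds up to a 768x768 upscale target:

    vae_scale_factor = 8
    hr_upscale_to_height, hr_resize_height = 768, 512
    truncate_height = (hr_upscale_to_height - hr_resize_height) // vae_scale_factor   # 32 latent rows
    top, bottom = truncate_height // 2, (truncate_height + 1) // 2                    # 16 and 16
    # latents[:, :, top : H - bottom, :] keeps the centered 64-row band (512 px).
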
# 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
- extra_step_kwargs = self.prepare_extra_step_kwargs(
- "initial_generator", eta)
+ extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta)
# 12. denoising on hires.fix steps
num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order
with self.progress_bar(total=hr_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else
- latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
noise_pred_unet = self.do_unet(
do_controlnet,
@@ -3160,31 +3031,26 @@ def hires_fix(
control_conditioning_scale,
cross_attention_kwargs,
guess_mode,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
latents = latents.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
- image = self._decode_vae_latents(latents /
- self.vae.config.scaling_factor)
+ image = self._decode_vae_latents(latents / self.vae.config.scaling_factor)
image, has_nsfw_concept = self.run_safety_checker(image, dtype)
else:
image = latents
@@ -3195,42 +3061,40 @@ def hires_fix(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@paddle.no_grad()
def cycle_diffusion(
- self,
- prompt: Union[str, List[str]],
- source_prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[paddle.Tensor]=None,
- source_guidance_scale: Optional[float]=1,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.1,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]],
+ source_prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[paddle.Tensor] = None,
+ source_guidance_scale: Optional[float] = 1,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.1,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -3310,8 +3174,7 @@ def cycle_diffusion(
"""
self.change_scheduler("ddim")
# 0. Preprocess image
- init_image = self.image_processor.preprocess(
- image, height=height, width=width)
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
height, width = init_image.shape[-2:]
# 1. Check inputs
@@ -3323,7 +3186,8 @@ def cycle_diffusion(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -3339,8 +3203,9 @@ def cycle_diffusion(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode target prompt and source prompt
- text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if
- cross_attention_kwargs is not None else None)
+ text_encoder_lora_scale = (
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+ )
prompt_embeds = self._encode_prompt(
prompt,
@@ -3351,24 +3216,24 @@ def cycle_diffusion(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
source_prompt_embeds = self._encode_prompt(
source_prompt,
num_images_per_prompt,
do_classifier_free_guidance,
lora_scale=text_encoder_lora_scale,
max_embeddings_multiples=max_embeddings_multiples,
- parse_prompt_type=parse_prompt_type, )
+ parse_prompt_type=parse_prompt_type,
+ )
dtype = prompt_embeds.dtype
# 5. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 6. Prepare latent variables
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
is_strength_max = strength == 1.0
latents, clean_latents = self.prepare_latents(
batch_size * num_images_per_prompt,
@@ -3380,7 +3245,8 @@ def cycle_diffusion(
image=init_image,
timestep=latent_timestep,
is_strength_max=is_strength_max,
- return_image_latents=True, )
+ return_image_latents=True,
+ )
source_latents = latents
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -3388,18 +3254,15 @@ def cycle_diffusion(
generator = extra_step_kwargs.pop("generator", None)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
latent_model_input = paddle.concat([latents] * 2)
source_latent_model_input = paddle.concat([source_latents] * 2)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- source_latent_model_input = self.scheduler.scale_model_input(
- source_latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
# predict the noise residual
concat_latent_model_input = paddle.stack(
@@ -3409,7 +3272,8 @@ def cycle_diffusion(
source_latent_model_input[1],
latent_model_input[1],
],
- axis=0, )
+ axis=0,
+ )
concat_prompt_embeds = paddle.stack(
[
source_prompt_embeds[0],
@@ -3417,7 +3281,8 @@ def cycle_diffusion(
source_prompt_embeds[1],
prompt_embeds[1],
],
- axis=0, )
+ axis=0,
+ )
# predict the noise residual
concat_noise_pred = self.unet(
@@ -3425,19 +3290,20 @@ def cycle_diffusion(
timestep=t,
encoder_hidden_states=concat_prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# perform guidance
(
source_noise_pred_uncond,
noise_pred_uncond,
source_noise_pred_text,
- noise_pred_text, ) = concat_noise_pred.chunk(
- 4, axis=0)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_text,
+ ) = concat_noise_pred.chunk(4, axis=0)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
- source_noise_pred_text - source_noise_pred_uncond)
+ source_noise_pred_text - source_noise_pred_uncond
+ )
# Sample source_latents from the posterior distribution.
prev_source_latents = posterior_sample(
@@ -3446,7 +3312,8 @@ def cycle_diffusion(
t,
clean_latents,
generator=generator,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
# Compute noise.
noise = compute_noise(
self.scheduler,
@@ -3454,29 +3321,24 @@ def cycle_diffusion(
source_latents,
t,
source_noise_pred,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
source_latents = prev_source_latents.cast(dtype)
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- variance_noise=noise,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
+ ).prev_sample
latents = latents.cast(dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
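
This loop advances the source and target chains in lockstep: the source latent is drawn from the DDIM posterior around the clean image latents, the noise that realised that sample is recovered by compute_noise, and the same noise is injected into the target update as variance_noise. A minimal sketch of a DDIM step driven by an explicit variance_noise, assuming the default ppdiffusers DDIMScheduler:

    import paddle
    from ppdiffusers import DDIMScheduler

    scheduler = DDIMScheduler()
    scheduler.set_timesteps(50)
    latents = paddle.randn([1, 4, 64, 64])
    noise_pred = paddle.randn([1, 4, 64, 64])
    shared_noise = paddle.randn([1, 4, 64, 64])   # stands in for the output of compute_noise
    t = scheduler.timesteps[0]
    latents = scheduler.step(noise_pred, t, latents, eta=0.1, variance_noise=shared_noise).prev_sample
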
if not output_type == "latent":
- image = self._decode_vae_latents(latents /
- self.vae.config.scaling_factor)
+ image = self._decode_vae_latents(latents / self.vae.config.scaling_factor)
image, has_nsfw_concept = self.run_safety_checker(image, dtype)
else:
image = latents
@@ -3487,11 +3349,9 @@ def cycle_diffusion(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/examples/community/webui_stable_diffusion.py b/ppdiffusers/examples/community/webui_stable_diffusion.py
index cad5739c1f3c5..c5c7cd4c8c0a9 100644
--- a/ppdiffusers/examples/community/webui_stable_diffusion.py
+++ b/ppdiffusers/examples/community/webui_stable_diffusion.py
@@ -25,22 +25,27 @@
import paddle.nn as nn
import PIL
import PIL.Image
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-from ppdiffusers.models import (AutoencoderKL, ControlNetModel,
- UNet2DConditionModel)
+from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
from ppdiffusers.models.controlnet import ControlNetOutput
from ppdiffusers.models.modeling_utils import ModelMixin
from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
-from ppdiffusers.utils import (PIL_INTERPOLATION, PPDIFFUSERS_CACHE, logging,
- ppdiffusers_url_download, randn_tensor,
- safetensors_load, smart_load, torch_load)
+from ppdiffusers.utils import (
+ PIL_INTERPOLATION,
+ PPDIFFUSERS_CACHE,
+ logging,
+ ppdiffusers_url_download,
+ randn_tensor,
+ safetensors_load,
+ smart_load,
+ torch_load,
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -89,8 +94,7 @@ def resize(im, w, h):
resized = resize(im, src_w, src_h)
res = Image.new("RGB", (width, height))
- res.paste(
- resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+ res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
else:
ratio = width / height
@@ -101,31 +105,22 @@ def resize(im, w, h):
resized = resize(im, src_w, src_h)
res = Image.new("RGB", (width, height))
- res.paste(
- resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+ res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
if ratio < src_ratio:
fill_height = height // 2 - src_h // 2
+ res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
res.paste(
- resized.resize(
- (width, fill_height), box=(0, 0, width, 0)),
- box=(0, 0))
- res.paste(
- resized.resize(
- (width, fill_height),
- box=(0, resized.height, width, resized.height)),
- box=(0, fill_height + src_h), )
+ resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
+ box=(0, fill_height + src_h),
+ )
elif ratio > src_ratio:
fill_width = width // 2 - src_w // 2
+ res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
res.paste(
- resized.resize(
- (fill_width, height), box=(0, 0, 0, height)),
- box=(0, 0))
- res.paste(
- resized.resize(
- (fill_width, height),
- box=(resized.width, 0, resized.width, height)),
- box=(fill_width + src_w, 0), )
+ resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
+ box=(fill_width + src_w, 0),
+ )
return res
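
This branch is the webui "Resize and fill" behaviour: the source is fitted inside the target box and the leftover bands are filled by stretching the outermost row or column of the resized image. A hypothetical usage sketch, assuming resize_image from this module is in scope and that resize_mode=2 selects this mode, as the mode list later in this file suggests:

    from PIL import Image

    portrait = Image.new("RGB", (512, 768), "white")   # placeholder input
    # Fit into 640x512 and pad the empty left/right bands with stretched edges.
    out = resize_image(resize_mode=2, im=portrait, width=640, height=512)
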
@@ -137,8 +132,7 @@ def get_civitai_download_url(display_url, url_prefix="https://civitai.com"):
import requests
headers = {
- "User-Agent":
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"
+ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"
}
r = requests.get(display_url, headers=headers)
soup = bs4.BeautifulSoup(r.text, "lxml")
@@ -151,12 +145,13 @@ def get_civitai_download_url(display_url, url_prefix="https://civitai.com"):
def http_file_name(
- url: str,
- *,
- proxies=None,
- headers: Optional[Dict[str, str]]=None,
- timeout=10.0,
- max_retries=0, ):
+ url: str,
+ *,
+ proxies=None,
+ headers: Optional[Dict[str, str]] = None,
+ timeout=10.0,
+ max_retries=0,
+):
"""
Get a remote file name.
"""
@@ -168,7 +163,8 @@ def http_file_name(
proxies=proxies,
headers=headers,
timeout=timeout,
- max_retries=max_retries, )
+ max_retries=max_retries,
+ )
hf_raise_for_status(r)
displayed_name = url.split("/")[-1]
content_disposition = r.headers.get("Content-Disposition")
@@ -180,11 +176,12 @@ def http_file_name(
@paddle.no_grad()
def load_lora(
- pipeline,
- state_dict: dict,
- LORA_PREFIX_UNET: str="lora_unet",
- LORA_PREFIX_TEXT_ENCODER: str="lora_te",
- ratio: float=1.0, ):
+ pipeline,
+ state_dict: dict,
+ LORA_PREFIX_UNET: str = "lora_unet",
+ LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
+ ratio: float = 1.0,
+):
ratio = float(ratio)
visited = []
for key in state_dict:
@@ -192,8 +189,7 @@ def load_lora(
continue
if "text" in key:
- tmp_layer_infos = (key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER
- + "_")[-1].split("_"))
+ tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
hf_to_ppnlp = {
"encoder": "transformer",
"fc1": "linear1",
@@ -206,8 +202,7 @@ def load_lora(
layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
curr_layer: paddle.nn.Linear = pipeline.text_encoder
else:
- layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[
- -1].split("_")
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
curr_layer: paddle.nn.Linear = pipeline.unet
temp_name = layer_infos.pop(0)
@@ -248,24 +243,29 @@ def load_lora(
if weight_down.shape[2:4] == [1, 1]:
# conv2d 1x1
curr_layer.weight.copy_(
- curr_layer.weight + ratio * paddle.matmul(
- weight_up.squeeze([-1, -2]),
- weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) *
- scale,
- True, )
+ curr_layer.weight
+ + ratio
+ * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2])
+ * scale,
+ True,
+ )
else:
# conv2d 3x3
curr_layer.weight.copy_(
- curr_layer.weight + ratio * paddle.nn.functional.conv2d(
- weight_down.transpose([1, 0, 2, 3]),
- weight_up).transpose([1, 0, 2, 3]) * scale,
- True, )
+ curr_layer.weight
+ + ratio
+ * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose(
+ [1, 0, 2, 3]
+ )
+ * scale,
+ True,
+ )
else:
# linear
curr_layer.weight.copy_(
- curr_layer.weight + ratio * paddle.matmul(
- weight_up, weight_down).T * scale,
- True, )
+ curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale,
+ True,
+ )
# update visited list
visited.extend(triplet_keys)
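
load_lora folds the LoRA update directly into the base weights: for a linear layer the new weight is W + ratio * scale * (up @ down)^T, with 1x1 and 3x3 convolutions handled through the equivalent matmul/conv2d forms. A minimal sketch of the linear case; the concrete shapes and the scale = alpha / rank convention are illustrative assumptions:

    import paddle

    rank, fan_in, fan_out = 4, 320, 320
    weight = paddle.randn([fan_in, fan_out])     # paddle.nn.Linear stores [in, out]
    weight_up = paddle.randn([fan_out, rank])    # lora_up.weight (torch-style layout)
    weight_down = paddle.randn([rank, fan_in])   # lora_down.weight
    ratio, alpha = 1.0, 4.0
    scale = alpha / rank
    merged = weight + ratio * paddle.matmul(weight_up, weight_down).T * scale
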
@@ -285,28 +285,25 @@ class MultiControlNetModel(ModelMixin):
`ControlNetModel` as a list.
"""
- def __init__(
- self,
- controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
+ def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
super().__init__()
self.nets = nn.LayerList(controlnets)
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- controlnet_cond: List[paddle.Tensor],
- conditioning_scale: List[float],
- class_labels: Optional[paddle.Tensor]=None,
- timestep_cond: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- guess_mode: bool=False,
- return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]:
- for i, (
- image, scale, controlnet
- ) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ controlnet_cond: List[paddle.Tensor],
+ conditioning_scale: List[float],
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guess_mode: bool = False,
+ return_dict: bool = True,
+ ) -> Union[ControlNetOutput, Tuple]:
+ for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
down_samples, mid_sample = controlnet(
sample,
timestep,
@@ -318,7 +315,8 @@ def forward(
attention_mask,
cross_attention_kwargs,
guess_mode,
- return_dict, )
+ return_dict,
+ )
# merge samples
if i == 0:
@@ -326,8 +324,7 @@ def forward(
else:
down_block_res_samples = [
samples_prev + samples_curr
- for samples_prev, samples_curr in zip(
- down_block_res_samples, down_samples)
+ for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
]
mid_block_res_sample += mid_sample
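
With several ControlNets, each net runs on its own conditioning image and scale, and the residuals are summed element-wise so the UNet still receives a single set of down-block and mid-block residuals. A toy, self-contained illustration of that accumulation (the stand-in controlnet below is hypothetical):

    import paddle

    def toy_controlnet(scale):
        # stand-in returning (down-block residuals, mid-block residual)
        return [paddle.full([1, 320, 8, 8], scale)], paddle.full([1, 1280, 4, 4], scale)

    down_res, mid_res = None, None
    for scale in [1.0, 0.5]:                      # one entry per controlnet
        down_samples, mid_sample = toy_controlnet(scale)
        if down_res is None:
            down_res, mid_res = down_samples, mid_sample
        else:
            down_res = [a + b for a, b in zip(down_res, down_samples)]
            mid_res = mid_res + mid_sample
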
@@ -373,17 +370,22 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline):
TI_DIR = os.path.join(PPDIFFUSERS_CACHE, "textual_inversion")
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[
- ControlNetModel], MultiControlNetModel, ]=None,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ controlnet: Union[
+ ControlNetModel,
+ List[ControlNetModel],
+ Tuple[ControlNetModel],
+ MultiControlNetModel,
+ ] = None,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -413,8 +415,9 @@ def __init__(
controlnet=controlnet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# custom data
@@ -441,9 +444,9 @@ def __init__(
# register_state_dict_hook to fix text_encoder, when we save_pretrained text model.
def map_to(state_dict, *args, **kwargs):
if "text_model.token_embedding.wrapped.weight" in state_dict:
- state_dict[
- "text_model.token_embedding.weight"] = state_dict.pop(
- "text_model.token_embedding.wrapped.weight")
+ state_dict["text_model.token_embedding.weight"] = state_dict.pop(
+ "text_model.token_embedding.wrapped.weight"
+ )
return state_dict
self.text_encoder.register_state_dict_hook(map_to)
@@ -466,7 +469,8 @@ def download_civitai_lora_file(self, url):
file_path = ppdiffusers_url_download(
download_url,
cache_dir=self.LORA_DIR,
- filename=http_file_name(download_url).strip('"'), )
+ filename=http_file_name(download_url).strip('"'),
+ )
return file_path
def download_civitai_ti_file(self, url):
@@ -479,7 +483,8 @@ def download_civitai_ti_file(self, url):
file_path = ppdiffusers_url_download(
download_url,
cache_dir=self.TI_DIR,
- filename=http_file_name(download_url).strip('"'), )
+ filename=http_file_name(download_url).strip('"'),
+ )
return file_path
def change_scheduler(self, scheduler_type="ddim"):
@@ -488,55 +493,56 @@ def change_scheduler(self, scheduler_type="ddim"):
def switch_scheduler(self, scheduler_type="ddim"):
scheduler_type = scheduler_type.lower()
from ppdiffusers import (
- DDIMScheduler, DDPMScheduler, DEISMultistepScheduler,
- DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler,
- KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler,
- UniPCMultistepScheduler)
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UniPCMultistepScheduler,
+ )
if scheduler_type == "pndm":
- scheduler = PNDMScheduler.from_config(
- self.orginal_scheduler_config, skip_prk_steps=True)
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
elif scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "heun":
- scheduler = HeunDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "euler":
- scheduler = EulerDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "dpm-multi":
- scheduler = DPMSolverMultistepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "dpm-single":
- scheduler = DPMSolverSinglestepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "kdpm2-ancestral":
- scheduler = KDPM2AncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "kdpm2":
- scheduler = KDPM2DiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "unipc-multi":
- scheduler = UniPCMultistepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "ddim":
scheduler = DDIMScheduler.from_config(
self.orginal_scheduler_config,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
elif scheduler_type == "ddpm":
- scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config,
- )
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
elif scheduler_type == "deis-multi":
scheduler = DEISMultistepScheduler.from_config(
- self.orginal_scheduler_config, )
+ self.orginal_scheduler_config,
+ )
else:
raise ValueError(
f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!"
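
switch_scheduler rebuilds the scheduler from the pipeline's original config based on a lowercase name. A hypothetical usage sketch, assuming pipe is an already-constructed WebUIStableDiffusionPipeline:

    pipe.switch_scheduler("euler-ancestral")   # EulerAncestralDiscreteScheduler
    pipe.switch_scheduler("dpm-multi")         # DPMSolverMultistepScheduler
    pipe.switch_scheduler("ddim")              # DDIMScheduler with steps_offset=1, clip_sample=False
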
@@ -545,30 +551,28 @@ def switch_scheduler(self, scheduler_type="ddim"):
@paddle.no_grad()
def _encode_prompt(
- self,
- prompt: str,
- do_classifier_free_guidance: float=7.5,
- negative_prompt: str=None,
- num_inference_steps: int=50, ):
+ self,
+ prompt: str,
+ do_classifier_free_guidance: float = 7.5,
+ negative_prompt: str = None,
+ num_inference_steps: int = 50,
+ ):
if do_classifier_free_guidance:
assert isinstance(negative_prompt, str)
negative_prompt = [negative_prompt]
- uc = get_learned_conditioning(self.sj.clip, negative_prompt,
- num_inference_steps)
+ uc = get_learned_conditioning(self.sj.clip, negative_prompt, num_inference_steps)
else:
uc = None
- c = get_multicond_learned_conditioning(self.sj.clip, prompt,
- num_inference_steps)
+ c = get_multicond_learned_conditioning(self.sj.clip, prompt, num_inference_steps)
return c, uc
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -587,48 +591,43 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
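
prepare_extra_step_kwargs forwards eta and generator only when the active scheduler's step() actually accepts them, which it discovers by inspecting the signature. A minimal sketch:

    import inspect

    from ppdiffusers import DDIMScheduler

    scheduler = DDIMScheduler()
    step_params = set(inspect.signature(scheduler.step).parameters.keys())
    extra_step_kwargs = {}
    if "eta" in step_params:        # DDIM accepts eta; several schedulers do not
        extra_step_kwargs["eta"] = 0.0
    if "generator" in step_params:
        extra_step_kwargs["generator"] = None
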
def check_inputs(
- self,
- prompt,
- image,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- controlnet_conditioning_scale=1.0, ):
+ self,
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ controlnet_conditioning_scale=1.0,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and not isinstance(prompt, str):
- raise ValueError(
- f"`prompt` has to be of type `str` but is {type(prompt)}")
+ raise ValueError(f"`prompt` has to be of type `str` but is {type(prompt)}")
if negative_prompt is not None and not isinstance(negative_prompt, str):
- raise ValueError(
- f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}"
- )
+ raise ValueError(f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}")
# `prompt` needs more sophisticated handling when there are multiple
# conditionings.
@@ -645,15 +644,12 @@ def check_inputs(
self.check_image(image, prompt)
elif isinstance(self.controlnet, MultiControlNetModel):
if not isinstance(image, list):
- raise TypeError(
- "For multiple controlnets: `image` must be type `list`")
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
# When `image` is a nested list:
# (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
elif any(isinstance(i, list) for i in image):
- raise ValueError(
- "A single batch of multiple conditionings are supported at the moment."
- )
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
elif len(image) != len(self.controlnet.nets):
raise ValueError(
"For multiple controlnets: `image` must have the same length as the number of controlnets."
@@ -666,39 +662,31 @@ def check_inputs(
# Check `controlnet_conditioning_scale`
if isinstance(self.controlnet, ControlNetModel):
- if not isinstance(controlnet_conditioning_scale,
- (float, list, tuple)):
+ if not isinstance(controlnet_conditioning_scale, (float, list, tuple)):
raise TypeError(
"For single controlnet: `controlnet_conditioning_scale` must be type `float, list(float) or tuple(float)`."
)
elif isinstance(self.controlnet, MultiControlNetModel):
if isinstance(controlnet_conditioning_scale, list):
- if any(
- isinstance(i, list)
- for i in controlnet_conditioning_scale):
- raise ValueError(
- "A single batch of multiple conditionings are supported at the moment."
- )
- elif isinstance(
- controlnet_conditioning_scale,
- list) and len(controlnet_conditioning_scale) != len(
- self.controlnet.nets):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
raise ValueError(
"For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
- " the same length as the number of controlnets")
+ " the same length as the number of controlnets"
+ )
else:
assert False
def check_image(self, image, prompt):
image_is_pil = isinstance(image, PIL.Image.Image)
image_is_tensor = isinstance(image, paddle.Tensor)
- image_is_pil_list = isinstance(image, list) and isinstance(
- image[0], PIL.Image.Image)
- image_is_tensor_list = isinstance(image, list) and isinstance(
- image[0], paddle.Tensor)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor)
- if (not image_is_pil and not image_is_tensor and
- not image_is_pil_list and not image_is_tensor_list):
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
raise TypeError(
"image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors"
)
@@ -725,27 +713,16 @@ def check_image(self, image, prompt):
def prepare_image(self, image, width, height, dtype, resize_mode=-1):
if not isinstance(image, paddle.Tensor):
if isinstance(image, PIL.Image.Image):
- image = resize_image(
- resize_mode=resize_mode,
- im=image,
- width=width,
- height=height)
+ image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height)
image = [image]
if isinstance(image[0], PIL.Image.Image):
- image = [
- resize_image(
- resize_mode=resize_mode,
- im=im,
- width=width,
- height=height) for im in image
- ]
+ image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image]
images = []
for image_ in image:
image_ = image_.convert("RGB")
- image_ = image_.resize(
- (width, height), resample=PIL_INTERPOLATION["lanczos"])
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
image_ = np.array(image_)
image_ = image_[None, :]
images.append(image_)
@@ -761,14 +738,15 @@ def prepare_image(self, image, width, height, dtype, resize_mode=-1):
return image
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -812,31 +790,31 @@ def _default_height_width(self, height, width, image):
@paddle.no_grad()
def __call__(
- self,
- prompt: str=None,
- image: PIL.Image.Image=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: str=None,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- clip_skip: int=1,
- controlnet_conditioning_scale: Union[float, List[float]]=1.0,
- enable_lora: bool=True,
- resize_mode: int=0,
- # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"]
- # 0 1 2 -1
- starting_control_step: float=0.0,
- ending_control_step: float=1.0, ):
+ self,
+ prompt: str = None,
+ image: PIL.Image.Image = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: str = None,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ clip_skip: int = 1,
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+ enable_lora: bool = True,
+ resize_mode: int = 0,
+ # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"]
+ # 0 1 2 -1
+ starting_control_step: float = 0.0,
+ ending_control_step: float = 1.0,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -914,17 +892,16 @@ def __call__(
# 0. Default height and width to unet
if enable_control:
if isinstance(self.controlnet, ControlNetModel):
- height, width = self._default_height_width(height, width,
- image)
+ height, width = self._default_height_width(height, width, image)
image = self.prepare_image(
image=image,
width=width,
height=height,
dtype=self.controlnet.dtype,
- resize_mode=resize_mode, )
+ resize_mode=resize_mode,
+ )
elif isinstance(self.controlnet, MultiControlNetModel):
- height, width = self._default_height_width(height, width,
- image)
+ height, width = self._default_height_width(height, width, image)
images = []
for image_ in image:
@@ -933,16 +910,15 @@ def __call__(
width=width,
height=height,
dtype=self.controlnet.dtype,
- resize_mode=resize_mode, )
+ resize_mode=resize_mode,
+ )
images.append(image_)
image = images
else:
- height = height or max(self.unet.config.sample_size *
- self.vae_scale_factor, 512)
- width = width or max(self.unet.config.sample_size *
- self.vae_scale_factor, 512)
+ height = height or max(self.unet.config.sample_size * self.vae_scale_factor, 512)
+ width = width or max(self.unet.config.sample_size * self.vae_scale_factor, 512)
# 1. Check inputs. Raise error if not correct
self.check_inputs(
@@ -952,7 +928,8 @@ def __call__(
width,
callback_steps,
negative_prompt,
- controlnet_conditioning_scale, )
+ controlnet_conditioning_scale,
+ )
# 2. Define call parameters
batch_size = 1
@@ -966,47 +943,34 @@ def __call__(
if enable_lora and self.LORA_DIR is not None:
if os.path.exists(self.LORA_DIR):
- lora_mapping = {
- p.stem: p.absolute()
- for p in Path(self.LORA_DIR).glob("*.safetensors")
- }
+ lora_mapping = {p.stem: p.absolute() for p in Path(self.LORA_DIR).glob("*.safetensors")}
for params in extra_network_data["lora"]:
assert len(params.items) > 0
name = params.items[0]
if name in lora_mapping:
- ratio = (float(params.items[1])
- if len(params.items) > 1 else 1.0)
- lora_state_dict = smart_load(
- lora_mapping[name],
- map_location=paddle.get_device())
+ ratio = float(params.items[1]) if len(params.items) > 1 else 1.0
+ lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device())
self.weights_has_changed = True
- load_lora(
- self, state_dict=lora_state_dict, ratio=ratio)
+ load_lora(self, state_dict=lora_state_dict, ratio=ratio)
del lora_state_dict
else:
- print(
- f"We can't find lora weight: {name}! Please make sure that exists!"
- )
+                        print(f"We can't find lora weight: {name}! Please make sure it exists!")
else:
if len(extra_network_data["lora"]) > 0:
- print(
- f"{self.LORA_DIR} not exists, so we cant load loras!"
- )
+                    print(f"{self.LORA_DIR} does not exist, so we can't load loras!")
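
The block above resolves prompt-level LoRA requests against the files on disk: available *.safetensors weights are indexed by file stem, and each extra-network entry carries a name plus an optional ratio. A hypothetical standalone sketch of that lookup (the directory and the parsed items are placeholders):

    from pathlib import Path

    lora_dir = Path("./lora")                                        # assumed directory
    lora_mapping = {p.stem: p.absolute() for p in lora_dir.glob("*.safetensors")}
    params_items = ["my_style", "0.8"]                               # e.g. parsed from "<lora:my_style:0.8>"
    name = params_items[0]
    ratio = float(params_items[1]) if len(params_items) > 1 else 1.0
    weight_path = lora_mapping.get(name)                             # None if the file is missing
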
self.sj.clip.CLIP_stop_at_last_layers = clip_skip
- if isinstance(self.controlnet, MultiControlNetModel) and isinstance(
- controlnet_conditioning_scale, float):
- controlnet_conditioning_scale = [
- controlnet_conditioning_scale
- ] * len(self.controlnet.nets)
+ if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
# 3. Encode input prompt
prompt_embeds, negative_prompt_embeds = self._encode_prompt(
prompts,
do_classifier_free_guidance,
negative_prompt,
- num_inference_steps=num_inference_steps, )
+ num_inference_steps=num_inference_steps,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -1021,127 +985,107 @@ def __call__(
width,
self.unet.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = (
- len(timesteps) - num_inference_steps * self.scheduler.order)
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
current_control_step = i / len(timesteps)
step = i // self.scheduler.order
do_batch = False
- conds_list, cond_tensor = reconstruct_multicond_batch(
- prompt_embeds, step)
+ conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
try:
weight = conds_list[0][0][1]
except Exception:
weight = 1.0
if do_classifier_free_guidance:
- uncond_tensor = reconstruct_cond_batch(
- negative_prompt_embeds, step)
- do_batch = cond_tensor.shape[1] == uncond_tensor.shape[
- 1] and not isinstance(self.controlnet,
- MultiControlNetModel)
+ uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
+ do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1] and not isinstance(
+ self.controlnet, MultiControlNetModel
+ )
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_batch else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
if do_batch:
- encoder_hidden_states = paddle.concat(
- [uncond_tensor, cond_tensor])
+ encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
control_kwargs = {}
- if (enable_control and starting_control_step <
- current_control_step < ending_control_step):
- (
- down_block_res_samples,
- mid_block_res_sample,
- ) = self.controlnet(
+ if enable_control and starting_control_step < current_control_step < ending_control_step:
+ (down_block_res_samples, mid_block_res_sample,) = self.controlnet(
latent_model_input,
t,
encoder_hidden_states=encoder_hidden_states,
controlnet_cond=paddle.concat([image, image]),
conditioning_scale=controlnet_conditioning_scale,
- return_dict=False, )
- control_kwargs[
- "down_block_additional_residuals"] = down_block_res_samples
- control_kwargs[
- "mid_block_additional_residual"] = mid_block_res_sample
+ return_dict=False,
+ )
+ control_kwargs["down_block_additional_residuals"] = down_block_res_samples
+ control_kwargs["mid_block_additional_residual"] = mid_block_res_sample
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- **control_kwargs, ).sample
+ **control_kwargs,
+ ).sample
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + weight * guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_text - noise_pred_uncond
+ )
else:
control_kwargs = {}
- if (enable_control and starting_control_step <
- current_control_step < ending_control_step):
- (
- down_block_res_samples,
- mid_block_res_sample,
- ) = self.controlnet(
+ if enable_control and starting_control_step < current_control_step < ending_control_step:
+ (down_block_res_samples, mid_block_res_sample,) = self.controlnet(
latent_model_input,
t,
encoder_hidden_states=cond_tensor,
controlnet_cond=image,
conditioning_scale=controlnet_conditioning_scale,
- return_dict=False, )
- control_kwargs[
- "down_block_additional_residuals"] = down_block_res_samples
- control_kwargs[
- "mid_block_additional_residual"] = mid_block_res_sample
+ return_dict=False,
+ )
+ control_kwargs["down_block_additional_residuals"] = down_block_res_samples
+ control_kwargs["mid_block_additional_residual"] = mid_block_res_sample
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=cond_tensor,
cross_attention_kwargs=cross_attention_kwargs,
- **control_kwargs, ).sample
+ **control_kwargs,
+ ).sample
if do_classifier_free_guidance:
control_kwargs = {}
- if (enable_control and starting_control_step <
- current_control_step < ending_control_step):
- (
- down_block_res_samples,
- mid_block_res_sample,
- ) = self.controlnet(
+ if enable_control and starting_control_step < current_control_step < ending_control_step:
+ (down_block_res_samples, mid_block_res_sample,) = self.controlnet(
latent_model_input,
t,
encoder_hidden_states=uncond_tensor,
controlnet_cond=image,
conditioning_scale=controlnet_conditioning_scale,
- return_dict=False, )
- control_kwargs[
- "down_block_additional_residuals"] = down_block_res_samples
- control_kwargs[
- "mid_block_additional_residual"] = mid_block_res_sample
+ return_dict=False,
+ )
+ control_kwargs["down_block_additional_residuals"] = down_block_res_samples
+ control_kwargs["mid_block_additional_residual"] = mid_block_res_sample
noise_pred_uncond = self.unet(
latent_model_input,
t,
encoder_hidden_states=uncond_tensor,
cross_attention_kwargs=cross_attention_kwargs,
- **control_kwargs, ).sample
- noise_pred = noise_pred_uncond + weight * guidance_scale * (
- noise_pred - noise_pred_uncond)
+ **control_kwargs,
+ ).sample
+ noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
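
In this loop the ControlNet residuals are only injected while the fraction of completed steps lies strictly between starting_control_step and ending_control_step; outside that window control_kwargs stays empty and the plain UNet prediction is used. A minimal sketch of the gating:

    num_steps = 50
    starting_control_step, ending_control_step = 0.2, 0.8
    for i in range(num_steps):
        current_control_step = i / num_steps
        use_control = starting_control_step < current_control_step < ending_control_step
        # steps 11..39 of 50 get ControlNet residuals; the rest run without control
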
@@ -1154,8 +1098,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, self.unet.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
@@ -1164,14 +1107,12 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, self.unet.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
except Exception as e:
raise ValueError(e)
finally:
@@ -1215,12 +1156,7 @@ class FrozenCLIPEmbedder(nn.Layer):
LAYERS = ["last", "pooled", "hidden"]
- def __init__(self,
- text_encoder,
- tokenizer,
- freeze=True,
- layer="last",
- layer_idx=None):
+ def __init__(self, text_encoder, tokenizer, freeze=True, layer="last", layer_idx=None):
super().__init__()
assert layer in self.LAYERS
self.tokenizer = tokenizer
@@ -1244,12 +1180,14 @@ def forward(self, text):
truncation=True,
max_length=self.tokenizer.model_max_length,
padding="max_length",
- return_tensors="pd", )
+ return_tensors="pd",
+ )
tokens = batch_encoding["input_ids"]
outputs = self.text_encoder(
input_ids=tokens,
output_hidden_states=self.layer == "hidden",
- return_dict=True, )
+ return_dict=True,
+ )
if self.layer == "last":
z = outputs.last_hidden_state
elif self.layer == "pooled":
@@ -1288,8 +1226,7 @@ def empty_chunk(self):
def get_target_prompt_token_count(self, token_count):
"""returns the maximum number of tokens a prompt of a known length can have before it requires one more PromptChunk to be represented"""
- return math.ceil(max(token_count, 1) /
- self.chunk_length) * self.chunk_length
+ return math.ceil(max(token_count, 1) / self.chunk_length) * self.chunk_length
def tokenize(self, texts):
"""Converts a batch of texts into a batch of token ids"""
@@ -1370,10 +1307,12 @@ def next_chunk(is_last=False):
             # this is when we are at the end of the allotted 75 tokens for the current chunk and the current token is not a comma. opts.comma_padding_backtrack
# is a setting that specifies that if there is a comma nearby, the text after the comma should be moved out of this chunk and into the next.
- elif (WebUIStableDiffusionPipeline.comma_padding_backtrack != 0
- and len(chunk.tokens) == self.chunk_length and
- last_comma != -1 and len(chunk.tokens) - last_comma <=
- WebUIStableDiffusionPipeline.comma_padding_backtrack):
+ elif (
+ WebUIStableDiffusionPipeline.comma_padding_backtrack != 0
+ and len(chunk.tokens) == self.chunk_length
+ and last_comma != -1
+ and len(chunk.tokens) - last_comma <= WebUIStableDiffusionPipeline.comma_padding_backtrack
+ ):
break_location = last_comma + 1
reloc_tokens = chunk.tokens[break_location:]
@@ -1392,8 +1331,7 @@ def next_chunk(is_last=False):
(
embedding,
embedding_length_in_tokens,
- ) = self.hijack.embedding_db.find_embedding_at_position(
- tokens, position)
+ ) = self.hijack.embedding_db.find_embedding_at_position(tokens, position)
if embedding is None:
chunk.tokens.append(token)
chunk.multipliers.append(weight)
@@ -1455,10 +1393,7 @@ def forward(self, texts):
zs = []
for i in range(chunk_count):
- batch_chunk = [
- chunks[i] if i < len(chunks) else self.empty_chunk()
- for chunks in batch_chunks
- ]
+ batch_chunk = [chunks[i] if i < len(chunks) else self.empty_chunk() for chunks in batch_chunks]
tokens = [x.tokens for x in batch_chunk]
multipliers = [x.multipliers for x in batch_chunk]
@@ -1472,10 +1407,9 @@ def forward(self, texts):
zs.append(z)
if len(used_embeddings) > 0:
- embeddings_list = ", ".join([
- f"{name} [{embedding.checksum()}]"
- for name, embedding in used_embeddings.items()
- ])
+ embeddings_list = ", ".join(
+ [f"{name} [{embedding.checksum()}]" for name, embedding in used_embeddings.items()]
+ )
self.hijack.comments.append(f"Used embeddings: {embeddings_list}")
return paddle.concat(zs, axis=1)
@@ -1494,15 +1428,19 @@ def process_tokens(self, remade_batch_tokens, batch_multipliers):
if self.id_end != self.id_pad:
for batch_pos in range(len(remade_batch_tokens)):
index = remade_batch_tokens[batch_pos].index(self.id_end)
- tokens[batch_pos, index + 1:tokens.shape[1]] = self.id_pad
+ tokens[batch_pos, index + 1 : tokens.shape[1]] = self.id_pad
z = self.encode_with_text_encoder(tokens)
# restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
batch_multipliers = paddle.to_tensor(batch_multipliers)
original_mean = z.mean()
- z = z * batch_multipliers.reshape(batch_multipliers.shape +
- [1, ]).expand(z.shape)
+ z = z * batch_multipliers.reshape(
+ batch_multipliers.shape
+ + [
+ 1,
+ ]
+ ).expand(z.shape)
new_mean = z.mean()
z = z * (original_mean / new_mean)
@@ -1520,8 +1458,7 @@ def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1):
self.comma_token = vocab.get(",", None)
self.token_mults = {}
- tokens_with_parens = [(k, v) for k, v in vocab.items()
- if "(" in k or ")" in k or "[" in k or "]" in k]
+ tokens_with_parens = [(k, v) for k, v in vocab.items() if "(" in k or ")" in k or "[" in k or "]" in k]
for text, ident in tokens_with_parens:
mult = 1.0
for c in text:
@@ -1542,8 +1479,7 @@ def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1):
self.id_pad = self.id_end
def tokenize(self, texts):
- tokenized = self.wrapped.tokenizer(
- texts, truncation=False, add_special_tokens=False)["input_ids"]
+ tokenized = self.wrapped.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
return tokenized
@@ -1552,7 +1488,8 @@ def encode_with_text_encoder(self, tokens):
outputs = self.wrapped.text_encoder(
input_ids=tokens,
output_hidden_states=output_hidden_states,
- return_dict=True, )
+ return_dict=True,
+ )
if output_hidden_states:
z = outputs.hidden_states[-self.CLIP_stop_at_last_layers]
@@ -1564,11 +1501,9 @@ def encode_with_text_encoder(self, tokens):
def encode_embedding_init_text(self, init_text, nvpt):
embedding_layer = self.wrapped.text_encoder.text_model
- ids = self.wrapped.tokenizer(
- init_text,
- max_length=nvpt,
- return_tensors="pd",
- add_special_tokens=False)["input_ids"]
+ ids = self.wrapped.tokenizer(init_text, max_length=nvpt, return_tensors="pd", add_special_tokens=False)[
+ "input_ids"
+ ]
embedded = embedding_layer.token_embedding.wrapped(ids).squeeze(0)
return embedded
@@ -1630,8 +1565,7 @@ def parse_prompts(prompts):
class EmbeddingDecoder(json.JSONDecoder):
def __init__(self, *args, **kwargs):
- json.JSONDecoder.__init__(
- self, object_hook=self.object_hook, *args, **kwargs)
+ json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)
def object_hook(self, d):
if "TORCHTENSOR" in d:
@@ -1652,8 +1586,7 @@ def lcg(m=2**32, a=1664525, c=1013904223, seed=0):
def xor_block(block):
g = lcg()
- randblock = (np.array([next(g) for _ in range(np.product(block.shape))])
- .astype(np.uint8).reshape(block.shape))
+ randblock = np.array([next(g) for _ in range(np.product(block.shape))]).astype(np.uint8).reshape(block.shape)
return np.bitwise_xor(block.astype(np.uint8), randblock & 0x0F)
@@ -1667,16 +1600,17 @@ def crop_black(img, tol=0):
def extract_image_data_embed(image):
d = 3
- outarr = (crop_black(
- np.array(image.convert("RGB").getdata())
- .reshape(image.size[1], image.size[0], d).astype(np.uint8)) & 0x0F)
+ outarr = (
+ crop_black(np.array(image.convert("RGB").getdata()).reshape(image.size[1], image.size[0], d).astype(np.uint8))
+ & 0x0F
+ )
black_cols = np.where(np.sum(outarr, axis=(0, 2)) == 0)
if black_cols[0].shape[0] < 2:
print("No Image data blocks found.")
return None
- data_block_lower = outarr[:, :black_cols[0].min(), :].astype(np.uint8)
- data_block_upper = outarr[:, black_cols[0].max() + 1:, :].astype(np.uint8)
+ data_block_lower = outarr[:, : black_cols[0].min(), :].astype(np.uint8)
+ data_block_upper = outarr[:, black_cols[0].max() + 1 :, :].astype(np.uint8)
data_block_lower = xor_block(data_block_lower)
data_block_upper = xor_block(data_block_upper)
@@ -1703,7 +1637,8 @@ def extract_image_data_embed(image):
# [75, 'fantasy landscape with a lake and an oak in background masterful']
# [100, 'fantasy landscape with a lake and a christmas tree in background masterful']
-schedule_parser = lark.Lark(r"""
+schedule_parser = lark.Lark(
+ r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | scheduled | alternate | plain | WHITESPACE)*
!emphasized: "(" prompt ")"
@@ -1714,7 +1649,8 @@ def extract_image_data_embed(image):
WHITESPACE: /\s+/
plain: /([^\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
-""")
+"""
+)
def get_learned_conditioning_prompt_schedules(prompts, steps):
@@ -1806,8 +1742,7 @@ def get_schedule(prompt):
return [promptdict[prompt] for prompt in prompts]
-ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning",
- ["end_at_step", "cond"])
+ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"])
def get_learned_conditioning(model, prompts, steps):
@@ -1845,8 +1780,7 @@ def get_learned_conditioning(model, prompts, steps):
cond_schedule = []
for i, (end_at_step, text) in enumerate(prompt_schedule):
- cond_schedule.append(
- ScheduledPromptConditioning(end_at_step, conds[i]))
+ cond_schedule.append(ScheduledPromptConditioning(end_at_step, conds[i]))
cache[prompt] = cond_schedule
res.append(cond_schedule)
@@ -1871,8 +1805,7 @@ def get_multicond_prompt_list(prompts):
for subprompt in subprompts:
match = re_weight.search(subprompt)
- text, weight = match.groups() if match is not None else (subprompt,
- 1.0)
+ text, weight = match.groups() if match is not None else (subprompt, 1.0)
weight = float(weight) if weight is not None else 1.0
@@ -1897,43 +1830,37 @@ def __init__(self, schedules, weight=1.0):
class MulticondLearnedConditioning:
def __init__(self, shape, batch):
- self.shape: tuple = (
- shape # the shape field is needed to send this object to DDIM/PLMS
- )
+ self.shape: tuple = shape # the shape field is needed to send this object to DDIM/PLMS
self.batch: List[List[ComposableScheduledPromptConditioning]] = batch
-def get_multicond_learned_conditioning(model, prompts,
- steps) -> MulticondLearnedConditioning:
+def get_multicond_learned_conditioning(model, prompts, steps) -> MulticondLearnedConditioning:
"""same as get_learned_conditioning, but returns a list of ScheduledPromptConditioning along with the weight objects for each prompt.
For each prompt, the list is obtained by splitting the prompt using the AND separator.
https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/
"""
- res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list(
- prompts)
+ res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list(prompts)
- learned_conditioning = get_learned_conditioning(model, prompt_flat_list,
- steps)
+ learned_conditioning = get_learned_conditioning(model, prompt_flat_list, steps)
res = []
for indexes in res_indexes:
- res.append([
- ComposableScheduledPromptConditioning(learned_conditioning[i],
- weight)
- for i, weight in indexes
- ])
+ res.append([ComposableScheduledPromptConditioning(learned_conditioning[i], weight) for i, weight in indexes])
- return MulticondLearnedConditioning(shape=(len(prompts), ), batch=res)
+ return MulticondLearnedConditioning(shape=(len(prompts),), batch=res)
-def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]],
- current_step):
+def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], current_step):
param = c[0][0].cond
res = paddle.zeros(
- [len(c), ] + param.shape,
- dtype=param.dtype, )
+ [
+ len(c),
+ ]
+ + param.shape,
+ dtype=param.dtype,
+ )
for i, cond_schedule in enumerate(c):
target_index = 0
for current, (end_at, cond) in enumerate(cond_schedule):
@@ -1956,8 +1883,7 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step):
for cond_index, composable_prompt in enumerate(composable_prompts):
target_index = 0
- for current, (end_at,
- cond) in enumerate(composable_prompt.schedules):
+ for current, (end_at, cond) in enumerate(composable_prompt.schedules):
if current_step <= end_at:
target_index = current
break
@@ -1973,10 +1899,8 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step):
for i in range(len(tensors)):
if tensors[i].shape[0] != token_count:
last_vector = tensors[i][-1:]
- last_vector_repeated = last_vector.tile(
- [token_count - tensors[i].shape[0], 1])
- tensors[i] = paddle.concat(
- [tensors[i], last_vector_repeated], axis=0)
+ last_vector_repeated = last_vector.tile([token_count - tensors[i].shape[0], 1])
+ tensors[i] = paddle.concat([tensors[i], last_vector_repeated], axis=0)
return conds_list, paddle.stack(tensors).cast(dtype=param.dtype)
@@ -1997,7 +1921,8 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step):
[^\\()\[\]:]+|
:
""",
- re.X, )
+ re.X,
+)
re_break = re.compile(r"\s*\bBREAK\b\s*", re.S)
@@ -2102,15 +2027,12 @@ class StableDiffusionModelHijack:
layers = None
circular_enabled = False
- def __init__(self,
- clip_model,
- embeddings_dir=None,
- CLIP_stop_at_last_layers=-1):
+ def __init__(self, clip_model, embeddings_dir=None, CLIP_stop_at_last_layers=-1):
model_embeddings = clip_model.text_encoder.text_model
- model_embeddings.token_embedding = EmbeddingsWithFixes(
- model_embeddings.token_embedding, self)
+ model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self)
clip_model = FrozenCLIPEmbedderWithCustomWords(
- clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers)
+ clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers
+ )
self.embedding_db = EmbeddingDatabase(clip_model)
self.embedding_db.add_embedding_dir(embeddings_dir)
@@ -2148,8 +2070,7 @@ def forward(self, input_ids):
inputs_embeds = self.wrapped(input_ids)
- if (batch_fixes is None or len(batch_fixes) == 0 or
- max([len(x) for x in batch_fixes]) == 0):
+ if batch_fixes is None or len(batch_fixes) == 0 or max([len(x) for x in batch_fixes]) == 0:
return inputs_embeds
vecs = []
@@ -2157,11 +2078,13 @@ def forward(self, input_ids):
for offset, embedding in fixes:
emb = embedding.vec.cast(self.wrapped.dtype)
emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0])
- tensor = paddle.concat([
- tensor[0:offset + 1],
- emb[0:emb_len],
- tensor[offset + 1 + emb_len:],
- ])
+ tensor = paddle.concat(
+ [
+ tensor[0 : offset + 1],
+ emb[0:emb_len],
+ tensor[offset + 1 + emb_len :],
+ ]
+ )
vecs.append(tensor)
@@ -2190,12 +2113,8 @@ def __init__(self, vec, name, step=None):
def save(self, filename):
embedding_data = {
- "string_to_token": {
- "*": 265
- },
- "string_to_param": {
- "*": self.vec
- },
+ "string_to_token": {"*": 265},
+ "string_to_param": {"*": self.vec},
"name": self.name,
"step": self.step,
"sd_checkpoint": self.sd_checkpoint,
@@ -2267,7 +2186,8 @@ def register_embedding(self, embedding, model):
self.ids_lookup[first_id] = sorted(
self.ids_lookup[first_id] + [(ids, embedding)],
key=lambda x: len(x[0]),
- reverse=True, )
+ reverse=True,
+ )
return embedding
@@ -2285,8 +2205,7 @@ def load_from_file(self, path, filename):
return
embed_image = Image.open(path)
- if hasattr(embed_image,
- "text") and "sd-ti-embedding" in embed_image.text:
+ if hasattr(embed_image, "text") and "sd-ti-embedding" in embed_image.text:
data = embedding_from_b64(embed_image.text["sd-ti-embedding"])
name = data.get("name", name)
else:
@@ -2308,14 +2227,11 @@ def load_from_file(self, path, filename):
param_dict = data["string_to_param"]
if hasattr(param_dict, "_parameters"):
param_dict = getattr(param_dict, "_parameters")
- assert len(
- param_dict) == 1, "embedding file has multiple terms in it"
+ assert len(param_dict) == 1, "embedding file has multiple terms in it"
emb = next(iter(param_dict.items()))[1]
# diffuser concepts
- elif type(data) == dict and type(next(iter(data.values(
- )))) == paddle.Tensor:
- assert len(data.keys(
- )) == 1, "embedding file has multiple terms in it"
+ elif type(data) == dict and type(next(iter(data.values()))) == paddle.Tensor:
+ assert len(data.keys()) == 1, "embedding file has multiple terms in it"
emb = next(iter(data.values()))
if len(emb.shape) == 1:
@@ -2387,7 +2303,8 @@ def load_textual_inversion_embeddings(self, force_reload=False):
displayed_embeddings = (
tuple(self.word_embeddings.keys()),
- tuple(self.skipped_embeddings.keys()), )
+ tuple(self.skipped_embeddings.keys()),
+ )
if self.previously_displayed_embeddings != displayed_embeddings:
self.previously_displayed_embeddings = displayed_embeddings
print(
@@ -2406,7 +2323,7 @@ def find_embedding_at_position(self, tokens, offset):
return None, None
for ids, embedding in possible_matches:
- if tokens[offset:offset + len(ids)] == ids:
+ if tokens[offset : offset + len(ids)] == ids:
return embedding, len(ids)
return None, None
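
The lookup above matches textual-inversion embeddings by comparing a slice of the incoming token ids against each registered embedding's ids, longest candidates first. A minimal sketch of the same prefix match with a hand-built `ids_lookup` table (the ids and embedding name are made up):

def find_embedding_at_position(ids_lookup, tokens, offset):
    # ids_lookup maps a first token id to (ids, embedding) pairs sorted longest-first,
    # mirroring register_embedding() earlier in this file.
    possible_matches = ids_lookup.get(tokens[offset])
    if possible_matches is None:
        return None, None
    for ids, embedding in possible_matches:
        if tokens[offset : offset + len(ids)] == ids:
            return embedding, len(ids)
    return None, None

lookup = {320: [([320, 1125], "my-style-embedding")]}
print(find_embedding_at_position(lookup, [320, 1125, 530], 0))  # ('my-style-embedding', 2)
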
diff --git a/ppdiffusers/examples/community/wildcard_stable_diffusion.py b/ppdiffusers/examples/community/wildcard_stable_diffusion.py
index 80eb36c2a700c..93ad2d40a130a 100644
--- a/ppdiffusers/examples/community/wildcard_stable_diffusion.py
+++ b/ppdiffusers/examples/community/wildcard_stable_diffusion.py
@@ -21,18 +21,18 @@
from typing import Callable, Dict, List, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import \
- StableDiffusionPipelineOutput
-from ppdiffusers.pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
-from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
+from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
+ StableDiffusionPipelineOutput,
+)
+from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
+from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ppdiffusers.utils import deprecate, logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -50,8 +50,7 @@ def read_wildcard_values(path: str):
return f.read().splitlines()
-def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]]={},
- wildcard_files: List[str]=[]):
+def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []):
for wildcard_file in wildcard_files:
filename = get_filename(wildcard_file)
read_values = read_wildcard_values(wildcard_file)
@@ -62,19 +61,18 @@ def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]]={},
def replace_prompt_with_wildcards(
- prompt: str,
- wildcard_option_dict: Dict[str, List[str]]={},
- wildcard_files: List[str]=[], ):
+ prompt: str,
+ wildcard_option_dict: Dict[str, List[str]] = {},
+ wildcard_files: List[str] = [],
+):
new_prompt = prompt
# get wildcard options
- wildcard_option_dict = grab_wildcard_values(wildcard_option_dict,
- wildcard_files)
+ wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files)
for m in global_re_wildcard.finditer(new_prompt):
wildcard_value = m.group()
- replace_value = random.choice(wildcard_option_dict[wildcard_value.strip(
- "__")])
+ replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")])
new_prompt = new_prompt.replace(wildcard_value, replace_value, 1)
return new_prompt
@@ -125,31 +123,27 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
@@ -171,29 +165,31 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- height: int=512,
- width: int=512,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- seed: Optional[int]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- wildcard_option_dict: Dict[str, List[str]]={},
- wildcard_files: List[str]=[],
- num_prompt_samples: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: int = 512,
+ width: int = 512,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ seed: Optional[int] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ wildcard_option_dict: Dict[str, List[str]] = {},
+ wildcard_files: List[str] = [],
+ num_prompt_samples: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
Args:
@@ -254,8 +250,7 @@ def __call__(
if isinstance(prompt, str):
prompt = [
- replace_prompt_with_wildcards(prompt, wildcard_option_dict,
- wildcard_files)
+ replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files)
for i in range(num_prompt_samples)
]
batch_size = len(prompt)
@@ -263,52 +258,46 @@ def __call__(
prompt_list = []
for p in prompt:
for i in range(num_prompt_samples):
- prompt_list.append(
- replace_prompt_with_wildcards(p, wildcard_option_dict,
- wildcard_files))
+ prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files))
prompt = prompt_list
batch_size = len(prompt)
else:
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# get prompt text embeddings
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
- removed_text = self.tokenizer.batch_decode(
- text_input_ids[:, self.tokenizer.model_max_length:])
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
attention_mask = paddle.ones_like(text_input_ids)
- text_embeddings = self.text_encoder(
- text_input_ids, attention_mask=attention_mask)[0]
+ text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0]
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1])
- text_embeddings = text_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -322,14 +311,16 @@ def __call__(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -339,23 +330,20 @@ def __call__(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = paddle.ones_like(uncond_input.input_ids)
- uncond_embeddings = self.text_encoder(
- uncond_input.input_ids, attention_mask=attention_mask)[0]
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0]
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = uncond_embeddings.shape[1]
- uncond_embeddings = uncond_embeddings.tile(
- [batch_size, num_images_per_prompt, 1])
- uncond_embeddings = uncond_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1])
+ uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
# get the initial random noise unless the user supplied it
@@ -375,9 +363,7 @@ def __call__(
latents = paddle.randn(latents_shape, dtype=latents_dtype)
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
latents = latents
# set timesteps
@@ -394,33 +380,26 @@ def __call__(
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -435,12 +414,11 @@ def __call__(
image = image.transpose([0, 2, 3, 1]).astype("float32").numpy()
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.astype(
- text_embeddings.dtype), )
+ clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype),
+ )
else:
has_nsfw_concept = None
@@ -450,7 +428,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return WildcardStableDiffusionOutput(
- images=image,
- nsfw_content_detected=has_nsfw_concept,
- prompts=prompt)
+ return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt)
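
For orientation, the pipeline above expands `__name__` placeholders in a prompt into random values pulled from `wildcard_option_dict` or from wildcard text files before the usual Stable Diffusion call. A rough self-contained sketch of that substitution (the regex and the sample options are assumptions; the real pattern lives in `global_re_wildcard`, which is outside this hunk):

import random
import re

re_wildcard = re.compile(r"__([^_\s]+)__")  # assumed shape of global_re_wildcard

def replace_prompt_with_wildcards(prompt, wildcard_options):
    new_prompt = prompt
    for m in re_wildcard.finditer(prompt):
        placeholder = m.group()  # e.g. "__animal__"
        choice = random.choice(wildcard_options[placeholder.strip("__")])
        new_prompt = new_prompt.replace(placeholder, choice, 1)
    return new_prompt

print(replace_prompt_with_wildcards("a photo of a __animal__ in the snow", {"animal": ["fox", "owl"]}))
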
diff --git a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py
index bd00e8dcc89f6..2088a37dbd9a5 100644
--- a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py
@@ -27,133 +27,60 @@ def __init__(self, model_path=None):
super().__init__()
self.netVggOne = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=3,
- out_channels=64,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.Conv2D(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=64,
- out_channels=64,
- kernel_size=3,
- stride=1,
- padding=1),
- paddle.nn.ReLU(), )
+ paddle.nn.Conv2D(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
+ paddle.nn.ReLU(),
+ )
self.netVggTwo = paddle.nn.Sequential(
- paddle.nn.MaxPool2D(
- kernel_size=2, stride=2),
- paddle.nn.Conv2D(
- in_channels=64,
- out_channels=128,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.MaxPool2D(kernel_size=2, stride=2),
+ paddle.nn.Conv2D(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=128,
- out_channels=128,
- kernel_size=3,
- stride=1,
- padding=1),
- paddle.nn.ReLU(), )
+ )
self.netVggThr = paddle.nn.Sequential(
- paddle.nn.MaxPool2D(
- kernel_size=2, stride=2),
- paddle.nn.Conv2D(
- in_channels=128,
- out_channels=256,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.MaxPool2D(kernel_size=2, stride=2),
+ paddle.nn.Conv2D(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=256,
- out_channels=256,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=256,
- out_channels=256,
- kernel_size=3,
- stride=1,
- padding=1),
- paddle.nn.ReLU(), )
+ )
self.netVggFou = paddle.nn.Sequential(
- paddle.nn.MaxPool2D(
- kernel_size=2, stride=2),
- paddle.nn.Conv2D(
- in_channels=256,
- out_channels=512,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.MaxPool2D(kernel_size=2, stride=2),
+ paddle.nn.Conv2D(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=512,
- out_channels=512,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=512,
- out_channels=512,
- kernel_size=3,
- stride=1,
- padding=1),
- paddle.nn.ReLU(), )
+ paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
+ paddle.nn.ReLU(),
+ )
self.netVggFiv = paddle.nn.Sequential(
- paddle.nn.MaxPool2D(
- kernel_size=2, stride=2),
- paddle.nn.Conv2D(
- in_channels=512,
- out_channels=512,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.MaxPool2D(kernel_size=2, stride=2),
+ paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=512,
- out_channels=512,
- kernel_size=3,
- stride=1,
- padding=1),
+ paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
paddle.nn.ReLU(),
- paddle.nn.Conv2D(
- in_channels=512,
- out_channels=512,
- kernel_size=3,
- stride=1,
- padding=1),
- paddle.nn.ReLU(), )
-
- self.netScoreOne = paddle.nn.Conv2D(
- in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
- self.netScoreTwo = paddle.nn.Conv2D(
- in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
- self.netScoreThr = paddle.nn.Conv2D(
- in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
- self.netScoreFou = paddle.nn.Conv2D(
- in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
- self.netScoreFiv = paddle.nn.Conv2D(
- in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
+ )
+
+ self.netScoreOne = paddle.nn.Conv2D(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
+ self.netScoreTwo = paddle.nn.Conv2D(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
+ self.netScoreThr = paddle.nn.Conv2D(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
+ self.netScoreFou = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
+ self.netScoreFiv = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
self.netCombine = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=5,
- out_channels=1,
- kernel_size=1,
- stride=1,
- padding=0),
- paddle.nn.Sigmoid(), )
+ paddle.nn.Conv2D(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0),
+ paddle.nn.Sigmoid(),
+ )
if model_path:
self.set_state_dict(paddle.load(model_path))
@@ -162,7 +89,8 @@ def forward(self, tenInput):
tenInput = tenInput * 255.0
tenInput = tenInput - paddle.to_tensor(
[104.00698793, 116.66876762, 122.67891434],
- dtype=tenInput.dtype, ).reshape([1, 3, 1, 1])
+ dtype=tenInput.dtype,
+ ).reshape([1, 3, 1, 1])
tenVggOne = self.netVggOne(tenInput)
tenVggTwo = self.netVggTwo(tenVggOne)
@@ -180,47 +108,48 @@ def forward(self, tenInput):
tenScoreOne,
size=(tenInput.shape[2], tenInput.shape[3]),
mode="bilinear",
- align_corners=False, )
+ align_corners=False,
+ )
tenScoreTwo = paddle.nn.functional.interpolate(
tenScoreTwo,
size=(tenInput.shape[2], tenInput.shape[3]),
mode="bilinear",
- align_corners=False, )
+ align_corners=False,
+ )
tenScoreThr = paddle.nn.functional.interpolate(
tenScoreThr,
size=(tenInput.shape[2], tenInput.shape[3]),
mode="bilinear",
- align_corners=False, )
+ align_corners=False,
+ )
tenScoreFou = paddle.nn.functional.interpolate(
tenScoreFou,
size=(tenInput.shape[2], tenInput.shape[3]),
mode="bilinear",
- align_corners=False, )
+ align_corners=False,
+ )
tenScoreFiv = paddle.nn.functional.interpolate(
tenScoreFiv,
size=(tenInput.shape[2], tenInput.shape[3]),
mode="bilinear",
- align_corners=False, )
+ align_corners=False,
+ )
- return self.netCombine(
- paddle.concat([
- tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv
- ], 1))
+ return self.netCombine(paddle.concat([tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv], 1))
-remote_model_path = "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams"
+remote_model_path = (
+ "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams"
+)
class HEDdetector:
def __init__(self, modelpath=None):
- modelpath = os.path.join(annotator_ckpts_path,
- "network-bsds500.pdparams")
+ modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pdparams")
if not os.path.exists(modelpath):
- from paddlenlp.utils.downloader import \
- get_path_from_url_with_filelock
+ from paddlenlp.utils.downloader import get_path_from_url_with_filelock
- get_path_from_url_with_filelock(
- remote_model_path, root_dir=annotator_ckpts_path)
+ get_path_from_url_with_filelock(remote_model_path, root_dir=annotator_ckpts_path)
self.model_path = modelpath
self.netNetwork = Network(modelpath)
self.netNetwork.eval()
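
The HED network reformatted above keeps the standard holistically-nested edge detection layout: five VGG stages each feed a 1x1 score head, every score map is bilinearly resized back to the input resolution, and a final 1x1 convolution plus sigmoid fuses them into a single edge map. A compact sketch of just that fusion step (tensor shapes are illustrative):

import paddle

def fuse_side_outputs(score_maps, target_hw, combine):
    # Resize each side-output score map to the input size, then fuse with a 1x1 conv + sigmoid.
    resized = [
        paddle.nn.functional.interpolate(s, size=target_hw, mode="bilinear", align_corners=False)
        for s in score_maps
    ]
    return combine(paddle.concat(resized, axis=1))

combine = paddle.nn.Sequential(
    paddle.nn.Conv2D(in_channels=5, out_channels=1, kernel_size=1),
    paddle.nn.Sigmoid(),
)
maps = [paddle.randn([1, 1, s, s]) for s in (512, 256, 128, 64, 32)]
edges = fuse_side_outputs(maps, (512, 512), combine)
print(edges.shape)  # [1, 1, 512, 512]
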
diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py
index ecd0bf926d74d..543d0774c523a 100644
--- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py
@@ -44,7 +44,6 @@ def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1):
x[depth_pt < bg_th] = 0
y[depth_pt < bg_th] = 0
normal = np.stack([x, y, z], axis=2)
- normal /= np.sum(normal**2.0, axis=2, keepdims=True)**0.5
- normal_image = (normal * 127.5 + 127.5).clip(
- min=0, max=255).astype(np.uint8)
+ normal /= np.sum(normal**2.0, axis=2, keepdims=True) ** 0.5
+ normal_image = (normal * 127.5 + 127.5).clip(min=0, max=255).astype(np.uint8)
return depth_image, normal_image
diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py
index f93fa96d31b20..4726391519074 100644
--- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py
+++ b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py
@@ -19,25 +19,23 @@
def checkmodel(model_dir, model_name):
- if not os.path.exists(
- os.path.join(model_dir, model_name, model_name + ".pdmodel")):
+ if not os.path.exists(os.path.join(model_dir, model_name, model_name + ".pdmodel")):
model_url = "https://bj.bcebos.com/v1/paddledet/models/dpt_hybrid.zip"
get_path_from_url_with_filelock(model_url, root_dir=model_dir)
class MidasInference:
def __init__(
- self,
- model_dir,
- model_name="dpt_hybrid",
- batchsize=8,
- device="GPU",
- run_mode="paddle", ):
+ self,
+ model_dir,
+ model_name="dpt_hybrid",
+ batchsize=8,
+ device="GPU",
+ run_mode="paddle",
+ ):
checkmodel(model_dir, model_name)
- model_file = os.path.join(model_dir, model_name,
- model_name + ".pdmodel")
- params_file = os.path.join(model_dir, model_name,
- model_name + ".pdiparams")
+ model_file = os.path.join(model_dir, model_name, model_name + ".pdmodel")
+ params_file = os.path.join(model_dir, model_name, model_name + ".pdiparams")
config = paddle_infer.Config(model_file, params_file)
self.batchsize = batchsize
if device == "GPU":
@@ -69,12 +67,12 @@ def __init__(
min_subgraph_size=3,
precision_mode=precision_map[run_mode],
use_static=False,
- use_calib_mode=False, )
+ use_calib_mode=False,
+ )
min_input_shape = {"image": [1, 3, 224, 224]}
max_input_shape = {"image": [1, 3, 1280, 1280]}
opt_input_shape = {"image": [1, 3, 384, 384]}
- config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
- opt_input_shape)
+ config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape)
# disable print log when predict
config.disable_glog_info()
diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py
index 8e9d9e35206a6..8e453eef33c28 100644
--- a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py
@@ -27,13 +27,11 @@
class MLSDdetector:
def __init__(self):
- model_path = os.path.join(annotator_ckpts_path,
- "mlsd_large_512_fp32.pdparams")
+ model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pdparams")
if not os.path.exists(model_path):
from basicsr.utils.download_util import load_file_from_url
- load_file_from_url(
- remote_model_path, model_dir=annotator_ckpts_path)
+ load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
self.model = MobileV2_MLSD_Large()
self.model.eval()
self.model.set_dict(paddle.load(model_path))
@@ -43,10 +41,8 @@ def __call__(self, input_image, thr_v, thr_d):
img = input_image
img_output = np.zeros_like(img)
with paddle.no_grad():
- lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]],
- thr_v, thr_d)
+ lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d)
for line in lines:
x_start, y_start, x_end, y_end = [int(val) for val in line]
- cv2.line(img_output, (x_start, y_start), (x_end, y_end),
- [255, 255, 255], 1)
+ cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1)
return img_output[:, :, (0)]
diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py
index c1f08257cff39..d9123b0102d3c 100644
--- a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py
+++ b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py
@@ -20,35 +20,36 @@ class BlockTypeA(paddle.nn.Layer):
def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale=True):
super(BlockTypeA, self).__init__()
self.conv1 = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=in_c2, out_channels=out_c2, kernel_size=1),
+ paddle.nn.Conv2D(in_channels=in_c2, out_channels=out_c2, kernel_size=1),
paddle.nn.BatchNorm2D(
num_features=out_c2,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU(), )
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU(),
+ )
self.conv2 = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=in_c1, out_channels=out_c1, kernel_size=1),
+ paddle.nn.Conv2D(in_channels=in_c1, out_channels=out_c1, kernel_size=1),
paddle.nn.BatchNorm2D(
num_features=out_c1,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU(), )
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU(),
+ )
self.upscale = upscale
def forward(self, a, b):
b = self.conv1(b)
a = self.conv2(a)
if self.upscale:
- b = paddle.nn.functional.interpolate(
- x=b, scale_factor=2.0, mode="bilinear", align_corners=True)
+ b = paddle.nn.functional.interpolate(x=b, scale_factor=2.0, mode="bilinear", align_corners=True)
return paddle.concat(x=(a, b), axis=1)
@@ -56,27 +57,29 @@ class BlockTypeB(paddle.nn.Layer):
def __init__(self, in_c, out_c):
super(BlockTypeB, self).__init__()
self.conv1 = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1),
+ paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1),
paddle.nn.BatchNorm2D(
num_features=in_c,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU(), )
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU(),
+ )
self.conv2 = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1),
+ paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1),
paddle.nn.BatchNorm2D(
num_features=out_c,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU(), )
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU(),
+ )
def forward(self, x):
x = self.conv1(x) + x
@@ -93,28 +96,31 @@ def __init__(self, in_c, out_c):
out_channels=in_c,
kernel_size=3,
padding=5,
- dilation=5, ),
+ dilation=5,
+ ),
paddle.nn.BatchNorm2D(
num_features=in_c,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU(), )
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU(),
+ )
self.conv2 = paddle.nn.Sequential(
- paddle.nn.Conv2D(
- in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1),
+ paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1),
paddle.nn.BatchNorm2D(
num_features=in_c,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU(), )
- self.conv3 = paddle.nn.Conv2D(
- in_channels=in_c, out_channels=out_c, kernel_size=1)
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU(),
+ )
+ self.conv3 = paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=1)
def forward(self, x):
x = self.conv1(x)
@@ -143,8 +149,7 @@ def _make_divisible(v, divisor, min_value=None):
class ConvBNReLU(paddle.nn.Sequential):
- def __init__(self, in_planes, out_planes, kernel_size=3, stride=1,
- groups=1):
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
self.channel_pad = out_planes - in_planes
self.stride = stride
if stride == 2:
@@ -159,23 +164,23 @@ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1,
stride=stride,
padding=padding,
groups=groups,
- bias_attr=False, ),
+ bias_attr=False,
+ ),
paddle.nn.BatchNorm2D(
num_features=out_planes,
momentum=1 - 0.1,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
- use_global_stats=True, ),
- paddle.nn.ReLU6(), )
+ use_global_stats=True,
+ ),
+ paddle.nn.ReLU6(),
+ )
self.max_pool = paddle.nn.MaxPool2D(kernel_size=stride, stride=stride)
def forward(self, x):
if self.stride == 2:
- x = paddle.nn.functional.pad(x=x,
- pad=(0, 1, 0, 1),
- mode="constant",
- value=0)
+ x = paddle.nn.functional.pad(x=x, pad=(0, 1, 0, 1), mode="constant", value=0)
for module in self:
if not isinstance(module, paddle.nn.MaxPool2D):
x = module(x)
@@ -192,24 +197,27 @@ def __init__(self, inp, oup, stride, expand_ratio):
layers = []
if expand_ratio != 1:
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
- layers.extend([
- ConvBNReLU(
- hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
- paddle.nn.Conv2D(
- in_channels=hidden_dim,
- out_channels=oup,
- kernel_size=1,
- stride=1,
- padding=0,
- bias_attr=False, ),
- paddle.nn.BatchNorm2D(
- num_features=oup,
- momentum=1 - 0.1,
- epsilon=1e-05,
- weight_attr=None,
- bias_attr=None,
- use_global_stats=True, ),
- ])
+ layers.extend(
+ [
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+ paddle.nn.Conv2D(
+ in_channels=hidden_dim,
+ out_channels=oup,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias_attr=False,
+ ),
+ paddle.nn.BatchNorm2D(
+ num_features=oup,
+ momentum=1 - 0.1,
+ epsilon=1e-05,
+ weight_attr=None,
+ bias_attr=None,
+ use_global_stats=True,
+ ),
+ ]
+ )
self.conv = paddle.nn.Sequential(*layers)
def forward(self, x):
@@ -244,23 +252,20 @@ def __init__(self):
[6, 64, 4, 2],
[6, 96, 3, 1],
]
- if (len(inverted_residual_setting) == 0 or
- len(inverted_residual_setting[0]) != 4):
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
raise ValueError(
- "inverted_residual_setting should be non-empty or a 4-element list, got {}".
- format(inverted_residual_setting))
- input_channel = _make_divisible(input_channel * width_mult,
- round_nearest)
- self.last_channel = _make_divisible(last_channel * max(1.0, width_mult),
- round_nearest)
+ "inverted_residual_setting should be non-empty or a 4-element list, got {}".format(
+ inverted_residual_setting
+ )
+ )
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
features = [ConvBNReLU(4, input_channel, stride=2)]
for t, c, n, s in inverted_residual_setting:
output_channel = _make_divisible(c * width_mult, round_nearest)
for i in range(n):
stride = s if i == 0 else 1
- features.append(
- block(
- input_channel, output_channel, stride, expand_ratio=t))
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
self.features = paddle.nn.Sequential(*features)
self.fpn_selected = [1, 3, 6, 10, 13]
@@ -295,8 +300,7 @@ class MobileV2_MLSD_Large(paddle.nn.Layer):
def __init__(self):
super(MobileV2_MLSD_Large, self).__init__()
self.backbone = MobileNetV2()
- self.block15 = BlockTypeA(
- in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False)
+ self.block15 = BlockTypeA(in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False)
self.block16 = BlockTypeB(128, 64)
self.block17 = BlockTypeA(in_c1=32, in_c2=64, out_c1=64, out_c2=64)
self.block18 = BlockTypeB(128, 64)
diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py
index e533433631fb1..1ad8429e69fb9 100644
--- a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py
+++ b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py
@@ -17,6 +17,7 @@
import cv2
import numpy as np
import paddle
+
"""
M-LSD
Copyright 2021-present NAVER Corp.
@@ -48,11 +49,7 @@ def zeros_(tensor):
return _no_grad_fill_(tensor, 0)
-def kaiming_normal_(tensor,
- a=0,
- mode="fan_in",
- nonlinearity="leaky_relu",
- reverse=False):
+def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False):
"""
Modified tensor inspace using kaiming_normal_
Args:
@@ -100,13 +97,11 @@ def _calculate_gain(nonlinearity, param=None):
elif nonlinearity == "leaky_relu":
if param is None:
negative_slope = 0.01
- elif (not isinstance(param, bool) and isinstance(param, int) or
- isinstance(param, float)):
+ elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
# True/False are instances of int, hence check above
negative_slope = param
else:
- raise ValueError("negative_slope {} not a valid number".format(
- param))
+ raise ValueError("negative_slope {} not a valid number".format(param))
return math.sqrt(2.0 / (1 + negative_slope**2))
elif nonlinearity == "selu":
return 3.0 / 4
@@ -119,8 +114,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False):
mode = mode.lower()
valid_modes = ["fan_in", "fan_out"]
if mode not in valid_modes:
- raise ValueError("Mode {} not supported, please use one of {}".format(
- mode, valid_modes))
+ raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
@@ -137,9 +131,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False):
Tuple[fan_in, fan_out]
"""
if tensor.ndim < 2:
- raise ValueError(
- "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
- )
+ raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")
if reverse:
num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
@@ -168,8 +160,8 @@ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5):
center = tpMap[:, (0), :, :]
heat = paddle.nn.functional.sigmoid(x=center).unsqueeze(0)
hmax = paddle.nn.functional.max_pool2d(
- kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2,
- x=heat).squeeze(0)
+ kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2, x=heat
+ ).squeeze(0)
keep = (hmax == heat).astype(dtype="float32")
heat = heat * keep
heat = heat.reshape([-1])
@@ -185,21 +177,16 @@ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5):
return ptss, scores, displacement
-def pred_lines(image,
- model,
- input_shape=[512, 512],
- score_thr=0.1,
- dist_thr=20.0):
+def pred_lines(image, model, input_shape=[512, 512], score_thr=0.1, dist_thr=20.0):
h, w, _ = image.shape
h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]
resized_image = np.concatenate(
[
- cv2.resize(
- image, (input_shape[1], input_shape[0]),
- interpolation=cv2.INTER_AREA),
+ cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA),
np.ones([input_shape[0], input_shape[1], 1]),
],
- axis=-1, )
+ axis=-1,
+ )
resized_image = resized_image.transpose((2, 0, 1))
batch_image = np.expand_dims(resized_image, axis=0).astype("float32")
batch_image = batch_image / 127.5 - 1.0
@@ -208,14 +195,13 @@ def pred_lines(image,
pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
start = vmap[:, :, :2]
end = vmap[:, :, 2:]
- dist_map = np.sqrt(np.sum((start - end)**2, axis=-1))
+ dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
segments_list = []
for center, score in zip(pts, pts_score):
y, x = center
distance = dist_map[y, x]
if score > score_thr and distance > dist_thr:
- disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[(y), (
- x), :]
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[(y), (x), :]
x_start = x + disp_x_start
y_start = y + disp_y_start
x_end = x + disp_x_end
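
M-LSD's `pred_lines` decodes each candidate line from a center point plus a four-channel displacement map: the first two channels offset the center toward the start point, the last two toward the end point, and a candidate survives only if both its score and its start-to-end distance clear the thresholds. A small numpy sketch of that decoding step (array shapes are assumptions):

import numpy as np

def decode_segments(pts, pts_score, vmap, score_thr=0.1, dist_thr=20.0):
    # pts: [N, 2] (y, x) centers; pts_score: [N]; vmap: [H, W, 4] displacements
    start, end = vmap[:, :, :2], vmap[:, :, 2:]
    dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
    segments = []
    for (y, x), score in zip(pts, pts_score):
        if score > score_thr and dist_map[y, x] > dist_thr:
            disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
            segments.append([x + disp_x_start, y + disp_y_start, x + disp_x_end, y + disp_y_end])
    return segments

segs = decode_segments(np.array([[10, 20]]), np.array([0.9]), np.random.rand(64, 64, 4) * 50.0)
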
diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py
index 7dc16bd999550..e07f249e8c9fe 100644
--- a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py
@@ -33,25 +33,19 @@ def __call__(self, oriImg, hand=False):
with paddle.no_grad():
canvas = oriImg[:, :, ::-1].copy()
canvas.fill(0)
- result = self.body_estimation.predict(
- oriImg, save_path="saved_images", visualization=False)
- canvas = self.body_estimation.draw_pose(canvas, result["candidate"],
- result["subset"])
+ result = self.body_estimation.predict(oriImg, save_path="saved_images", visualization=False)
+ canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"])
if hand:
- hands_list = util.hand_detect(result["candidate"],
- result["subset"], oriImg)
+ hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg)
all_hand_peaks = []
for x, y, w, is_left in hands_list:
scale_search = [0.5, 1.0, 1.5, 2.0]
peaks = self.hand_estimation.hand_estimation(
- oriImg[y:y + w, x:x + w, :], scale_search=scale_search)
- peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0],
- peaks[:, 0] + x)
- peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1],
- peaks[:, 1] + y)
+ oriImg[y : y + w, x : x + w, :], scale_search=scale_search
+ )
+ peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
+ peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
all_hand_peaks.append(peaks)
canvas = self.hand_estimation.draw_hand(canvas, all_hand_peaks)
- return canvas, dict(
- candidate=result["candidate"].tolist(),
- subset=result["subset"].tolist())
+ return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist())
diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/util.py b/ppdiffusers/examples/controlnet/annotator/openpose/util.py
index 10028380bbd8a..899e38121eaea 100644
--- a/ppdiffusers/examples/controlnet/annotator/openpose/util.py
+++ b/ppdiffusers/examples/controlnet/annotator/openpose/util.py
@@ -47,8 +47,7 @@ def pad_right_down_corner(img, stride, padValue):
def transfer(model, model_weights):
transfered_model_weights = {}
for weights_name in model.state_dict().keys():
- transfered_model_weights[weights_name] = model_weights[".".join(
- weights_name.split(".")[1:])]
+ transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])]
return transfered_model_weights
@@ -114,11 +113,9 @@ def draw_bodypose(canvas, candidate, subset):
X = candidate[index.astype(int), 1]
mX = np.mean(X)
mY = np.mean(Y)
- length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
- polygon = cv2.ellipse2Poly((int(mY), int(mX)),
- (int(length / 2), stickwidth),
- int(angle), 0, 360, 1)
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
return canvas
@@ -158,9 +155,9 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False):
canvas,
(x1, y1),
(x2, y2),
- matplotlib.colors.hsv_to_rgb(
- [ie / float(len(edges)), 1.0, 1.0]) * 255,
- thickness=2, )
+ matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255,
+ thickness=2,
+ )
for i, keyponit in enumerate(peaks):
x, y = keyponit
@@ -173,7 +170,8 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False):
cv2.FONT_HERSHEY_SIMPLEX,
0.3,
(0, 0, 0),
- lineType=cv2.LINE_AA, )
+ lineType=cv2.LINE_AA,
+ )
return canvas
@@ -194,16 +192,14 @@ def hand_detect(candidate, subset, oriImg):
hands = []
# left hand
if has_left:
- left_shoulder_index, left_elbow_index, left_wrist_index = person[
- [5, 6, 7]]
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
x1, y1 = candidate[left_shoulder_index][:2]
x2, y2 = candidate[left_elbow_index][:2]
x3, y3 = candidate[left_wrist_index][:2]
hands.append([x1, y1, x2, y2, x3, y3, True])
# right hand
if has_right:
- right_shoulder_index, right_elbow_index, right_wrist_index = person[
- [2, 3, 4]]
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
x1, y1 = candidate[right_shoulder_index][:2]
x2, y2 = candidate[right_elbow_index][:2]
x3, y3 = candidate[right_wrist_index][:2]
@@ -218,8 +214,8 @@ def hand_detect(candidate, subset, oriImg):
# handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
x = x3 + ratioWristElbow * (x3 - x2)
y = y3 + ratioWristElbow * (y3 - y2)
- distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2)
- distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
# x-y refers to the center --> offset to topLeft point
# handRectangle.x -= handRectangle.width / 2.f;
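
The hand detector above derives a square crop from three body keypoints: it extrapolates past the wrist along the elbow-to-wrist direction by `ratioWristElbow`, sizes the box as 1.5x the larger of the wrist-elbow distance and 0.9x the elbow-shoulder distance, and then (per the trailing comments) shifts the center to a top-left corner. A standalone sketch of that geometry; the default ratio and the keypoint values are made up, since `ratioWristElbow` is defined outside this hunk:

import math

def hand_box(shoulder, elbow, wrist, ratio_wrist_elbow=0.33):
    (x1, y1), (x2, y2), (x3, y3) = shoulder, elbow, wrist
    # Extrapolate the box center beyond the wrist, along the elbow -> wrist direction.
    x = x3 + ratio_wrist_elbow * (x3 - x2)
    y = y3 + ratio_wrist_elbow * (y3 - y2)
    distance_wrist_elbow = math.hypot(x3 - x2, y3 - y2)
    distance_elbow_shoulder = math.hypot(x2 - x1, y2 - y1)
    width = 1.5 * max(distance_wrist_elbow, 0.9 * distance_elbow_shoulder)
    # Shift from the box center to its top-left corner, as in the C++ comments above.
    return x - width / 2.0, y - width / 2.0, width

print(hand_box((100.0, 100.0), (130.0, 150.0), (150.0, 190.0)))
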
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py
index 0bb742e72d02a..d2d5ee7249851 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py
@@ -39,10 +39,8 @@ def keypoint_to_openpose_kpts(coco_keypoints_list):
l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index]
r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index]
- neck_keypoint_y = int(
- (l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0)
- neck_keypoint_x = int(
- (l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0)
+ neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0)
+ neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0)
neck_keypoint = [
neck_keypoint_x,
neck_keypoint_y,
@@ -65,33 +63,24 @@ def __call__(self, oriImg, detect_resolution=512, hand=False):
img_scalarfactor = detect_resolution / min(oriImg.shape[:2])
result = self.ppdetpose_pred(oriImg)
result["candidate"] = result["candidate"] * img_scalarfactor
- oriImg = cv2.resize(
- oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor)
+ oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor)
canvas = oriImg.copy()
canvas.fill(0)
- canvas = self.body_estimation.draw_pose(canvas, result["candidate"],
- result["subset"])
+ canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"])
if hand:
- hands_list = util.hand_detect(result["candidate"],
- result["subset"], oriImg)
+ hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg)
all_hand_peaks = []
for x, y, w, is_left in hands_list:
- scale_search = [
- x * img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0]
- ]
+ scale_search = [x * img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0]]
peaks = self.hand_estimation.hand_estimation(
- oriImg[y:y + w, x:x + w, ::-1],
- scale_search=scale_search)
- peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0],
- peaks[:, 0] + x)
- peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1],
- peaks[:, 1] + y)
+ oriImg[y : y + w, x : x + w, ::-1], scale_search=scale_search
+ )
+ peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
+ peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
all_hand_peaks.append(peaks)
canvas = util.draw_handpose(canvas, all_hand_peaks)
- return canvas, dict(
- candidate=result["candidate"].tolist(),
- subset=result["subset"].tolist())
+ return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist())
def ppdetpose_pred(self, image, kpt_threshold=0.3):
poseres = self.ppdetpose.ppdet_hrnet_infer(image)
@@ -105,7 +94,12 @@ def ppdetpose_pred(self, image, kpt_threshold=0.3):
for idx, item in enumerate(openpose_kpts):
if item[2] > kpt_threshold:
subset[kptid][idx] = posnum
- kpt = np.array(item + [posnum, ])
+ kpt = np.array(
+ item
+ + [
+ posnum,
+ ]
+ )
candidate = np.vstack((candidate, kpt))
posnum += 1
return {"candidate": candidate, "subset": subset}
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py
index 86f7aca10c143..9236875761299 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py
@@ -25,13 +25,14 @@
class PaddleInferBenchmark(object):
def __init__(
- self,
- config,
- model_info: dict={},
- data_info: dict={},
- perf_info: dict={},
- resource_info: dict={},
- **kwargs, ):
+ self,
+ config,
+ model_info: dict = {},
+ data_info: dict = {},
+ perf_info: dict = {},
+ resource_info: dict = {},
+ **kwargs,
+ ):
"""
Construct PaddleInferBenchmark Class to format logs.
args:
@@ -84,8 +85,7 @@ def __init__(
self.inference_time_s = round(perf_info["inference_time_s"], 4)
except:
self.print_help()
- raise ValueError(
- "Set argument wrong, please check input argument and its type")
+ raise ValueError("Set argument wrong, please check input argument and its type")
self.preprocess_time_s = perf_info.get("preprocess_time_s", 0)
self.postprocess_time_s = perf_info.get("postprocess_time_s", 0)
@@ -142,13 +142,12 @@ def benchmark_logger(self):
level=logging.INFO,
format=FORMAT,
handlers=[
- logging.FileHandler(
- filename=log_output, mode="w"),
+ logging.FileHandler(filename=log_output, mode="w"),
logging.StreamHandler(),
- ], )
+ ],
+ )
self.logger = logging.getLogger(__name__)
- self.logger.info(
- f"Paddle Inference benchmark log will be saved to {log_output}")
+ self.logger.info(f"Paddle Inference benchmark log will be saved to {log_output}")
def parse_config(self, config) -> dict:
"""
@@ -160,28 +159,22 @@ def parse_config(self, config) -> dict:
"""
if isinstance(config, paddle_infer.Config):
config_status = {}
- config_status["runtime_device"] = "gpu" if config.use_gpu(
- ) else "cpu"
+ config_status["runtime_device"] = "gpu" if config.use_gpu() else "cpu"
config_status["ir_optim"] = config.ir_optim()
config_status["enable_tensorrt"] = config.tensorrt_engine_enabled()
config_status["precision"] = self.precision
config_status["enable_mkldnn"] = config.mkldnn_enabled()
- config_status[
- "cpu_math_library_num_threads"] = config.cpu_math_library_num_threads(
- )
+ config_status["cpu_math_library_num_threads"] = config.cpu_math_library_num_threads()
elif isinstance(config, dict):
config_status["runtime_device"] = config.get("runtime_device", "")
config_status["ir_optim"] = config.get("ir_optim", "")
config_status["enable_tensorrt"] = config.get("enable_tensorrt", "")
config_status["precision"] = config.get("precision", "")
config_status["enable_mkldnn"] = config.get("enable_mkldnn", "")
- config_status["cpu_math_library_num_threads"] = config.get(
- "cpu_math_library_num_threads", "")
+ config_status["cpu_math_library_num_threads"] = config.get("cpu_math_library_num_threads", "")
else:
self.print_help()
- raise ValueError(
- "Set argument config wrong, please check input argument and its type"
- )
+ raise ValueError("Set argument config wrong, please check input argument and its type")
return config_status
def report(self, identifier=None):
@@ -196,54 +189,43 @@ def report(self, identifier=None):
identifier = ""
self.logger.info("\n")
- self.logger.info(
- "---------------------- Paddle info ----------------------")
+ self.logger.info("---------------------- Paddle info ----------------------")
self.logger.info(f"{identifier} paddle_version: {self.paddle_version}")
self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}")
self.logger.info(f"{identifier} paddle_branch: {self.paddle_branch}")
self.logger.info(f"{identifier} log_api_version: {self.log_version}")
- self.logger.info(
- "----------------------- Conf info -----------------------")
- self.logger.info(
- f"{identifier} runtime_device: {self.config_status['runtime_device']}"
- )
- self.logger.info(
- f"{identifier} ir_optim: {self.config_status['ir_optim']}")
+ self.logger.info("----------------------- Conf info -----------------------")
+ self.logger.info(f"{identifier} runtime_device: {self.config_status['runtime_device']}")
+ self.logger.info(f"{identifier} ir_optim: {self.config_status['ir_optim']}")
self.logger.info(f"{identifier} enable_memory_optim: {True}")
- self.logger.info(
- f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}"
- )
- self.logger.info(
- f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}")
+ self.logger.info(f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}")
+ self.logger.info(f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}")
self.logger.info(
f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}"
)
- self.logger.info(
- "----------------------- Model info ----------------------")
+ self.logger.info("----------------------- Model info ----------------------")
self.logger.info(f"{identifier} model_name: {self.model_name}")
self.logger.info(f"{identifier} precision: {self.precision}")
- self.logger.info(
- "----------------------- Data info -----------------------")
+ self.logger.info("----------------------- Data info -----------------------")
self.logger.info(f"{identifier} batch_size: {self.batch_size}")
self.logger.info(f"{identifier} input_shape: {self.shape}")
self.logger.info(f"{identifier} data_num: {self.data_num}")
- self.logger.info(
- "----------------------- Perf info -----------------------")
+ self.logger.info("----------------------- Perf info -----------------------")
self.logger.info(
f"{identifier} cpu_rss(MB): {self.cpu_rss_mb}, cpu_vms: {self.cpu_vms_mb}, cpu_shared_mb: {self.cpu_shared_mb}, cpu_dirty_mb: {self.cpu_dirty_mb}, cpu_util: {self.cpu_util}%"
)
self.logger.info(
f"{identifier} gpu_rss(MB): {self.gpu_rss_mb}, gpu_util: {self.gpu_util}%, gpu_mem_util: {self.gpu_mem_util}%"
)
- self.logger.info(
- f"{identifier} total time spent(s): {self.total_time_s}")
+ self.logger.info(f"{identifier} total time spent(s): {self.total_time_s}")
if self.with_tracker:
self.logger.info(
f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, "
f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, "
f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, "
- f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}")
+ f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}"
+ )
else:
self.logger.info(
f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, "
@@ -261,7 +243,8 @@ def print_help(self):
"""
print function help
"""
- print("""Usage:
+ print(
+ """Usage:
==== Print inference benchmark logs. ====
config = paddle.inference.Config()
model_info = {'model_name': 'resnet50'
@@ -278,7 +261,8 @@ def print_help(self):
'gpu_util': 60}
log = PaddleInferBenchmark(config, model_info, data_info, perf_info, resource_info)
log('Test')
- """)
+ """
+ )
def __call__(self, identifier=None):
"""
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py
index a89c4c830c5be..3d3a8578fd2bd 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py
@@ -24,8 +24,7 @@
from .det_keypoint_unite_utils import argsparser
from .infer import PredictConfig # noqa F401
-from .infer import (Detector, DetectorPicoDet, bench_log, get_test_images,
- print_arguments)
+from .infer import bench_log, get_test_images, print_arguments
from .keypoint_infer import KeyPointDetector
from .keypoint_postprocess import translate_to_ori_images
from .preprocess import decode_image
@@ -38,12 +37,10 @@
}
-def predict_with_given_det(image, det_res, keypoint_detector,
- keypoint_batch_size, run_benchmark):
+def predict_with_given_det(image, det_res, keypoint_detector, keypoint_batch_size, run_benchmark):
keypoint_res = {}
- rec_images, records, det_rects = keypoint_detector.get_person_from_rect(
- image, det_res)
+ rec_images, records, det_rects = keypoint_detector.get_person_from_rect(image, det_res)
if len(det_rects) == 0:
keypoint_res["keypoint"] = [[], []]
@@ -53,23 +50,22 @@ def predict_with_given_det(image, det_res, keypoint_detector,
score_vector = []
rect_vector = det_rects
- keypoint_results = keypoint_detector.predict_image(
- rec_images, run_benchmark, repeats=10, visual=False)
- keypoint_vector, score_vector = translate_to_ori_images(keypoint_results,
- np.array(records))
+ keypoint_results = keypoint_detector.predict_image(rec_images, run_benchmark, repeats=10, visual=False)
+ keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, np.array(records))
keypoint_res["keypoint"] = (
- [keypoint_vector.tolist(), score_vector.tolist()]
- if len(keypoint_vector) > 0 else [[], []])
+ [keypoint_vector.tolist(), score_vector.tolist()] if len(keypoint_vector) > 0 else [[], []]
+ )
keypoint_res["bbox"] = rect_vector
return keypoint_res
def topdown_unite_predict(
- detector,
- topdown_keypoint_detector,
- image_list,
- keypoint_batch_size=1,
- save_res=False, ):
+ detector,
+ topdown_keypoint_detector,
+ image_list,
+ keypoint_batch_size=1,
+ save_res=False,
+):
det_timer = detector.get_timer()
store_res = []
for i, img_file in enumerate(image_list):
@@ -79,8 +75,7 @@ def topdown_unite_predict(
det_timer.preprocess_time_s.end()
if FLAGS.run_benchmark:
- results = detector.predict_image(
- [image], run_benchmark=True, repeats=10)
+ results = detector.predict_image([image], run_benchmark=True, repeats=10)
cm, gm, gu = get_current_memory_mb()
detector.cpu_mem += cm
@@ -95,15 +90,18 @@ def topdown_unite_predict(
results,
topdown_keypoint_detector,
keypoint_batch_size,
- FLAGS.run_benchmark, )
+ FLAGS.run_benchmark,
+ )
if save_res:
save_name = img_file if isinstance(img_file, str) else i
- store_res.append([
- save_name,
- keypoint_res["bbox"],
- [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]],
- ])
+ store_res.append(
+ [
+ save_name,
+ keypoint_res["bbox"],
+ [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]],
+ ]
+ )
else:
results["keypoint"] = [[], []]
keypoint_res = results
@@ -119,7 +117,8 @@ def topdown_unite_predict(
img_file,
keypoint_res,
visual_thresh=FLAGS.keypoint_threshold,
- save_dir=FLAGS.output_dir, )
+ save_dir=FLAGS.output_dir,
+ )
if save_res:
"""
1) store_res: a list of image_data
@@ -133,18 +132,17 @@ def topdown_unite_predict(
def topdown_unite_predict_singleimage(
- detector,
- topdown_keypoint_detector,
- image,
- keypoint_batch_size=8,
- det_threshold=0.25, ):
+ detector,
+ topdown_keypoint_detector,
+ image,
+ keypoint_batch_size=8,
+ det_threshold=0.25,
+):
results = detector.predict_image([image], visual=False)
results = detector.filter_box(results, det_threshold)
if results["boxes_num"] > 0:
- keypoint_res = predict_with_given_det(image, results,
- topdown_keypoint_detector,
- keypoint_batch_size, False)
+ keypoint_res = predict_with_given_det(image, results, topdown_keypoint_detector, keypoint_batch_size, False)
else:
results["keypoint"] = [[], []]
@@ -153,11 +151,12 @@ def topdown_unite_predict_singleimage(
def topdown_unite_predict_video(
- detector,
- topdown_keypoint_detector,
- camera_id,
- keypoint_batch_size=1,
- save_res=False, ):
+ detector,
+ topdown_keypoint_detector,
+ camera_id,
+ keypoint_batch_size=1,
+ save_res=False,
+):
video_name = "output.mp4"
if camera_id != -1:
capture = cv2.VideoCapture(camera_id)
@@ -174,12 +173,11 @@ def topdown_unite_predict_video(
if not os.path.exists(FLAGS.output_dir):
os.makedirs(FLAGS.output_dir)
out_path = os.path.join(FLAGS.output_dir, video_name)
- fourcc = cv2.VideoWriter_fourcc(* "mp4v")
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
index = 0
store_res = []
- keypoint_smoothing = KeypointSmoothing(
- width, height, filter_type=FLAGS.filter_type, beta=0.05)
+ keypoint_smoothing = KeypointSmoothing(width, height, filter_type=FLAGS.filter_type, beta=0.05)
while 1:
ret, frame = capture.read()
@@ -201,27 +199,25 @@ def topdown_unite_predict_video(
results,
topdown_keypoint_detector,
keypoint_batch_size,
- FLAGS.run_benchmark, )
+ FLAGS.run_benchmark,
+ )
if FLAGS.smooth and len(keypoint_res["keypoint"][0]) == 1:
current_keypoints = np.array(keypoint_res["keypoint"][0][0])
- smooth_keypoints = keypoint_smoothing.smooth_process(
- current_keypoints)
+ smooth_keypoints = keypoint_smoothing.smooth_process(current_keypoints)
keypoint_res["keypoint"][0][0] = smooth_keypoints.tolist()
- im = visualize_pose(
- frame,
- keypoint_res,
- visual_thresh=FLAGS.keypoint_threshold,
- returnimg=True)
+ im = visualize_pose(frame, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, returnimg=True)
if save_res:
- store_res.append([
- index,
- keypoint_res["bbox"],
- [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]],
- ])
+ store_res.append(
+ [
+ index,
+ keypoint_res["bbox"],
+ [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]],
+ ]
+ )
writer.write(im)
if camera_id != -1:
@@ -247,37 +243,43 @@ class KeypointSmoothing(object):
# https://github.com/jaantollander/OneEuroFilter
def __init__(
- self,
- width,
- height,
- filter_type,
- alpha=0.5,
- fc_d=0.1,
- fc_min=0.1,
- beta=0.1,
- thres_mult=0.3, ):
+ self,
+ width,
+ height,
+ filter_type,
+ alpha=0.5,
+ fc_d=0.1,
+ fc_min=0.1,
+ beta=0.1,
+ thres_mult=0.3,
+ ):
super(KeypointSmoothing, self).__init__()
self.image_width = width
self.image_height = height
- self.threshold = (np.array([
- 0.005,
- 0.005,
- 0.005,
- 0.005,
- 0.005,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- 0.01,
- ]) * thres_mult)
+ self.threshold = (
+ np.array(
+ [
+ 0.005,
+ 0.005,
+ 0.005,
+ 0.005,
+ 0.005,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ 0.01,
+ ]
+ )
+ * thres_mult
+ )
self.filter_type = filter_type
self.alpha = alpha
self.dx_prev_hat = None
@@ -302,20 +304,18 @@ def smooth_process(self, current_keypoints):
result = current_keypoints
num_keypoints = len(current_keypoints)
for i in range(num_keypoints):
- result[i, :2] = self.smooth(current_keypoints[i, :2],
- self.threshold[i], i)
+ result[i, :2] = self.smooth(current_keypoints[i, :2], self.threshold[i], i)
return result
def smooth(self, current_keypoint, threshold, index):
distance = np.sqrt(
- np.square((current_keypoint[0] - self.x_prev_hat[index][0]) /
- self.image_width) + np.square((current_keypoint[
- 1] - self.x_prev_hat[index][1]) / self.image_height))
+ np.square((current_keypoint[0] - self.x_prev_hat[index][0]) / self.image_width)
+ + np.square((current_keypoint[1] - self.x_prev_hat[index][1]) / self.image_height)
+ )
if distance < threshold:
result = self.x_prev_hat[index]
else:
- result = self.smooth_func(current_keypoint, self.x_prev_hat[index],
- index)
+ result = self.smooth_func(current_keypoint, self.x_prev_hat[index], index)
return result
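As context for the hunks above: `KeypointSmoothing` follows the One Euro filter (see the linked reference), which adapts an exponential-smoothing factor to the keypoint's speed so that jitter is damped while the point is still but lag stays low when it moves quickly. A minimal single-coordinate sketch under the conventional parameterisation; `te`, `fc_min`, `fc_d`, and `beta` are illustrative names and are not meant to mirror the class's exact attributes:

```python
import math

def smoothing_factor(te, fc):
    # Convert a cutoff frequency into an exponential-smoothing alpha for frame interval te.
    r = 2 * math.pi * fc * te
    return r / (r + 1)

def one_euro_step(x_cur, x_prev_hat, dx_prev_hat, te=1 / 30, fc_min=0.1, fc_d=0.1, beta=0.1):
    # Smooth the derivative first, then raise the cutoff in proportion to the speed.
    a_d = smoothing_factor(te, fc_d)
    dx = (x_cur - x_prev_hat) / te
    dx_hat = a_d * dx + (1 - a_d) * dx_prev_hat
    fc = fc_min + beta * abs(dx_hat)
    a = smoothing_factor(te, fc)
    x_hat = a * x_cur + (1 - a) * x_prev_hat
    return x_hat, dx_hat

print(one_euro_step(x_cur=101.0, x_prev_hat=100.0, dx_prev_hat=0.0))
```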
@@ -360,15 +360,13 @@ def exponential_smoothing(self, x_cur, x_pre, index=0):
det_threshold = 0.4
if not os.path.exists(det_model_dir):
- detmodel_url = "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip"
- get_path_from_url_with_filelock(
- detmodel_url, root_dir="annotator/ppdet_hrnet/models/")
-if not os.path.exists(keypoint_model_dir):
- kptmodel_url = (
- "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip"
+ detmodel_url = (
+ "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip"
)
- get_path_from_url_with_filelock(
- kptmodel_url, root_dir="annotator/ppdet_hrnet/models/")
+ get_path_from_url_with_filelock(detmodel_url, root_dir="annotator/ppdet_hrnet/models/")
+if not os.path.exists(keypoint_model_dir):
+ kptmodel_url = "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip"
+ get_path_from_url_with_filelock(kptmodel_url, root_dir="annotator/ppdet_hrnet/models/")
class PPDetPose(object):
@@ -391,7 +389,8 @@ def __init__(self) -> None:
trt_calib_mode=trt_calib_mode,
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn,
- threshold=det_threshold, )
+ threshold=det_threshold,
+ )
self.topdown_keypoint_detector = KeyPointDetector(
keypoint_model_dir,
@@ -404,7 +403,8 @@ def __init__(self) -> None:
trt_calib_mode=trt_calib_mode,
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn,
- use_dark=use_dark, )
+ use_dark=use_dark,
+ )
keypoint_arch = self.topdown_keypoint_detector.pred_config.arch
assert (
KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown"
@@ -417,7 +417,8 @@ def ppdet_hrnet_infer(self, image):
self.topdown_keypoint_detector,
image,
keypoint_batch_size,
- det_threshold, )
+ det_threshold,
+ )
def main():
@@ -439,7 +440,8 @@ def main():
trt_calib_mode=FLAGS.trt_calib_mode,
cpu_threads=FLAGS.cpu_threads,
enable_mkldnn=FLAGS.enable_mkldnn,
- threshold=FLAGS.det_threshold, )
+ threshold=FLAGS.det_threshold,
+ )
topdown_keypoint_detector = KeyPointDetector(
FLAGS.keypoint_model_dir,
@@ -452,7 +454,8 @@ def main():
trt_calib_mode=FLAGS.trt_calib_mode,
cpu_threads=FLAGS.cpu_threads,
enable_mkldnn=FLAGS.enable_mkldnn,
- use_dark=FLAGS.use_dark, )
+ use_dark=FLAGS.use_dark,
+ )
keypoint_arch = topdown_keypoint_detector.pred_config.arch
assert (
KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown"
@@ -465,7 +468,8 @@ def main():
topdown_keypoint_detector,
FLAGS.camera_id,
FLAGS.keypoint_batch_size,
- FLAGS.save_res, )
+ FLAGS.save_res,
+ )
else:
# predict from image
img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
@@ -474,7 +478,8 @@ def main():
topdown_keypoint_detector,
img_list,
FLAGS.keypoint_batch_size,
- FLAGS.save_res, )
+ FLAGS.save_res,
+ )
if not FLAGS.run_benchmark:
detector.det_times.info(average=True)
topdown_keypoint_detector.det_times.info(average=True)
@@ -496,7 +501,8 @@ def main():
img_list,
keypoint_model_info,
FLAGS.keypoint_batch_size,
- "KeyPoint", )
+ "KeyPoint",
+ )
if __name__ == "__main__":
@@ -505,7 +511,6 @@ def main():
FLAGS = parser.parse_args()
print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
- assert FLAGS.device in ["CPU", "GPU", "XPU"
- ], "device should be CPU, GPU or XPU"
+ assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU"
main()
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py
index 5290e03d818fa..0d023a6d28d57 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py
@@ -22,58 +22,60 @@ def argsparser():
"--det_model_dir",
type=str,
default=None,
- help=("Directory include:'model.pdiparams', 'model.pdmodel', "
- "'infer_cfg.yml', created by tools/export_model.py."),
- required=True, )
+ help=(
+ "Directory include:'model.pdiparams', 'model.pdmodel', "
+ "'infer_cfg.yml', created by tools/export_model.py."
+ ),
+ required=True,
+ )
parser.add_argument(
"--keypoint_model_dir",
type=str,
default=None,
- help=("Directory include:'model.pdiparams', 'model.pdmodel', "
- "'infer_cfg.yml', created by tools/export_model.py."),
- required=True, )
- parser.add_argument(
- "--image_file", type=str, default=None, help="Path of image file.")
+ help=(
+ "Directory include:'model.pdiparams', 'model.pdmodel', "
+ "'infer_cfg.yml', created by tools/export_model.py."
+ ),
+ required=True,
+ )
+ parser.add_argument("--image_file", type=str, default=None, help="Path of image file.")
parser.add_argument(
"--image_dir",
type=str,
default=None,
- help="Dir of image file, `image_file` has a higher priority.", )
+ help="Dir of image file, `image_file` has a higher priority.",
+ )
parser.add_argument(
"--keypoint_batch_size",
type=int,
default=8,
- help=("batch_size for keypoint inference. In detection-keypoint unit"
- "inference, the batch size in detection is 1. Then collate det "
- "result in batch for keypoint inference."), )
+ help=(
+ "batch_size for keypoint inference. In detection-keypoint unit"
+ "inference, the batch size in detection is 1. Then collate det "
+ "result in batch for keypoint inference."
+ ),
+ )
parser.add_argument(
"--video_file",
type=str,
default=None,
help="Path of video file, `video_file` or `camera_id` has a highest priority.",
)
- parser.add_argument(
- "--camera_id",
- type=int,
- default=-1,
- help="device id of camera to predict.")
- parser.add_argument(
- "--det_threshold", type=float, default=0.5, help="Threshold of score.")
- parser.add_argument(
- "--keypoint_threshold",
- type=float,
- default=0.5,
- help="Threshold of score.")
+ parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.")
+ parser.add_argument("--det_threshold", type=float, default=0.5, help="Threshold of score.")
+ parser.add_argument("--keypoint_threshold", type=float, default=0.5, help="Threshold of score.")
parser.add_argument(
"--output_dir",
type=str,
default="output",
- help="Directory of output visualization files.", )
+ help="Directory of output visualization files.",
+ )
parser.add_argument(
"--run_mode",
type=str,
default="paddle",
- help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", )
+ help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)",
+ )
parser.add_argument(
"--device",
type=str,
@@ -84,32 +86,24 @@ def argsparser():
"--run_benchmark",
type=ast.literal_eval,
default=False,
- help="Whether to predict a image_file repeatedly for benchmark", )
+ help="Whether to predict a image_file repeatedly for benchmark",
+ )
parser.add_argument(
"--enable_mkldnn",
type=ast.literal_eval,
default=False,
- help="Whether use mkldnn with CPU.", )
- parser.add_argument(
- "--cpu_threads", type=int, default=1, help="Num of threads with CPU.")
- parser.add_argument(
- "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.")
- parser.add_argument(
- "--trt_max_shape",
- type=int,
- default=1280,
- help="max_shape for TensorRT.")
- parser.add_argument(
- "--trt_opt_shape",
- type=int,
- default=640,
- help="opt_shape for TensorRT.")
+ help="Whether use mkldnn with CPU.",
+ )
+ parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.")
+ parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.")
+ parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.")
+ parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.")
parser.add_argument(
"--trt_calib_mode",
type=bool,
default=False,
- help="If the model is produced by TRT offline quantitative "
- "calibration, trt_calib_mode need to set True.", )
+ help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.",
+ )
parser.add_argument(
"--use_dark",
type=ast.literal_eval,
@@ -126,7 +120,9 @@ def argsparser():
"2) image_data: [imageid, rects, [keypoints, scores]]"
"3) rects: list of rect [xmin, ymin, xmax, ymax]"
"4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list"
- "5) scores: mean of all joint conf"), )
+ "5) scores: mean of all joint conf"
+ ),
+ )
parser.add_argument(
"--smooth",
type=ast.literal_eval,
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py
index a2a9769e224e0..6d4135cdfb9a6 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py
@@ -25,16 +25,17 @@
from paddle.inference import Config, create_predictor
from .benchmark_utils import PaddleInferBenchmark
-from .keypoint_preprocess import (
- EvalAffine,
- TopDownEvalAffine, # noqa F401
- expand_crop)
+from .keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop # noqa F401
from .picodet_postprocess import PicoDetPostProcess
from .preprocess import Pad # noqa F401
-from .preprocess import (LetterBoxResize, NormalizeImage, PadStride, Permute,
- Resize, WarpAffine, decode_image, preprocess)
-from .utils import (Timer, argsparser, coco_clsid2catid, get_current_memory_mb,
- multiclass_nms)
+from .preprocess import preprocess
+from .utils import (
+ Timer,
+ argsparser,
+ coco_clsid2catid,
+ get_current_memory_mb,
+ multiclass_nms,
+)
from .visualize import visualize_box_mask
# Global dictionary
@@ -81,8 +82,7 @@ def bench_log(detector, img_list, model_info, batch_size=1, name=None):
"shape": "dynamic_shape",
"data_num": perf_info["img_num"],
}
- log = PaddleInferBenchmark(detector.config, model_info, data_info,
- perf_info, mems)
+ log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems)
log(name)
@@ -109,21 +109,22 @@ class Detector(object):
"""
def __init__(
- self,
- model_dir,
- device="CPU",
- run_mode="paddle",
- batch_size=1,
- trt_min_shape=1,
- trt_max_shape=1280,
- trt_opt_shape=640,
- trt_calib_mode=False,
- cpu_threads=1,
- enable_mkldnn=False,
- enable_mkldnn_bfloat16=False,
- output_dir="output",
- threshold=0.5,
- delete_shuffle_pass=False, ):
+ self,
+ model_dir,
+ device="CPU",
+ run_mode="paddle",
+ batch_size=1,
+ trt_min_shape=1,
+ trt_max_shape=1280,
+ trt_opt_shape=640,
+ trt_calib_mode=False,
+ cpu_threads=1,
+ enable_mkldnn=False,
+ enable_mkldnn_bfloat16=False,
+ output_dir="output",
+ threshold=0.5,
+ delete_shuffle_pass=False,
+ ):
self.pred_config = self.set_config(model_dir)
self.predictor, self.config = load_predictor(
model_dir,
@@ -140,7 +141,8 @@ def __init__(
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn,
enable_mkldnn_bfloat16=enable_mkldnn_bfloat16,
- delete_shuffle_pass=delete_shuffle_pass, )
+ delete_shuffle_pass=delete_shuffle_pass,
+ )
self.det_times = Timer()
self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0
self.batch_size = batch_size
@@ -177,9 +179,7 @@ def preprocess(self, image_list):
def postprocess(self, inputs, result):
# postprocess output of predictor
np_boxes_num = result["boxes_num"]
- assert isinstance(
- np_boxes_num,
- np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`"
+ assert isinstance(np_boxes_num, np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`"
result = {k: v for k, v in result.items() if v is not None}
return result
@@ -192,7 +192,7 @@ def filter_box(self, result, threshold):
filter_num = []
for i in range(len(np_boxes_num)):
boxes_num = np_boxes_num[i]
- boxes_i = boxes[start_idx:start_idx + boxes_num, :]
+ boxes_i = boxes[start_idx : start_idx + boxes_num, :]
idx = boxes_i[:, 1] > threshold
filter_boxes_i = boxes_i[idx, :]
filter_boxes.append(filter_boxes_i)
@@ -220,8 +220,7 @@ def predict(self, repeats=1, run_benchmark=False):
for i in range(repeats):
self.predictor.run()
paddle.device.cuda.synchronize()
- result = dict(
- boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num)
+ result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num)
return result
for i in range(repeats):
@@ -258,17 +257,18 @@ def get_timer(self):
return self.det_times
def predict_image_slice(
- self,
- img_list,
- slice_size=[640, 640],
- overlap_ratio=[0.25, 0.25],
- combine_method="nms",
- match_threshold=0.6,
- match_metric="ios",
- run_benchmark=False,
- repeats=1,
- visual=True,
- save_results=False, ):
+ self,
+ img_list,
+ slice_size=[640, 640],
+ overlap_ratio=[0.25, 0.25],
+ combine_method="nms",
+ match_threshold=0.6,
+ match_metric="ios",
+ run_benchmark=False,
+ repeats=1,
+ visual=True,
+ save_results=False,
+ ):
# slice infer only support bs=1
results = []
try:
@@ -287,14 +287,13 @@ def predict_image_slice(
slice_height=slice_size[0],
slice_width=slice_size[1],
overlap_height_ratio=overlap_ratio[0],
- overlap_width_ratio=overlap_ratio[1], )
+ overlap_width_ratio=overlap_ratio[1],
+ )
sub_img_num = len(slice_image_result)
merged_bboxs = []
print("slice to {} sub_samples.", sub_img_num)
- batch_image_list = [
- slice_image_result.images[_ind] for _ind in range(sub_img_num)
- ]
+ batch_image_list = [slice_image_result.images[_ind] for _ind in range(sub_img_num)]
if run_benchmark:
# preprocess
inputs = self.preprocess(batch_image_list) # warmup
@@ -341,10 +340,8 @@ def predict_image_slice(
boxes_num = result["boxes_num"][_ind]
ed = st + boxes_num
shift_amount = slice_image_result.starting_pixels[_ind]
- result["boxes"][st:ed][:, 2:4] = (
- result["boxes"][st:ed][:, 2:4] + shift_amount)
- result["boxes"][st:ed][:, 4:6] = (
- result["boxes"][st:ed][:, 4:6] + shift_amount)
+ result["boxes"][st:ed][:, 2:4] = result["boxes"][st:ed][:, 2:4] + shift_amount
+ result["boxes"][st:ed][:, 4:6] = result["boxes"][st:ed][:, 4:6] + shift_amount
merged_bboxs.append(result["boxes"][st:ed])
st = ed
@@ -354,16 +351,14 @@ def predict_image_slice(
np.concatenate(merged_bboxs),
num_classes,
match_threshold,
- match_metric, )
+ match_metric,
+ )
merged_results["boxes"] = np.concatenate(final_boxes)
elif combine_method == "concat":
merged_results["boxes"] = np.concatenate(merged_bboxs)
else:
- raise ValueError(
- "Now only support 'nms' or 'concat' to fuse detection results."
- )
- merged_results["boxes_num"] = np.array(
- [len(merged_results["boxes"])], dtype=np.int32)
+ raise ValueError("Now only support 'nms' or 'concat' to fuse detection results.")
+ merged_results["boxes_num"] = np.array([len(merged_results["boxes"])], dtype=np.int32)
if visual:
visualize(
@@ -371,24 +366,25 @@ def predict_image_slice(
merged_results,
self.pred_config.labels,
output_dir=self.output_dir,
- threshold=self.threshold, )
+ threshold=self.threshold,
+ )
results.append(merged_results)
results = self.merge_batch_result(results)
if save_results:
Path(self.output_dir).mkdir(exist_ok=True)
- self.save_coco_results(
- img_list, results, use_coco_category=FLAGS.use_coco_category)
+ self.save_coco_results(img_list, results, use_coco_category=FLAGS.use_coco_category)
return results
def predict_image(
- self,
- image_list,
- run_benchmark=False,
- repeats=1,
- visual=True,
- save_results=False, ):
+ self,
+ image_list,
+ run_benchmark=False,
+ repeats=1,
+ visual=True,
+ save_results=False,
+ ):
batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size)
results = []
for i in range(batch_loop_cnt):
@@ -442,13 +438,13 @@ def predict_image(
result,
self.pred_config.labels,
output_dir=self.output_dir,
- threshold=self.threshold, )
+ threshold=self.threshold,
+ )
results.append(result)
results = self.merge_batch_result(results)
if save_results:
Path(self.output_dir).mkdir(exist_ok=True)
- self.save_coco_results(
- image_list, results, use_coco_category=FLAGS.use_coco_category)
+ self.save_coco_results(image_list, results, use_coco_category=FLAGS.use_coco_category)
return results
def predict_video(self, video_file, camera_id):
@@ -468,7 +464,7 @@ def predict_video(self, video_file, camera_id):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
out_path = os.path.join(self.output_dir, video_out_name)
- fourcc = cv2.VideoWriter_fourcc(* "mp4v")
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
index = 1
while 1:
@@ -479,11 +475,7 @@ def predict_video(self, video_file, camera_id):
index += 1
results = self.predict_image([frame[:, :, ::-1]], visual=False)
- im = visualize_box_mask(
- frame,
- results,
- self.pred_config.labels,
- threshold=self.threshold)
+ im = visualize_box_mask(frame, results, self.pred_config.labels, threshold=self.threshold)
im = np.array(im)
writer.write(im)
if camera_id != -1:
@@ -505,43 +497,44 @@ def save_coco_results(self, image_list, results, use_coco_category=False):
img_id = i
if "boxes" in results:
- boxes = results["boxes"][idx:idx + box_num].tolist()
- bbox_results.extend([
- {
- "image_id": img_id,
- "category_id": coco_clsid2catid[int(box[0])]
- if use_coco_category else int(box[0]),
- "file_name": file_name,
- "bbox": [
- box[2],
- box[3],
- box[4] - box[2],
- box[5] - box[3],
- ], # xyxy -> xywh
- "score": box[1],
- } for box in boxes
- ])
+ boxes = results["boxes"][idx : idx + box_num].tolist()
+ bbox_results.extend(
+ [
+ {
+ "image_id": img_id,
+ "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]),
+ "file_name": file_name,
+ "bbox": [
+ box[2],
+ box[3],
+ box[4] - box[2],
+ box[5] - box[3],
+ ], # xyxy -> xywh
+ "score": box[1],
+ }
+ for box in boxes
+ ]
+ )
if "masks" in results:
import pycocotools.mask as mask_util
- boxes = results["boxes"][idx:idx + box_num].tolist()
+ boxes = results["boxes"][idx : idx + box_num].tolist()
masks = results["masks"][i][:box_num].astype(np.uint8)
seg_res = []
for box, mask in zip(boxes, masks):
- rle = mask_util.encode(
- np.array(
- mask[:, :, None], dtype=np.uint8, order="F"))[0]
+ rle = mask_util.encode(np.array(mask[:, :, None], dtype=np.uint8, order="F"))[0]
if "counts" in rle:
rle["counts"] = rle["counts"].decode("utf8")
- seg_res.append({
- "image_id": img_id,
- "category_id": coco_clsid2catid[int(box[0])]
- if use_coco_category else int(box[0]),
- "file_name": file_name,
- "segmentation": rle,
- "score": box[1],
- })
+ seg_res.append(
+ {
+ "image_id": img_id,
+ "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]),
+ "file_name": file_name,
+ "segmentation": rle,
+ "score": box[1],
+ }
+ )
mask_results.extend(seg_res)
idx += box_num
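A note on the result layout handled above: the detector returns every box in a batch flattened into one array, with `boxes_num` recording how many rows belong to each image and each row laid out as `[class_id, score, x1, y1, x2, y2]`; COCO export then rewrites the corners as `[x, y, w, h]`. A short sketch of that bookkeeping with made-up numbers:

```python
import numpy as np

# Flat batch output: each row is [class_id, score, x1, y1, x2, y2].
boxes = np.array([
    [0, 0.9, 10, 20, 50, 80],
    [1, 0.7, 30, 30, 60, 90],
    [0, 0.4,  5,  5, 25, 45],
])
boxes_num = np.array([2, 1])  # image 0 owns two rows, image 1 owns one

idx = 0
for img_id, n in enumerate(boxes_num):
    for cls, score, x1, y1, x2, y2 in boxes[idx : idx + n]:
        # xyxy -> xywh, as in save_coco_results
        print(img_id, int(cls), round(float(score), 2), [x1, y1, x2 - x1, y2 - y1])
    idx += n
```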
@@ -579,20 +572,21 @@ class DetectorSOLOv2(Detector):
"""
def __init__(
- self,
- model_dir,
- device="CPU",
- run_mode="paddle",
- batch_size=1,
- trt_min_shape=1,
- trt_max_shape=1280,
- trt_opt_shape=640,
- trt_calib_mode=False,
- cpu_threads=1,
- enable_mkldnn=False,
- enable_mkldnn_bfloat16=False,
- output_dir="./",
- threshold=0.5, ):
+ self,
+ model_dir,
+ device="CPU",
+ run_mode="paddle",
+ batch_size=1,
+ trt_min_shape=1,
+ trt_max_shape=1280,
+ trt_opt_shape=640,
+ trt_calib_mode=False,
+ cpu_threads=1,
+ enable_mkldnn=False,
+ enable_mkldnn_bfloat16=False,
+ output_dir="./",
+ threshold=0.5,
+ ):
super(DetectorSOLOv2, self).__init__(
model_dir=model_dir,
device=device,
@@ -606,7 +600,8 @@ def __init__(
enable_mkldnn=enable_mkldnn,
enable_mkldnn_bfloat16=enable_mkldnn_bfloat16,
output_dir=output_dir,
- threshold=threshold, )
+ threshold=threshold,
+ )
def predict(self, repeats=1, run_benchmark=False):
"""
@@ -617,37 +612,24 @@ def predict(self, repeats=1, run_benchmark=False):
'cate_label': label of segm, shape:[N]
'cate_score': confidence score of segm, shape:[N]
"""
- np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array(
- [0])
+ np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array([0])
if run_benchmark:
for i in range(repeats):
self.predictor.run()
paddle.device.cuda.synchronize()
- result = dict(
- segm=np_segms,
- label=np_label,
- score=np_score,
- boxes_num=np_boxes_num)
+ result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num)
return result
for i in range(repeats):
self.predictor.run()
output_names = self.predictor.get_output_names()
- np_boxes_num = self.predictor.get_output_handle(output_names[
- 0]).copy_to_cpu()
- np_label = self.predictor.get_output_handle(output_names[
- 1]).copy_to_cpu()
- np_score = self.predictor.get_output_handle(output_names[
- 2]).copy_to_cpu()
- np_segms = self.predictor.get_output_handle(output_names[
- 3]).copy_to_cpu()
-
- result = dict(
- segm=np_segms,
- label=np_label,
- score=np_score,
- boxes_num=np_boxes_num)
+ np_boxes_num = self.predictor.get_output_handle(output_names[0]).copy_to_cpu()
+ np_label = self.predictor.get_output_handle(output_names[1]).copy_to_cpu()
+ np_score = self.predictor.get_output_handle(output_names[2]).copy_to_cpu()
+ np_segms = self.predictor.get_output_handle(output_names[3]).copy_to_cpu()
+
+ result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num)
return result
@@ -669,20 +651,21 @@ class DetectorPicoDet(Detector):
"""
def __init__(
- self,
- model_dir,
- device="CPU",
- run_mode="paddle",
- batch_size=1,
- trt_min_shape=1,
- trt_max_shape=1280,
- trt_opt_shape=640,
- trt_calib_mode=False,
- cpu_threads=1,
- enable_mkldnn=False,
- enable_mkldnn_bfloat16=False,
- output_dir="./",
- threshold=0.5, ):
+ self,
+ model_dir,
+ device="CPU",
+ run_mode="paddle",
+ batch_size=1,
+ trt_min_shape=1,
+ trt_max_shape=1280,
+ trt_opt_shape=640,
+ trt_calib_mode=False,
+ cpu_threads=1,
+ enable_mkldnn=False,
+ enable_mkldnn_bfloat16=False,
+ output_dir="./",
+ threshold=0.5,
+ ):
super(DetectorPicoDet, self).__init__(
model_dir=model_dir,
device=device,
@@ -696,7 +679,8 @@ def __init__(
enable_mkldnn=enable_mkldnn,
enable_mkldnn_bfloat16=enable_mkldnn_bfloat16,
output_dir=output_dir,
- threshold=threshold, )
+ threshold=threshold,
+ )
def postprocess(self, inputs, result):
# postprocess output of predictor
@@ -707,7 +691,8 @@ def postprocess(self, inputs, result):
inputs["im_shape"],
inputs["scale_factor"],
strides=self.pred_config.fpn_stride,
- nms_threshold=self.pred_config.nms["nms_threshold"], )
+ nms_threshold=self.pred_config.nms["nms_threshold"],
+ )
np_boxes, np_boxes_num = postprocessor(np_score_list, np_boxes_list)
result = dict(boxes=np_boxes, boxes_num=np_boxes_num)
return result
@@ -736,12 +721,8 @@ def predict(self, repeats=1, run_benchmark=False):
output_names = self.predictor.get_output_names()
num_outs = int(len(output_names) / 2)
for out_idx in range(num_outs):
- np_score_list.append(
- self.predictor.get_output_handle(output_names[out_idx])
- .copy_to_cpu())
- np_boxes_list.append(
- self.predictor.get_output_handle(output_names[
- out_idx + num_outs]).copy_to_cpu())
+ np_score_list.append(self.predictor.get_output_handle(output_names[out_idx]).copy_to_cpu())
+ np_boxes_list.append(self.predictor.get_output_handle(output_names[out_idx + num_outs]).copy_to_cpu())
result = dict(boxes=np_score_list, boxes_num=np_boxes_list)
return result
@@ -759,16 +740,14 @@ def create_inputs(imgs, im_info):
im_shape = []
scale_factor = []
if len(imgs) == 1:
- inputs["image"] = np.array((imgs[0], )).astype("float32")
- inputs["im_shape"] = np.array(
- (im_info[0]["im_shape"], )).astype("float32")
- inputs["scale_factor"] = np.array(
- (im_info[0]["scale_factor"], )).astype("float32")
+ inputs["image"] = np.array((imgs[0],)).astype("float32")
+ inputs["im_shape"] = np.array((im_info[0]["im_shape"],)).astype("float32")
+ inputs["scale_factor"] = np.array((im_info[0]["scale_factor"],)).astype("float32")
return inputs
for e in im_info:
- im_shape.append(np.array((e["im_shape"], )).astype("float32"))
- scale_factor.append(np.array((e["scale_factor"], )).astype("float32"))
+ im_shape.append(np.array((e["im_shape"],)).astype("float32"))
+ scale_factor.append(np.array((e["scale_factor"],)).astype("float32"))
inputs["im_shape"] = np.concatenate(im_shape, axis=0)
inputs["scale_factor"] = np.concatenate(scale_factor, axis=0)
@@ -779,8 +758,7 @@ def create_inputs(imgs, im_info):
padding_imgs = []
for img in imgs:
im_c, im_h, im_w = img.shape[:]
- padding_im = np.zeros(
- (im_c, max_shape_h, max_shape_w), dtype=np.float32)
+ padding_im = np.zeros((im_c, max_shape_h, max_shape_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = img
padding_imgs.append(padding_im)
inputs["image"] = np.stack(padding_imgs, axis=0)
@@ -815,9 +793,7 @@ def __init__(self, model_dir):
if "fpn_stride" in yml_conf:
self.fpn_stride = yml_conf["fpn_stride"]
if self.arch == "RCNN" and yml_conf.get("export_onnx", False):
- print(
- "The RCNN export model is used for ONNX and it only supports batch_size = 1"
- )
+ print("The RCNN export model is used for ONNX and it only supports batch_size = 1")
self.print_config()
def check_model(self, yml_conf):
@@ -828,8 +804,7 @@ def check_model(self, yml_conf):
for support_model in SUPPORT_MODELS:
if support_model in yml_conf["arch"]:
return True
- raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
- "arch"], SUPPORT_MODELS))
+ raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], SUPPORT_MODELS))
def print_config(self):
print("----------- Model Configuration -----------")
@@ -841,22 +816,23 @@ def print_config(self):
def load_predictor(
- model_dir,
- arch,
- run_mode="paddle",
- batch_size=1,
- device="CPU",
- min_subgraph_size=3,
- use_dynamic_shape=False,
- trt_min_shape=1,
- trt_max_shape=1280,
- trt_opt_shape=640,
- trt_calib_mode=False,
- cpu_threads=1,
- enable_mkldnn=False,
- enable_mkldnn_bfloat16=False,
- delete_shuffle_pass=False,
- tuned_trt_shape_file="shape_range_info.pbtxt", ):
+ model_dir,
+ arch,
+ run_mode="paddle",
+ batch_size=1,
+ device="CPU",
+ min_subgraph_size=3,
+ use_dynamic_shape=False,
+ trt_min_shape=1,
+ trt_max_shape=1280,
+ trt_opt_shape=640,
+ trt_calib_mode=False,
+ cpu_threads=1,
+ enable_mkldnn=False,
+ enable_mkldnn_bfloat16=False,
+ delete_shuffle_pass=False,
+ tuned_trt_shape_file="shape_range_info.pbtxt",
+):
"""set AnalysisConfig, generate AnalysisPredictor
Args:
model_dir (str): root path of __model__ and __params__
@@ -877,16 +853,15 @@ def load_predictor(
"""
if device != "GPU" and run_mode != "paddle":
raise ValueError(
- "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".
- format(run_mode, device))
+ "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".format(run_mode, device)
+ )
infer_model = os.path.join(model_dir, "model.pdmodel")
infer_params = os.path.join(model_dir, "model.pdiparams")
if not os.path.exists(infer_model):
infer_model = os.path.join(model_dir, "inference.pdmodel")
infer_params = os.path.join(model_dir, "inference.pdiparams")
if not os.path.exists(infer_model):
- raise ValueError(
- "Cannot find any inference model in dir: {},".format(model_dir))
+ raise ValueError("Cannot find any inference model in dir: {},".format(model_dir))
config = Config(infer_model, infer_params)
if device == "GPU":
# initial GPU memory(M), device ID
@@ -912,9 +887,7 @@ def load_predictor(
if enable_mkldnn_bfloat16:
config.enable_mkldnn_bfloat16()
except:
- print(
- "The current environment does not support `mkldnn`, so disable mkldnn."
- )
+ print("The current environment does not support `mkldnn`, so disable mkldnn.")
pass
precision_map = {
@@ -931,10 +904,10 @@ def load_predictor(
min_subgraph_size=min_subgraph_size,
precision_mode=precision_map[run_mode],
use_static=False,
- use_calib_mode=trt_calib_mode, )
+ use_calib_mode=trt_calib_mode,
+ )
if arch in TUNED_TRT_DYNAMIC_MODELS:
- config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file,
- True)
+ config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file, True)
if use_dynamic_shape:
min_input_shape = {
@@ -949,8 +922,7 @@ def load_predictor(
"image": [batch_size, 3, trt_opt_shape, trt_opt_shape],
"scale_factor": [batch_size, 2],
}
- config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
- opt_input_shape)
+ config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape)
print("trt set dynamic shape done!")
# disable print log when predict
@@ -969,12 +941,9 @@ def get_test_images(infer_dir, infer_img):
"""
Get image path list in TEST mode
"""
- assert (infer_img is not None or
- infer_dir is not None), "--image_file or --image_dir should be set"
- assert infer_img is None or os.path.isfile(
- infer_img), "{} is not a file".format(infer_img)
- assert infer_dir is None or os.path.isdir(
- infer_dir), "{} is not a directory".format(infer_dir)
+ assert infer_img is not None or infer_dir is not None, "--image_file or --image_dir should be set"
+ assert infer_img is None or os.path.isfile(infer_img), "{} is not a file".format(infer_img)
+ assert infer_dir is None or os.path.isdir(infer_dir), "{} is not a directory".format(infer_dir)
# infer_img has a higher priority
if infer_img and os.path.isfile(infer_img):
@@ -982,8 +951,7 @@ def get_test_images(infer_dir, infer_img):
images = set()
infer_dir = os.path.abspath(infer_dir)
- assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(
- infer_dir)
+ assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(infer_dir)
exts = ["jpg", "jpeg", "png", "bmp"]
exts += [ext.upper() for ext in exts]
for ext in exts:
@@ -1003,24 +971,18 @@ def visualize(image_list, result, labels, output_dir="output/", threshold=0.5):
im_bboxes_num = result["boxes_num"][idx]
im_results = {}
if "boxes" in result:
- im_results["boxes"] = result["boxes"][start_idx:start_idx +
- im_bboxes_num, :]
+ im_results["boxes"] = result["boxes"][start_idx : start_idx + im_bboxes_num, :]
if "masks" in result:
- im_results["masks"] = result["masks"][start_idx:start_idx +
- im_bboxes_num, :]
+ im_results["masks"] = result["masks"][start_idx : start_idx + im_bboxes_num, :]
if "segm" in result:
- im_results["segm"] = result["segm"][start_idx:start_idx +
- im_bboxes_num, :]
+ im_results["segm"] = result["segm"][start_idx : start_idx + im_bboxes_num, :]
if "label" in result:
- im_results["label"] = result["label"][start_idx:start_idx +
- im_bboxes_num]
+ im_results["label"] = result["label"][start_idx : start_idx + im_bboxes_num]
if "score" in result:
- im_results["score"] = result["score"][start_idx:start_idx +
- im_bboxes_num]
+ im_results["score"] = result["score"][start_idx : start_idx + im_bboxes_num]
start_idx += im_bboxes_num
- im = visualize_box_mask(
- image_file, im_results, labels, threshold=threshold)
+ im = visualize_box_mask(image_file, im_results, labels, threshold=threshold)
img_name = os.path.split(image_file)[-1]
if not os.path.exists(output_dir):
os.makedirs(output_dir)
@@ -1060,7 +1022,8 @@ def main():
enable_mkldnn=FLAGS.enable_mkldnn,
enable_mkldnn_bfloat16=FLAGS.enable_mkldnn_bfloat16,
threshold=FLAGS.threshold,
- output_dir=FLAGS.output_dir, )
+ output_dir=FLAGS.output_dir,
+ )
# predict from video file or camera video stream
if FLAGS.video_file is not None or FLAGS.camera_id != -1:
@@ -1068,8 +1031,7 @@ def main():
else:
# predict from image
if FLAGS.image_dir is None and FLAGS.image_file is not None:
- assert (FLAGS.batch_size == 1
- ), "batch_size should be 1, when image_file is not None"
+ assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None"
img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
if FLAGS.slice_infer:
detector.predict_image_slice(
@@ -1080,14 +1042,16 @@ def main():
FLAGS.match_threshold,
FLAGS.match_metric,
visual=FLAGS.save_images,
- save_results=FLAGS.save_results, )
+ save_results=FLAGS.save_results,
+ )
else:
detector.predict_image(
img_list,
FLAGS.run_benchmark,
repeats=100,
visual=FLAGS.save_images,
- save_results=FLAGS.save_results, )
+ save_results=FLAGS.save_results,
+ )
if not FLAGS.run_benchmark:
detector.det_times.info(average=True)
else:
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py
index 8f661fb65fe6b..fa3551f584493 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py
@@ -52,20 +52,21 @@ class KeyPointDetector(Detector):
"""
def __init__(
- self,
- model_dir,
- device="CPU",
- run_mode="paddle",
- batch_size=1,
- trt_min_shape=1,
- trt_max_shape=1280,
- trt_opt_shape=640,
- trt_calib_mode=False,
- cpu_threads=1,
- enable_mkldnn=False,
- output_dir="output",
- threshold=0.5,
- use_dark=True, ):
+ self,
+ model_dir,
+ device="CPU",
+ run_mode="paddle",
+ batch_size=1,
+ trt_min_shape=1,
+ trt_max_shape=1280,
+ trt_opt_shape=640,
+ trt_calib_mode=False,
+ cpu_threads=1,
+ enable_mkldnn=False,
+ output_dir="output",
+ threshold=0.5,
+ use_dark=True,
+ ):
super(KeyPointDetector, self).__init__(
model_dir=model_dir,
device=device,
@@ -78,7 +79,8 @@ def __init__(
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn,
output_dir=output_dir,
- threshold=threshold, )
+ threshold=threshold,
+ )
self.use_dark = use_dark
def set_config(self, model_dir):
@@ -105,8 +107,7 @@ def postprocess(self, inputs, result):
np_heatmap = result["heatmap"]
np_masks = result["masks"]
# postprocess output of predictor
- if KEYPOINT_SUPPORT_MODELS[
- self.pred_config.arch] == "keypoint_bottomup":
+ if KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_bottomup":
results = {}
h, w = inputs["im_shape"][0]
preds = [np_heatmap]
@@ -118,8 +119,7 @@ def postprocess(self, inputs, result):
results["keypoint"] = kpts
results["score"] = scores
return results
- elif KEYPOINT_SUPPORT_MODELS[
- self.pred_config.arch] == "keypoint_topdown":
+ elif KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_topdown":
results = {}
imshape = inputs["im_shape"][:, ::-1]
center = np.round(imshape / 2.0)
@@ -130,8 +130,7 @@ def postprocess(self, inputs, result):
results["score"] = scores
return results
else:
- raise ValueError("Unsupported arch: {}, expect {}".format(
- self.pred_config.arch, KEYPOINT_SUPPORT_MODELS))
+ raise ValueError("Unsupported arch: {}, expect {}".format(self.pred_config.arch, KEYPOINT_SUPPORT_MODELS))
def predict(self, repeats=1):
"""
@@ -162,11 +161,7 @@ def predict(self, repeats=1):
result = dict(heatmap=np_heatmap, masks=np_masks)
return result
- def predict_image(self,
- image_list,
- run_benchmark=False,
- repeats=1,
- visual=True):
+ def predict_image(self, image_list, run_benchmark=False, repeats=1, visual=True):
results = []
batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size)
for i in range(batch_loop_cnt):
@@ -222,7 +217,8 @@ def predict_image(self,
batch_image_list,
result,
visual_thresh=self.threshold,
- save_dir=self.output_dir, )
+ save_dir=self.output_dir,
+ )
results.append(result)
results = self.merge_batch_result(results)
@@ -245,7 +241,7 @@ def predict_video(self, video_file, camera_id):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
out_path = os.path.join(self.output_dir, video_name)
- fourcc = cv2.VideoWriter_fourcc(* "mp4v")
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
index = 1
while 1:
@@ -257,8 +253,7 @@ def predict_video(self, video_file, camera_id):
results = self.predict_image([frame[:, :, ::-1]], visual=False)
im_results = {}
im_results["keypoint"] = [results["keypoint"], results["score"]]
- im = visualize_pose(
- frame, im_results, visual_thresh=self.threshold, returnimg=True)
+ im = visualize_pose(frame, im_results, visual_thresh=self.threshold, returnimg=True)
writer.write(im)
if camera_id != -1:
cv2.imshow("Mask Detection", im)
@@ -315,8 +310,7 @@ def check_model(self, yml_conf):
for support_model in KEYPOINT_SUPPORT_MODELS:
if support_model in yml_conf["arch"]:
return True
- raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
- "arch"], KEYPOINT_SUPPORT_MODELS))
+ raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], KEYPOINT_SUPPORT_MODELS))
def print_config(self):
print("----------- Model Configuration -----------")
@@ -332,14 +326,10 @@ def visualize(image_list, results, visual_thresh=0.6, save_dir="output"):
for i, image_file in enumerate(image_list):
skeletons = results["keypoint"]
scores = results["score"]
- skeleton = skeletons[i:i + 1]
- score = scores[i:i + 1]
+ skeleton = skeletons[i : i + 1]
+ score = scores[i : i + 1]
im_results["keypoint"] = [skeleton, score]
- visualize_pose(
- image_file,
- im_results,
- visual_thresh=visual_thresh,
- save_dir=save_dir)
+ visualize_pose(image_file, im_results, visual_thresh=visual_thresh, save_dir=save_dir)
def main():
@@ -356,7 +346,8 @@ def main():
enable_mkldnn=FLAGS.enable_mkldnn,
threshold=FLAGS.threshold,
output_dir=FLAGS.output_dir,
- use_dark=FLAGS.use_dark, )
+ use_dark=FLAGS.use_dark,
+ )
# predict from video file or camera video stream
if FLAGS.video_file is not None or FLAGS.camera_id != -1:
@@ -385,8 +376,7 @@ def main():
"shape": "dynamic_shape",
"data_num": perf_info["img_num"],
}
- det_log = PaddleInferBenchmark(detector.config, model_info,
- data_info, perf_info, mems)
+ det_log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems)
det_log("KeyPoint")
@@ -396,8 +386,7 @@ def main():
FLAGS = parser.parse_args()
print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
- assert FLAGS.device in ["CPU", "GPU", "XPU"
- ], "device should be CPU, GPU or XPU"
+ assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU"
assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device"
main()
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py
index 01aa825cb00ee..8ba1f6a47b0cd 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py
@@ -50,14 +50,11 @@ def lerp(self, j, y, x, heatmap):
right = np.clip(x + 1, 0, W - 1)
up = np.clip(y - 1, 0, H - 1)
down = np.clip(y + 1, 0, H - 1)
- offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25,
- -0.25)
- offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25,
- -0.25)
+ offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, -0.25)
+ offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, -0.25)
return offset_y + 0.5, offset_x + 0.5
- def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
- original_width):
+ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, original_width):
N, J, H, W = heatmap.shape
assert N == 1, "only support batch size 1"
@@ -67,8 +64,9 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
inds_np = inds_k[0]
y = inds_np // W
x = inds_np % W
- tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people),
- y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1])
+ tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), y.flatten(), x.flatten()].reshape(
+ J, -1, tagmap.shape[-1]
+ )
coords = np.stack((y, x), axis=2)
# threshold
mask = heats > self.heat_thresh
@@ -94,11 +92,8 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
cluster[key]["scores"][jid] = heats[jid, i]
cluster[key]["coords"][jid] = coords[jid, i]
continue
- candidates = list(cluster.keys())[:self.max_num_people]
- centroids = [
- np.mean(
- cluster[k]["tags"], axis=0) for k in candidates
- ]
+ candidates = list(cluster.keys())[: self.max_num_people]
+ centroids = [np.mean(cluster[k]["tags"], axis=0) for k in candidates]
num_clusters = len(centroids)
# shape is (num_valid, num_clusters, tag_dim)
dist = valid_tags[:, None, :] - np.array(centroids)[None, ...]
@@ -111,12 +106,12 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
cost,
((0, 0), (0, num_valid - num_clusters)),
"constant",
- constant_values=((0, 0), (0, 1e-10)), )
+ constant_values=((0, 0), (0, 1e-10)),
+ )
rows, cols = linear_sum_assignment(cost)
for y, x in zip(rows, cols):
tag = tags[jid, y]
- if (y < num_valid and x < num_clusters and
- l2_dist[y, x] < self.tag_thresh):
+ if y < num_valid and x < num_clusters and l2_dist[y, x] < self.tag_thresh:
key = candidates[x] # merge to cluster
else:
key = tag[0] # initialize new cluster
@@ -151,7 +146,7 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
if True:
for pid, coords in enumerate(pose_coords):
tag_mean = np.array(pose_tags[pid]).mean(axis=0)
- norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5
+ norm = np.sum((tagmap - tag_mean) ** 2, axis=3) ** 0.5
score = heatmap - np.round(norm) # (J, H, W)
flat_score = score.reshape(J, -1)
max_inds = np.argmax(flat_score, axis=1)
@@ -167,9 +162,7 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
pose_coords[pid][salvage_joints, 0] = y
pose_coords[pid][salvage_joints, 1] = x
pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints]
- pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1],
- original_height, original_width,
- min(H, W))
+ pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], original_height, original_width, min(H, W))
return pose_kpts, mean_score
@@ -193,9 +186,7 @@ def warp_affine_joints(joints, mat):
joints = np.array(joints)
shape = joints.shape
joints = joints.reshape(-1, 2)
- return np.dot(np.concatenate(
- (joints, joints[:, 0:1] * 0 + 1), axis=1),
- mat.T).reshape(shape)
+ return np.dot(np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), mat.T).reshape(shape)
class HRNetPostProcess(object):
@@ -203,9 +194,7 @@ def __init__(self, use_dark=True):
self.use_dark = use_dark
def flip_back(self, output_flipped, matched_parts):
- assert (
- output_flipped.ndim == 4
- ), "output_flipped should be [batch_size, num_joints, height, width]"
+ assert output_flipped.ndim == 4, "output_flipped should be [batch_size, num_joints, height, width]"
output_flipped = output_flipped[:, :, :, ::-1]
@@ -226,8 +215,7 @@ def get_max_preds(self, heatmaps):
preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
"""
- assert isinstance(heatmaps,
- np.ndarray), "heatmaps should be numpy.ndarray"
+ assert isinstance(heatmaps, np.ndarray), "heatmaps should be numpy.ndarray"
assert heatmaps.ndim == 4, "batch_images should be 4-ndim"
batch_size = heatmaps.shape[0]
@@ -277,10 +265,8 @@ def dark_parse(self, hm, coord):
dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1])
dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px])
dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2])
- dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] -
- hm[py + 1][px - 1] + hm[py - 1][px - 1])
- dyy = 0.25 * (
- hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px])
+ dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] - hm[py + 1][px - 1] + hm[py - 1][px - 1])
+ dyy = 0.25 * (hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px])
derivative = np.matrix([[dx], [dy]])
hessian = np.matrix([[dxx, dxy], [dxy, dyy]])
if dxx * dyy - dxy**2 != 0:
@@ -331,25 +317,24 @@ def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
px = int(math.floor(coords[n][p][0] + 0.5))
py = int(math.floor(coords[n][p][1] + 0.5))
if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:
- diff = np.array([
- hm[py][px + 1] - hm[py][px - 1],
- hm[py + 1][px] - hm[py - 1][px],
- ])
+ diff = np.array(
+ [
+ hm[py][px + 1] - hm[py][px - 1],
+ hm[py + 1][px] - hm[py - 1][px],
+ ]
+ )
coords[n][p] += np.sign(diff) * 0.25
preds = coords.copy()
# Transform back
for i in range(coords.shape[0]):
- preds[i] = transform_preds(coords[i], center[i], scale[i],
- [heatmap_width, heatmap_height])
+ preds[i] = transform_preds(coords[i], center[i], scale[i], [heatmap_width, heatmap_height])
return preds, maxvals
def __call__(self, output, center, scale):
preds, maxvals = self.get_final_preds(output, center, scale)
- return np.concatenate(
- (preds, maxvals), axis=-1), np.mean(
- maxvals, axis=1)
+ return np.concatenate((preds, maxvals), axis=-1), np.mean(maxvals, axis=1)
def transform_preds(coords, center, scale, output_size):
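
For context, dark_parse() in the file above performs DARK-style sub-pixel refinement: one Newton step on the heatmap around the integer peak, using finite-difference estimates of the gradient and Hessian. A minimal standalone sketch of that step, assuming a 2-D numpy heatmap hm and a peak (px, py) at least two pixels from every border (the helper name is ours, not part of the patch):

    import numpy as np

    def dark_refine(hm, px, py):
        # First- and second-order finite differences around the integer peak.
        dx = 0.5 * (hm[py, px + 1] - hm[py, px - 1])
        dy = 0.5 * (hm[py + 1, px] - hm[py - 1, px])
        dxx = 0.25 * (hm[py, px + 2] - 2 * hm[py, px] + hm[py, px - 2])
        dyy = 0.25 * (hm[py + 2, px] - 2 * hm[py, px] + hm[py - 2, px])
        dxy = 0.25 * (hm[py + 1, px + 1] - hm[py - 1, px + 1] - hm[py + 1, px - 1] + hm[py - 1, px - 1])
        det = dxx * dyy - dxy**2
        if det == 0:
            return float(px), float(py)
        # Newton step: offset = -H^{-1} @ gradient, applied to the integer peak.
        inv_hessian = np.array([[dyy, -dxy], [-dxy, dxx]]) / det
        off_x, off_y = -inv_hessian @ np.array([dx, dy])
        return px + off_x, py + off_y
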
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py
index 86bf7e57c6605..68173f62bd043 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py
@@ -48,18 +48,12 @@ def get_affine_mat_kernel(h, w, s, inv=False):
center = np.array([np.round(w / 2.0), np.round(h / 2.0)])
size_resized = (w_, h_)
- trans = get_affine_transform(
- center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv)
+ trans = get_affine_transform(center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv)
return trans, size_resized
-def get_affine_transform(center,
- input_size,
- rot,
- output_size,
- shift=(0.0, 0.0),
- inv=False):
+def get_affine_transform(center, input_size, rot, output_size, shift=(0.0, 0.0), inv=False):
"""Get the affine transform matrix, given the center/scale/rot/output_size.
Args:
@@ -134,13 +128,13 @@ def get_warp_matrix(theta, size_input, size_dst, size_target):
matrix[0, 0] = np.cos(theta) * scale_x
matrix[0, 1] = -np.sin(theta) * scale_x
matrix[0, 2] = scale_x * (
- -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] *
- np.sin(theta) + 0.5 * size_target[0])
+ -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * np.sin(theta) + 0.5 * size_target[0]
+ )
matrix[1, 0] = np.sin(theta) * scale_y
matrix[1, 1] = np.cos(theta) * scale_y
matrix[1, 2] = scale_y * (
- -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] *
- np.cos(theta) + 0.5 * size_target[1])
+ -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * np.cos(theta) + 0.5 * size_target[1]
+ )
return matrix
@@ -212,19 +206,22 @@ def __call__(self, image, im_info):
rot,
center * 2.0,
[self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
- scale, )
+ scale,
+ )
image = cv2.warpAffine(
image,
trans,
(int(self.trainsize[0]), int(self.trainsize[1])),
- flags=cv2.INTER_LINEAR, )
+ flags=cv2.INTER_LINEAR,
+ )
else:
trans = get_affine_transform(center, scale, rot, self.trainsize)
image = cv2.warpAffine(
image,
trans,
(int(self.trainsize[0]), int(self.trainsize[1])),
- flags=cv2.INTER_LINEAR, )
+ flags=cv2.INTER_LINEAR,
+ )
return image, im_info
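
get_affine_transform() and get_warp_matrix() above both produce 2x3 affine matrices that are applied to images via cv2.warpAffine and, elsewhere, to keypoint coordinates. A small illustrative helper for the keypoint side, assumed for clarity rather than taken from the patch:

    import numpy as np

    def apply_affine(points, matrix):
        # points: (N, 2) array of (x, y); matrix: 2x3 affine transform.
        pts = np.asarray(points, dtype=np.float64).reshape(-1, 2)
        homogeneous = np.concatenate([pts, np.ones((pts.shape[0], 1))], axis=1)  # (N, 3)
        return homogeneous @ matrix.T  # (N, 2)

Warping the image with cv2.warpAffine(img, matrix, (dst_w, dst_h)) and the keypoints with the same matrix keeps the two aligned.
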
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py
index e858fa5051eaf..aa9b060ce7059 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py
@@ -41,8 +41,8 @@ def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
rest_boxes = boxes[indexes, :]
iou = iou_of(
rest_boxes,
- np.expand_dims(
- current_box, axis=0), )
+ np.expand_dims(current_box, axis=0),
+ )
indexes = indexes[iou <= iou_threshold]
return box_scores[picked, :]
@@ -88,15 +88,16 @@ class PicoDetPostProcess(object):
"""
def __init__(
- self,
- input_shape,
- ori_shape,
- scale_factor,
- strides=[8, 16, 32, 64],
- score_threshold=0.4,
- nms_threshold=0.5,
- nms_top_k=1000,
- keep_top_k=100, ):
+ self,
+ input_shape,
+ ori_shape,
+ scale_factor,
+ strides=[8, 16, 32, 64],
+ score_threshold=0.4,
+ nms_threshold=0.5,
+ nms_top_k=1000,
+ keep_top_k=100,
+ ):
self.ori_shape = ori_shape
self.input_shape = input_shape
self.scale_factor = scale_factor
@@ -113,15 +114,13 @@ def warp_boxes(self, boxes, ori_shape):
if n:
# warp points
xy = np.ones((n * 4, 3))
- xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
- n * 4, 2) # x1y1, x2y2, x1y2, x2y1
+ xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1
# xy = xy @ M.T # transform
xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
- xy = (np.concatenate(
- (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T)
+ xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# clip boxes
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
@@ -138,8 +137,7 @@ def __call__(self, scores, raw_boxes):
# generate centers
decode_boxes = []
select_scores = []
- for stride, box_distribute, score in zip(self.strides, raw_boxes,
- scores):
+ for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
box_distribute = box_distribute[batch_id]
score = score[batch_id]
# centers
@@ -162,7 +160,7 @@ def __call__(self, scores, raw_boxes):
# top K candidate
topk_idx = np.argsort(score.max(axis=1))[::-1]
- topk_idx = topk_idx[:self.nms_top_k]
+ topk_idx = topk_idx[: self.nms_top_k]
center = center[topk_idx]
score = score[topk_idx]
box_distance = box_distance[topk_idx]
@@ -185,12 +183,12 @@ def __call__(self, scores, raw_boxes):
if probs.shape[0] == 0:
continue
subset_boxes = bboxes[mask, :]
- box_probs = np.concatenate(
- [subset_boxes, probs.reshape(-1, 1)], axis=1)
+ box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1)
box_probs = hard_nms(
box_probs,
iou_threshold=self.nms_threshold,
- top_k=self.keep_top_k, )
+ top_k=self.keep_top_k,
+ )
picked_box_probs.append(box_probs)
picked_labels.extend([class_index] * box_probs.shape[0])
@@ -202,24 +200,25 @@ def __call__(self, scores, raw_boxes):
picked_box_probs = np.concatenate(picked_box_probs)
# resize output boxes
- picked_box_probs[:, :4] = self.warp_boxes(
- picked_box_probs[:, :4], self.ori_shape[batch_id])
- im_scale = np.concatenate([
- self.scale_factor[batch_id][::-1],
- self.scale_factor[batch_id][::-1],
- ])
+ picked_box_probs[:, :4] = self.warp_boxes(picked_box_probs[:, :4], self.ori_shape[batch_id])
+ im_scale = np.concatenate(
+ [
+ self.scale_factor[batch_id][::-1],
+ self.scale_factor[batch_id][::-1],
+ ]
+ )
picked_box_probs[:, :4] /= im_scale
# clas score box
out_boxes_list.append(
np.concatenate(
[
- np.expand_dims(
- np.array(picked_labels), axis=-1),
- np.expand_dims(
- picked_box_probs[:, 4], axis=-1),
+ np.expand_dims(np.array(picked_labels), axis=-1),
+ np.expand_dims(picked_box_probs[:, 4], axis=-1),
picked_box_probs[:, :4],
],
- axis=1, ))
+ axis=1,
+ )
+ )
out_boxes_num.append(len(picked_labels))
out_boxes_list = np.concatenate(out_boxes_list, axis=0)
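
hard_nms() above is a greedy IoU suppression over score-sorted candidates. A compact sketch of the same idea; the [x1, y1, x2, y2] box layout and the function signatures are assumptions, not the file's API:

    import numpy as np

    def iou(box, boxes):
        # IoU of one box against an (N, 4) array of boxes.
        x1 = np.maximum(box[0], boxes[:, 0])
        y1 = np.maximum(box[1], boxes[:, 1])
        x2 = np.minimum(box[2], boxes[:, 2])
        y2 = np.minimum(box[3], boxes[:, 3])
        inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
        area_a = (box[2] - box[0]) * (box[3] - box[1])
        area_b = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        return inter / (area_a + area_b - inter + 1e-9)

    def greedy_nms(boxes, scores, iou_threshold=0.5):
        # Keep the best-scoring box, drop everything overlapping it too much, repeat.
        order = np.argsort(scores)[::-1]
        keep = []
        while order.size > 0:
            current = order[0]
            keep.append(int(current))
            rest = order[1:]
            if rest.size == 0:
                break
            order = rest[iou(boxes[current], boxes[rest]) <= iou_threshold]
        return keep
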
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py
index 1066879f2e9ad..e57404bfe6c10 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py
@@ -64,16 +64,9 @@ def __call__(self, im, im_info):
im_info (dict): info of processed image
"""
im_scale_y, im_scale_x = self.generate_scale(im)
- im = cv2.resize(
- im,
- None,
- None,
- fx=im_scale_x,
- fy=im_scale_y,
- interpolation=self.interp)
+ im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp)
im_info["im_shape"] = np.array(im.shape[:2]).astype("float32")
- im_info["scale_factor"] = np.array(
- [im_scale_y, im_scale_x]).astype("float32")
+ im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32")
return im, im_info
def generate_scale(self, img):
@@ -140,16 +133,9 @@ def __call__(self, im, im_info):
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
im_scale_y, im_scale_x = self.generate_scale(im)
- im = cv2.resize(
- im,
- None,
- None,
- fx=im_scale_x,
- fy=im_scale_y,
- interpolation=self.interp)
+ im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp)
im_info["im_shape"] = np.array(im.shape[:2]).astype("float32")
- im_info["scale_factor"] = np.array(
- [im_scale_y, im_scale_x]).astype("float32")
+ im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32")
return im, im_info
def generate_scale(self, im):
@@ -189,12 +175,13 @@ class ShortSizeScale(object):
"""
def __init__(
- self,
- short_size,
- fixed_ratio=True,
- keep_ratio=None,
- do_round=False,
- backend="pillow", ):
+ self,
+ short_size,
+ fixed_ratio=True,
+ keep_ratio=None,
+ do_round=False,
+ backend="pillow",
+ ):
self.short_size = short_size
assert (fixed_ratio and not keep_ratio) or (
not fixed_ratio
@@ -236,10 +223,8 @@ def __call__(self, img):
oh = self.short_size
else:
scale_factor = self.short_size / w
- oh = (int(h * float(scale_factor) + 0.5)
- if self.do_round else int(h * self.short_size / w))
- ow = (int(w * float(scale_factor) + 0.5)
- if self.do_round else int(w * self.short_size / h))
+ oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w)
+ ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h)
else:
oh = self.short_size
if self.fixed_ratio:
@@ -248,10 +233,8 @@ def __call__(self, img):
ow = self.short_size
else:
scale_factor = self.short_size / h
- oh = (int(h * float(scale_factor) + 0.5)
- if self.do_round else int(h * self.short_size / w))
- ow = (int(w * float(scale_factor) + 0.5)
- if self.do_round else int(w * self.short_size / h))
+ oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w)
+ ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h)
if type(img) == np.ndarray:
img = Image.fromarray(img, mode="RGB")
@@ -259,12 +242,9 @@ def __call__(self, img):
if self.backend == "pillow":
result_img = img.resize((ow, oh), Image.BILINEAR)
elif self.backend == "cv2" and (self.keep_ratio is not None):
- result_img = cv2.resize(
- img, (ow, oh), interpolation=cv2.INTER_LINEAR)
+ result_img = cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR)
else:
- result_img = Image.fromarray(
- cv2.resize(
- np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR))
+ result_img = Image.fromarray(cv2.resize(np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR))
return result_img
@@ -313,7 +293,9 @@ class Permute(object):
channel_first (bool): whether convert HWC to CHW
"""
- def __init__(self, ):
+ def __init__(
+ self,
+ ):
super(Permute, self).__init__()
def __call__(self, im, im_info):
@@ -379,17 +361,15 @@ def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)):
ratio = min(ratio_h, ratio_w)
new_shape = (
round(shape[1] * ratio),
- round(shape[0] * ratio), ) # [width, height]
+ round(shape[0] * ratio),
+ ) # [width, height]
padw = (width - new_shape[0]) / 2
padh = (height - new_shape[1]) / 2
top, bottom = round(padh - 0.1), round(padh + 0.1)
left, right = round(padw - 0.1), round(padw + 0.1)
- img = cv2.resize(
- img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
- img = cv2.copyMakeBorder(
- img, top, bottom, left, right, cv2.BORDER_CONSTANT,
- value=color) # padded rectangular
+ img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
+ img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular
return img, ratio, padw, padh
def __call__(self, im, im_info):
@@ -445,14 +425,15 @@ class WarpAffine(object):
"""Warp affine the image"""
def __init__(
- self,
- keep_res=False,
- pad=31,
- input_h=512,
- input_w=512,
- scale=0.4,
- shift=0.1,
- down_ratio=4, ):
+ self,
+ keep_res=False,
+ pad=31,
+ input_h=512,
+ input_w=512,
+ scale=0.4,
+ shift=0.1,
+ down_ratio=4,
+ ):
self.keep_res = keep_res
self.pad = pad
self.input_h = input_h
@@ -489,32 +470,32 @@ def __call__(self, im, im_info):
trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
img = cv2.resize(img, (w, h))
- inp = cv2.warpAffine(
- img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
+ inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
if not self.keep_res:
out_h = input_h // self.down_ratio
out_w = input_w // self.down_ratio
trans_output = get_affine_transform(c, s, 0, [out_w, out_h])
- im_info.update({
- "center": c,
- "scale": s,
- "out_height": out_h,
- "out_width": out_w,
- "inp_height": input_h,
- "inp_width": input_w,
- "trans_input": trans_input,
- "trans_output": trans_output,
- })
+ im_info.update(
+ {
+ "center": c,
+ "scale": s,
+ "out_height": out_h,
+ "out_width": out_w,
+ "inp_height": input_h,
+ "inp_width": input_w,
+ "trans_input": trans_input,
+ "trans_output": trans_output,
+ }
+ )
return inp, im_info
def preprocess(im, preprocess_ops):
# process image by preprocess_ops
im_info = {
- "scale_factor": np.array(
- [1.0, 1.0], dtype=np.float32),
+ "scale_factor": np.array([1.0, 1.0], dtype=np.float32),
"im_shape": None,
}
im, im_info = decode_image(im, im_info)
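
The letterbox() method above scales an image to fit the target shape while preserving aspect ratio, then pads the borders with a constant colour. A standalone sketch of the same logic, assumed to mirror the refactored code rather than replace it:

    import cv2

    def letterbox(img, height, width, color=(127.5, 127.5, 127.5)):
        h, w = img.shape[:2]
        ratio = min(height / h, width / w)
        new_w, new_h = round(w * ratio), round(h * ratio)
        padw, padh = (width - new_w) / 2, (height - new_h) / 2
        top, bottom = round(padh - 0.1), round(padh + 0.1)
        left, right = round(padw - 0.1), round(padw + 0.1)
        img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
        return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
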
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py
index 179b3b366e15a..1d38777a4526c 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py
@@ -46,8 +46,7 @@ def pad_right_down_corner(img, stride, padValue):
def transfer(model, model_weights):
transfered_model_weights = {}
for weights_name in model.state_dict().keys():
- transfered_model_weights[weights_name] = model_weights[".".join(
- weights_name.split(".")[1:])]
+ transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])]
return transfered_model_weights
@@ -113,11 +112,9 @@ def draw_bodypose(canvas, candidate, subset):
X = candidate[index.astype(int), 1]
mX = np.mean(X)
mY = np.mean(Y)
- length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
- polygon = cv2.ellipse2Poly((int(mY), int(mX)),
- (int(length / 2), stickwidth),
- int(angle), 0, 360, 1)
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
return canvas
@@ -156,9 +153,9 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False):
canvas,
(x1, y1),
(x2, y2),
- matplotlib.colors.hsv_to_rgb(
- [ie / float(len(edges)), 1.0, 1.0]) * 255,
- thickness=2, )
+ matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255,
+ thickness=2,
+ )
for i, keyponit in enumerate(peaks):
x, y = keyponit
@@ -171,7 +168,8 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False):
cv2.FONT_HERSHEY_SIMPLEX,
0.3,
(0, 0, 0),
- lineType=cv2.LINE_AA, )
+ lineType=cv2.LINE_AA,
+ )
return canvas
@@ -192,16 +190,14 @@ def hand_detect(candidate, subset, oriImg):
hands = []
# left hand
if has_left:
- left_shoulder_index, left_elbow_index, left_wrist_index = person[
- [5, 6, 7]]
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
x1, y1 = candidate[left_shoulder_index][:2]
x2, y2 = candidate[left_elbow_index][:2]
x3, y3 = candidate[left_wrist_index][:2]
hands.append([x1, y1, x2, y2, x3, y3, True])
# right hand
if has_right:
- right_shoulder_index, right_elbow_index, right_wrist_index = person[
- [2, 3, 4]]
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
x1, y1 = candidate[right_shoulder_index][:2]
x2, y2 = candidate[right_elbow_index][:2]
x3, y3 = candidate[right_wrist_index][:2]
@@ -216,8 +212,8 @@ def hand_detect(candidate, subset, oriImg):
# handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
x = x3 + ratioWristElbow * (x3 - x2)
y = y3 + ratioWristElbow * (y3 - y2)
- distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2)
- distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
width = 1.0 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
# x-y refers to the center --> offset to topLeft point
# handRectangle.x -= handRectangle.width / 2.f;
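
draw_bodypose() above renders each limb as a filled, rotated ellipse between two keypoints and blends it onto the canvas. A minimal sketch of a single limb draw; it uses a generic (x, y) ordering, whereas the original reads X from column 1 and Y from column 0 of candidate:

    import math
    import cv2

    def draw_limb(canvas, p0, p1, color, stickwidth=4):
        (x0, y0), (x1, y1) = p0, p1
        cx, cy = (x0 + x1) / 2, (y0 + y1) / 2
        length = ((x0 - x1) ** 2 + (y0 - y1) ** 2) ** 0.5
        angle = math.degrees(math.atan2(y0 - y1, x0 - x1))
        polygon = cv2.ellipse2Poly((int(cx), int(cy)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
        overlay = canvas.copy()
        cv2.fillConvexPoly(overlay, polygon, color)
        return cv2.addWeighted(canvas, 0.4, overlay, 0.6, 0)
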
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py
index eba62c30d1e34..eb3856ca3a117 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py
@@ -26,41 +26,40 @@ def argsparser():
"--model_dir",
type=str,
default=None,
- help=("Directory include:'model.pdiparams', 'model.pdmodel', "
- "'infer_cfg.yml', created by tools/export_model.py."),
- required=True, )
- parser.add_argument(
- "--image_file", type=str, default=None, help="Path of image file.")
+ help=(
+ "Directory include:'model.pdiparams', 'model.pdmodel', "
+ "'infer_cfg.yml', created by tools/export_model.py."
+ ),
+ required=True,
+ )
+ parser.add_argument("--image_file", type=str, default=None, help="Path of image file.")
parser.add_argument(
"--image_dir",
type=str,
default=None,
- help="Dir of image file, `image_file` has a higher priority.", )
- parser.add_argument(
- "--batch_size", type=int, default=1, help="batch_size for inference.")
+ help="Dir of image file, `image_file` has a higher priority.",
+ )
+ parser.add_argument("--batch_size", type=int, default=1, help="batch_size for inference.")
parser.add_argument(
"--video_file",
type=str,
default=None,
help="Path of video file, `video_file` or `camera_id` has a highest priority.",
)
- parser.add_argument(
- "--camera_id",
- type=int,
- default=-1,
- help="device id of camera to predict.")
- parser.add_argument(
- "--threshold", type=float, default=0.5, help="Threshold of score.")
+ parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.")
+ parser.add_argument("--threshold", type=float, default=0.5, help="Threshold of score.")
parser.add_argument(
"--output_dir",
type=str,
default="output",
- help="Directory of output visualization files.", )
+ help="Directory of output visualization files.",
+ )
parser.add_argument(
"--run_mode",
type=str,
default="paddle",
- help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", )
+ help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)",
+ )
parser.add_argument(
"--device",
type=str,
@@ -71,74 +70,70 @@ def argsparser():
"--use_gpu",
type=ast.literal_eval,
default=False,
- help="Deprecated, please use `--device`.", )
+ help="Deprecated, please use `--device`.",
+ )
parser.add_argument(
"--run_benchmark",
type=ast.literal_eval,
default=False,
- help="Whether to predict a image_file repeatedly for benchmark", )
+ help="Whether to predict a image_file repeatedly for benchmark",
+ )
parser.add_argument(
"--enable_mkldnn",
type=ast.literal_eval,
default=False,
- help="Whether use mkldnn with CPU.", )
+ help="Whether use mkldnn with CPU.",
+ )
parser.add_argument(
"--enable_mkldnn_bfloat16",
type=ast.literal_eval,
default=False,
- help="Whether use mkldnn bfloat16 inference with CPU.", )
- parser.add_argument(
- "--cpu_threads", type=int, default=1, help="Num of threads with CPU.")
- parser.add_argument(
- "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.")
- parser.add_argument(
- "--trt_max_shape",
- type=int,
- default=1280,
- help="max_shape for TensorRT.")
- parser.add_argument(
- "--trt_opt_shape",
- type=int,
- default=640,
- help="opt_shape for TensorRT.")
+ help="Whether use mkldnn bfloat16 inference with CPU.",
+ )
+ parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.")
+ parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.")
+ parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.")
+ parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.")
parser.add_argument(
"--trt_calib_mode",
type=bool,
default=False,
- help="If the model is produced by TRT offline quantitative "
- "calibration, trt_calib_mode need to set True.", )
+ help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.",
+ )
parser.add_argument(
"--save_images",
type=ast.literal_eval,
default=True,
- help="Save visualization image results.", )
- parser.add_argument(
- "--save_mot_txts",
- action="store_true",
- help="Save tracking results (txt).")
+ help="Save visualization image results.",
+ )
+ parser.add_argument("--save_mot_txts", action="store_true", help="Save tracking results (txt).")
parser.add_argument(
"--save_mot_txt_per_img",
action="store_true",
- help="Save tracking results (txt) for each image.", )
+ help="Save tracking results (txt) for each image.",
+ )
parser.add_argument(
"--scaled",
type=bool,
default=False,
- help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 "
- "True in general detector.", )
- parser.add_argument(
- "--tracker_config", type=str, default=None, help=("tracker donfig"))
+ help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.",
+ )
+ parser.add_argument("--tracker_config", type=str, default=None, help=("tracker donfig"))
parser.add_argument(
"--reid_model_dir",
type=str,
default=None,
- help=("Directory include:'model.pdiparams', 'model.pdmodel', "
- "'infer_cfg.yml', created by tools/export_model.py."), )
+ help=(
+ "Directory include:'model.pdiparams', 'model.pdmodel', "
+ "'infer_cfg.yml', created by tools/export_model.py."
+ ),
+ )
parser.add_argument(
"--reid_batch_size",
type=int,
default=50,
- help="max batch_size for reid model inference.", )
+ help="max batch_size for reid model inference.",
+ )
parser.add_argument(
"--use_dark",
type=ast.literal_eval,
@@ -149,27 +144,32 @@ def argsparser():
"--action_file",
type=str,
default=None,
- help="Path of input file for action recognition.", )
+ help="Path of input file for action recognition.",
+ )
parser.add_argument(
"--window_size",
type=int,
default=50,
- help="Temporal size of skeleton feature for action recognition.", )
+ help="Temporal size of skeleton feature for action recognition.",
+ )
parser.add_argument(
"--random_pad",
type=ast.literal_eval,
default=False,
- help="Whether do random padding for action recognition.", )
+ help="Whether do random padding for action recognition.",
+ )
parser.add_argument(
"--save_results",
action="store_true",
default=False,
- help="Whether save detection result to file using coco format", )
+ help="Whether save detection result to file using coco format",
+ )
parser.add_argument(
"--use_coco_category",
action="store_true",
default=False,
- help="Whether to use the coco format dictionary `clsid2catid`", )
+ help="Whether to use the coco format dictionary `clsid2catid`",
+ )
parser.add_argument(
"--slice_infer",
action="store_true",
@@ -180,13 +180,15 @@ def argsparser():
nargs="+",
type=int,
default=[640, 640],
- help="Height of the sliced image.", )
+ help="Height of the sliced image.",
+ )
parser.add_argument(
"--overlap_ratio",
nargs="+",
type=float,
default=[0.25, 0.25],
- help="Overlap height ratio of the sliced image.", )
+ help="Overlap height ratio of the sliced image.",
+ )
parser.add_argument(
"--combine_method",
type=str,
@@ -197,12 +199,14 @@ def argsparser():
"--match_threshold",
type=float,
default=0.6,
- help="Combine method matching threshold.", )
+ help="Combine method matching threshold.",
+ )
parser.add_argument(
"--match_metric",
type=str,
default="ios",
- help="Combine method matching metric, choose in ['iou', 'ios'].", )
+ help="Combine method matching metric, choose in ['iou', 'ios'].",
+ )
return parser
@@ -254,38 +258,34 @@ def info(self, average=False):
total_time = total_time + track_time
total_time = round(total_time, 4)
print("------------------ Inference Time Info ----------------------")
- print("total_time(ms): {}, img_num: {}".format(total_time * 1000,
- self.img_num))
- preprocess_time = (round(pre_time / max(1, self.img_num), 4)
- if average else pre_time)
- postprocess_time = (round(post_time / max(1, self.img_num), 4)
- if average else post_time)
- inference_time = (round(infer_time / max(1, self.img_num), 4)
- if average else infer_time)
- tracking_time = (round(track_time / max(1, self.img_num), 4)
- if average else track_time)
+ print("total_time(ms): {}, img_num: {}".format(total_time * 1000, self.img_num))
+ preprocess_time = round(pre_time / max(1, self.img_num), 4) if average else pre_time
+ postprocess_time = round(post_time / max(1, self.img_num), 4) if average else post_time
+ inference_time = round(infer_time / max(1, self.img_num), 4) if average else infer_time
+ tracking_time = round(track_time / max(1, self.img_num), 4) if average else track_time
average_latency = total_time / max(1, self.img_num)
qps = 0
if total_time > 0:
qps = 1 / average_latency
- print("average latency time(ms): {:.2f}, QPS: {:2f}".format(
- average_latency * 1000, qps))
+ print("average latency time(ms): {:.2f}, QPS: {:2f}".format(average_latency * 1000, qps))
if self.with_tracker:
print(
- "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".
- format(
+ "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".format(
preprocess_time * 1000,
inference_time * 1000,
postprocess_time * 1000,
- tracking_time * 1000, ))
+ tracking_time * 1000,
+ )
+ )
else:
print(
- "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".
- format(
+ "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".format(
preprocess_time * 1000,
inference_time * 1000,
- postprocess_time * 1000, ))
+ postprocess_time * 1000,
+ )
+ )
def report(self, average=False):
dic = {}
@@ -294,18 +294,13 @@ def report(self, average=False):
post_time = self.postprocess_time_s.value()
track_time = self.tracking_time_s.value()
- dic["preprocess_time_s"] = (round(pre_time / max(1, self.img_num), 4)
- if average else pre_time)
- dic["inference_time_s"] = (round(infer_time / max(1, self.img_num), 4)
- if average else infer_time)
- dic["postprocess_time_s"] = (round(post_time / max(1, self.img_num), 4)
- if average else post_time)
+ dic["preprocess_time_s"] = round(pre_time / max(1, self.img_num), 4) if average else pre_time
+ dic["inference_time_s"] = round(infer_time / max(1, self.img_num), 4) if average else infer_time
+ dic["postprocess_time_s"] = round(post_time / max(1, self.img_num), 4) if average else post_time
dic["img_num"] = self.img_num
total_time = pre_time + infer_time + post_time
if self.with_tracker:
- dic["tracking_time_s"] = (
- round(track_time / max(1, self.img_num), 4)
- if average else track_time)
+ dic["tracking_time_s"] = round(track_time / max(1, self.img_num), 4) if average else track_time
total_time = total_time + track_time
dic["total_time_s"] = round(total_time, 4)
return dic
@@ -513,10 +508,9 @@ def gaussian_radius(bbox_size, min_overlap):
def gaussian2D(shape, sigma_x=1, sigma_y=1):
m, n = [(ss - 1.0) / 2.0 for ss in shape]
- y, x = np.ogrid[-m:m + 1, -n:n + 1]
+ y, x = np.ogrid[-m : m + 1, -n : n + 1]
- h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y *
- sigma_y)))
+ h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y)))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
@@ -526,8 +520,7 @@ def draw_umich_gaussian(heatmap, center, radius, k=1):
draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126
"""
diameter = 2 * radius + 1
- gaussian = gaussian2D(
- (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)
+ gaussian = gaussian2D((diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)
x, y = int(center[0]), int(center[1])
@@ -536,9 +529,8 @@ def draw_umich_gaussian(heatmap, center, radius, k=1):
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
- masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
- masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
- radius + right]
+ masked_heatmap = heatmap[y - top : y + bottom, x - left : x + right]
+ masked_gaussian = gaussian[radius - top : radius + bottom, radius - left : radius + right]
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
return heatmap
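
gaussian2D() and draw_umich_gaussian() above build a small Gaussian window and splat it onto the heatmap with an element-wise maximum (the CenterNet-style target). A self-contained sketch under the same assumptions (square window, sigma = diameter / 6):

    import numpy as np

    def gaussian2d(diameter, sigma):
        m = (diameter - 1) / 2.0
        y, x = np.ogrid[-m : m + 1, -m : m + 1]
        h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
        h[h < np.finfo(h.dtype).eps * h.max()] = 0
        return h

    def splat(heatmap, center, radius):
        diameter = 2 * radius + 1
        g = gaussian2d(diameter, sigma=diameter / 6)
        x, y = int(center[0]), int(center[1])
        h_img, w_img = heatmap.shape
        left, right = min(x, radius), min(w_img - x, radius + 1)
        top, bottom = min(y, radius), min(h_img - y, radius + 1)
        window = heatmap[y - top : y + bottom, x - left : x + right]
        g_window = g[radius - top : radius + bottom, radius - left : radius + right]
        if min(window.shape) > 0 and min(g_window.shape) > 0:
            # In-place maximum writes through the view into the heatmap.
            np.maximum(window, g_window, out=window)
        return heatmap
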
diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py
index 3fdd640c1969b..6ea9f1b4a241b 100644
--- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py
+++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py
@@ -42,8 +42,7 @@ def visualize_box_mask(im, results, labels, threshold=0.5):
elif isinstance(im, np.ndarray):
im = Image.fromarray(im)
if "masks" in results and "boxes" in results and len(results["boxes"]) > 0:
- im = draw_mask(
- im, results["boxes"], results["masks"], labels, threshold=threshold)
+ im = draw_mask(im, results["boxes"], results["masks"], labels, threshold=threshold)
if "boxes" in results and len(results["boxes"]) > 0:
im = draw_box(im, results["boxes"], labels, threshold=threshold)
if "segm" in results:
@@ -53,7 +52,8 @@ def visualize_box_mask(im, results, labels, threshold=0.5):
results["label"],
results["score"],
labels,
- threshold=threshold, )
+ threshold=threshold,
+ )
return im
@@ -74,7 +74,7 @@ def get_color_map_list(num_classes):
color_map[i * 3 + 2] |= ((lab >> 2) & 1) << (7 - j)
j += 1
lab >>= 3
- color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
+ color_map = [color_map[i : i + 3] for i in range(0, len(color_map), 3)]
return color_map
@@ -141,40 +141,31 @@ def draw_box(im, np_boxes, labels, threshold=0.5):
if len(bbox) == 4:
xmin, ymin, xmax, ymax = bbox
- print("class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}],"
- "right_bottom:[{:.2f},{:.2f}]".format(
- int(clsid), score, xmin, ymin, xmax, ymax))
+ print(
+ "class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}],"
+ "right_bottom:[{:.2f},{:.2f}]".format(int(clsid), score, xmin, ymin, xmax, ymax)
+ )
# draw bbox
draw.line(
- [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
- (xmin, ymin)],
+ [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)],
width=draw_thickness,
- fill=color, )
+ fill=color,
+ )
elif len(bbox) == 8:
x1, y1, x2, y2, x3, y3, x4, y4 = bbox
- draw.line(
- [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
- width=2,
- fill=color)
+ draw.line([(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill=color)
xmin = min(x1, x2, x3, x4)
ymin = min(y1, y2, y3, y4)
# draw label
text = "{} {:.4f}".format(labels[clsid], score)
tw, th = draw.textsize(text)
- draw.rectangle(
- [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
+ draw.rectangle([(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
return im
-def draw_segm(im,
- np_segms,
- np_label,
- np_score,
- labels,
- threshold=0.5,
- alpha=0.7):
+def draw_segm(im, np_segms, np_label, np_score, labels, threshold=0.5, alpha=0.7):
"""
Draw segmentation on image
"""
@@ -204,8 +195,7 @@ def draw_segm(im,
sum_y = np.sum(mask, axis=1)
y = np.where(sum_y > 0.5)[0]
x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]
- cv2.rectangle(im, (x0, y0), (x1, y1),
- tuple(color_mask.astype("int32").tolist()), 1)
+ cv2.rectangle(im, (x0, y0), (x1, y1), tuple(color_mask.astype("int32").tolist()), 1)
bbox_text = "%s %.2f" % (labels[clsid], score)
t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
cv2.rectangle(
@@ -213,7 +203,8 @@ def draw_segm(im,
(x0, y0),
(x0 + t_size[0], y0 - t_size[1] - 3),
tuple(color_mask.astype("int32").tolist()),
- -1, )
+ -1,
+ )
cv2.putText(
im,
bbox_text,
@@ -222,7 +213,8 @@ def draw_segm(im,
0.3,
(0, 0, 0),
1,
- lineType=cv2.LINE_AA, )
+ lineType=cv2.LINE_AA,
+ )
return Image.fromarray(im.astype("uint8"))
@@ -233,20 +225,20 @@ def get_color(idx):
def visualize_pose(
- imgfile,
- results,
- visual_thresh=0.6,
- save_name="pose.jpg",
- save_dir="output",
- returnimg=False,
- ids=None, ):
+ imgfile,
+ results,
+ visual_thresh=0.6,
+ save_name="pose.jpg",
+ save_dir="output",
+ returnimg=False,
+ ids=None,
+):
try:
import matplotlib.pyplot as plt
plt.switch_backend("agg")
except Exception as e:
- print("Matplotlib not found, please install matplotlib."
- "for example: `pip install matplotlib`.")
+ print("Matplotlib not found, please install matplotlib." "for example: `pip install matplotlib`.")
raise e
skeletons, _ = results["keypoint"]
skeletons = np.array(skeletons)
@@ -323,8 +315,7 @@ def visualize_pose(
bboxs = results["bbox"]
for j, rect in enumerate(bboxs):
xmin, ymin, xmax, ymax = rect
- color = (colors[0] if color_set is None else
- colors[color_set[j] % len(colors)])
+ color = colors[0] if color_set is None else colors[color_set[j] % len(colors)]
cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1)
canvas = img.copy()
@@ -333,8 +324,7 @@ def visualize_pose(
if skeletons[j][i, 2] < visual_thresh:
continue
if ids is None:
- color = (colors[i] if color_set is None else
- colors[color_set[j] % len(colors)])
+ color = colors[i] if color_set is None else colors[color_set[j] % len(colors)]
else:
color = get_color(ids[j])
@@ -343,15 +333,15 @@ def visualize_pose(
tuple(skeletons[j][i, 0:2].astype("int32")),
2,
color,
- thickness=-1, )
+ thickness=-1,
+ )
stickwidth = 2
for i in range(NUM_EDGES):
for j in range(len(skeletons)):
edge = EDGES[i]
- if (skeletons[j][edge[0], 2] < visual_thresh or
- skeletons[j][edge[1], 2] < visual_thresh):
+ if skeletons[j][edge[0], 2] < visual_thresh or skeletons[j][edge[1], 2] < visual_thresh:
continue
cur_canvas = canvas.copy()
@@ -359,22 +349,18 @@ def visualize_pose(
Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]]
mX = np.mean(X)
mY = np.mean(Y)
- length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
- polygon = cv2.ellipse2Poly((int(mY), int(mX)),
- (int(length / 2), stickwidth),
- int(angle), 0, 360, 1)
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
if ids is None:
- color = (colors[i] if color_set is None else
- colors[color_set[j] % len(colors)])
+ color = colors[i] if color_set is None else colors[color_set[j] % len(colors)]
else:
color = get_color(ids[j])
cv2.fillConvexPoly(cur_canvas, polygon, color)
canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
if returnimg:
return canvas
- save_name = os.path.join(
- save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg")
+ save_name = os.path.join(save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg")
plt.imsave(save_name, canvas[:, :, ::-1])
print("keypoint visualize image saved to: " + save_name)
plt.close()
@@ -414,5 +400,6 @@ def visualize_attr(im, results, boxes=None, is_mtmct=False):
cv2.FONT_ITALIC,
text_scale,
(0, 255, 255),
- thickness=text_thickness, )
+ thickness=text_thickness,
+ )
return im
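
get_color_map_list() above generates the PASCAL-VOC-style palette by spreading the bits of each class id across the R, G and B channels. The per-class computation, written out as a small sketch (the helper name is ours):

    def voc_color(idx):
        # Spread the bits of the class id over R, G and B, most significant bit first.
        r = g = b = 0
        lab, j = idx, 0
        while lab:
            r |= ((lab >> 0) & 1) << (7 - j)
            g |= ((lab >> 1) & 1) << (7 - j)
            b |= ((lab >> 2) & 1) << (7 - j)
            j += 1
            lab >>= 3
        return r, g, b

    # e.g. voc_color(15) == (192, 128, 128)
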
diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py
index 53102b4c87bb4..1284578b851f1 100644
--- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py
@@ -32,71 +32,80 @@ def parse_args():
parser = argparse.ArgumentParser(description="Model prediction")
# params of prediction
- parser.add_argument(
- "--config", dest="cfg", help="The config file.", default=None, type=str)
+ parser.add_argument("--config", dest="cfg", help="The config file.", default=None, type=str)
parser.add_argument(
"--model_path",
dest="model_path",
help="The path of model for prediction",
type=str,
- default=None, )
+ default=None,
+ )
parser.add_argument(
"--image_path",
dest="image_path",
help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images",
type=str,
- default=None, )
+ default=None,
+ )
parser.add_argument(
"--save_dir",
dest="save_dir",
help="The directory for saving the predicted results",
type=str,
- default="./output/result", )
+ default="./output/result",
+ )
# augment for prediction
parser.add_argument(
"--aug_pred",
dest="aug_pred",
help="Whether to use mulit-scales and flip augment for prediction",
- action="store_true", )
+ action="store_true",
+ )
parser.add_argument(
"--scales",
dest="scales",
nargs="+",
help="Scales for augment",
type=float,
- default=1.0, )
+ default=1.0,
+ )
parser.add_argument(
"--flip_horizontal",
dest="flip_horizontal",
help="Whether to use flip horizontally augment",
- action="store_true", )
+ action="store_true",
+ )
parser.add_argument(
"--flip_vertical",
dest="flip_vertical",
help="Whether to use flip vertically augment",
- action="store_true", )
+ action="store_true",
+ )
# sliding window prediction
parser.add_argument(
"--is_slide",
dest="is_slide",
help="Whether to prediction by sliding window",
- action="store_true", )
+ action="store_true",
+ )
parser.add_argument(
"--crop_size",
dest="crop_size",
nargs=2,
help="The crop size of sliding window, the first is width and the second is height.",
type=int,
- default=None, )
+ default=None,
+ )
parser.add_argument(
"--stride",
dest="stride",
nargs=2,
help="The stride of sliding window, the first is width and the second is height.",
type=int,
- default=None, )
+ default=None,
+ )
# custom color map
parser.add_argument(
@@ -105,7 +114,8 @@ def parse_args():
nargs="+",
help="Save images with a custom color map. Default: None, use paddleseg's default color map.",
type=int,
- default=None, )
+ default=None,
+ )
# set device
parser.add_argument(
@@ -113,7 +123,8 @@ def parse_args():
dest="device",
help="Device place to be set, which can be GPU, XPU, NPU, CPU",
default="gpu",
- type=str, )
+ type=str,
+ )
return parser.parse_args()
@@ -301,8 +312,7 @@ def get_test_config(cfg, args):
def main(args):
env_info = get_sys_env()
- if (args.device == "gpu" and env_info["Paddle compiled with cuda"] and
- env_info["GPUs used"]):
+ if args.device == "gpu" and env_info["Paddle compiled with cuda"] and env_info["GPUs used"]:
place = "gpu"
elif args.device == "xpu" and paddle.is_compiled_with_xpu():
place = "xpu"
@@ -337,10 +347,13 @@ def main(args):
image_list=image_list,
image_dir=image_dir,
save_dir=args.save_dir,
- **test_config, )
+ **test_config,
+ )
-checkpoint_file = "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams"
+checkpoint_file = (
+ "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams"
+)
class SegformerDetector:
@@ -350,27 +363,21 @@ def __init__(self, mode):
"ade20k",
], f"mode should in {['cityscapes', 'ade20k']}!"
if mode == "cityscapes":
- segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path,
- "segformer_model")
- modelpath = os.path.join(segformer_annotator_ckpts_path,
- "model.pdparams")
+ segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model")
+ modelpath = os.path.join(segformer_annotator_ckpts_path, "model.pdparams")
if not os.path.exists(modelpath):
- from paddlenlp.utils.downloader import \
- get_path_from_url_with_filelock
+ from paddlenlp.utils.downloader import get_path_from_url_with_filelock
- get_path_from_url_with_filelock(
- checkpoint_file, root_dir=segformer_annotator_ckpts_path)
+ get_path_from_url_with_filelock(checkpoint_file, root_dir=segformer_annotator_ckpts_path)
self.model_path = modelpath
- cfg = (
- "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml"
- )
+ cfg = "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml"
else:
- segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path,
- "segformer_model")
+ segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model")
modelpath = os.path.join(
segformer_annotator_ckpts_path,
- "segformer_b5_ade20k_512x512_160k.pdparams", )
+ "segformer_b5_ade20k_512x512_160k.pdparams",
+ )
self.model_path = modelpath
@@ -404,9 +411,9 @@ def __call__(self, img):
save_dir="output",
skip_save=True,
custom_color=custom_color_flatten,
- **self.test_config, )
- pred_mask = cv2.cvtColor(
- np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR)
+ **self.test_config,
+ )
+ pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR)
return pred_mask
diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py
index 6077f36175759..5e1850259a3f1 100644
--- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py
+++ b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py
@@ -33,7 +33,7 @@ def mkdir(path):
def partition_list(arr, m):
"""split the list 'arr' into m pieces"""
n = int(math.ceil(len(arr) / float(m)))
- return [arr[i:i + n] for i in range(0, len(arr), n)]
+ return [arr[i : i + n] for i in range(0, len(arr), n)]
def preprocess(im_path, transforms):
@@ -47,20 +47,21 @@ def preprocess(im_path, transforms):
def predict(
- model,
- model_path,
- transforms,
- image_list,
- image_dir=None,
- save_dir="output",
- aug_pred=False,
- scales=1.0,
- flip_horizontal=True,
- flip_vertical=False,
- is_slide=False,
- stride=None,
- crop_size=None,
- custom_color=None, ):
+ model,
+ model_path,
+ transforms,
+ image_list,
+ image_dir=None,
+ save_dir="output",
+ aug_pred=False,
+ scales=1.0,
+ flip_horizontal=True,
+ flip_vertical=False,
+ is_slide=False,
+ stride=None,
+ crop_size=None,
+ custom_color=None,
+):
"""
predict and visualize the image_list.
@@ -112,7 +113,8 @@ def predict(
flip_vertical=flip_vertical,
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
else:
pred, _ = infer.inference(
model,
@@ -120,7 +122,8 @@ def predict(
trans_info=data["trans_info"],
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
pred = paddle.squeeze(pred)
pred = pred.numpy().astype("uint8")
@@ -133,16 +136,14 @@ def predict(
im_file = im_file[1:]
# save added image
- added_image = utils.visualize.visualize(
- im_path, pred, color_map, weight=0.6)
+ added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6)
added_image_path = os.path.join(added_saved_dir, im_file)
mkdir(added_image_path)
cv2.imwrite(added_image_path, added_image)
# save pseudo color prediction
pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
- pred_saved_path = os.path.join(
- pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
+ pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
mkdir(pred_saved_path)
pred_mask.save(pred_saved_path)
@@ -151,21 +152,22 @@ def predict(
def quick_predict(
- model,
- model_path,
- transforms,
- image_list,
- image_dir=None,
- save_dir="output",
- aug_pred=False,
- scales=1.0,
- flip_horizontal=True,
- flip_vertical=False,
- is_slide=False,
- stride=None,
- crop_size=None,
- custom_color=None,
- skip_save=True, ):
+ model,
+ model_path,
+ transforms,
+ image_list,
+ image_dir=None,
+ save_dir="output",
+ aug_pred=False,
+ scales=1.0,
+ flip_horizontal=True,
+ flip_vertical=False,
+ is_slide=False,
+ stride=None,
+ crop_size=None,
+ custom_color=None,
+ skip_save=True,
+):
"""
predict and visualize the image_list.
@@ -218,7 +220,8 @@ def quick_predict(
flip_vertical=flip_vertical,
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
else:
pred, _ = infer.inference(
model,
@@ -226,7 +229,8 @@ def quick_predict(
trans_info=data["trans_info"],
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
pred = paddle.squeeze(pred)
pred = pred.numpy().astype("uint8")
@@ -241,8 +245,7 @@ def quick_predict(
# save added image
if not skip_save:
- added_image = utils.visualize.visualize(
- im_path, pred, color_map, weight=0.6)
+ added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6)
added_image_path = os.path.join(added_saved_dir, im_file)
mkdir(added_image_path)
cv2.imwrite(added_image_path, added_image)
@@ -250,8 +253,7 @@ def quick_predict(
# save pseudo color prediction
pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
if not skip_save:
- pred_saved_path = os.path.join(
- pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
+ pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
mkdir(pred_saved_path)
pred_mask.save(pred_saved_path)
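
The --is_slide, --crop_size and --stride options above are forwarded to infer.inference(); the underlying idea is sliding-window prediction: run the model on overlapping crops and average the accumulated logits. A rough numpy sketch of that scheme, with an assumed tensor layout and run_model callable (this is not the PaddleSeg API):

    import numpy as np

    def window_starts(total, crop, stride):
        # Start positions that cover [0, total), always including the last edge.
        if total <= crop:
            return [0]
        starts = list(range(0, total - crop + 1, stride))
        if starts[-1] != total - crop:
            starts.append(total - crop)
        return starts

    def slide_inference(run_model, image, crop_size, stride):
        # image: (C, H, W); run_model(crop) returns (num_classes, h, w) logits.
        _, H, W = image.shape
        crop_w, crop_h = crop_size  # width first, matching the CLI help text
        stride_w, stride_h = stride
        logits, counts = None, np.zeros((1, H, W), dtype=np.float32)
        for y in window_starts(H, crop_h, stride_h):
            for x in window_starts(W, crop_w, stride_w):
                y2, x2 = min(y + crop_h, H), min(x + crop_w, W)
                out = run_model(image[:, y:y2, x:x2])
                if logits is None:
                    logits = np.zeros((out.shape[0], H, W), dtype=np.float32)
                logits[:, y:y2, x:x2] += out
                counts[:, y:y2, x:x2] += 1
        return logits / np.maximum(counts, 1)
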
diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py
index 76919bda8b88c..5d041d259a4ad 100644
--- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py
+++ b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py
@@ -32,71 +32,80 @@ def parse_args():
parser = argparse.ArgumentParser(description="Model prediction")
# params of prediction
- parser.add_argument(
- "--config", dest="cfg", help="The config file.", default=None, type=str)
+ parser.add_argument("--config", dest="cfg", help="The config file.", default=None, type=str)
parser.add_argument(
"--model_path",
dest="model_path",
help="The path of model for prediction",
type=str,
- default=None, )
+ default=None,
+ )
parser.add_argument(
"--image_path",
dest="image_path",
help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images",
type=str,
- default=None, )
+ default=None,
+ )
parser.add_argument(
"--save_dir",
dest="save_dir",
help="The directory for saving the predicted results",
type=str,
- default="./output/result", )
+ default="./output/result",
+ )
# augment for prediction
parser.add_argument(
"--aug_pred",
dest="aug_pred",
help="Whether to use mulit-scales and flip augment for prediction",
- action="store_true", )
+ action="store_true",
+ )
parser.add_argument(
"--scales",
dest="scales",
nargs="+",
help="Scales for augment",
type=float,
- default=1.0, )
+ default=1.0,
+ )
parser.add_argument(
"--flip_horizontal",
dest="flip_horizontal",
help="Whether to use flip horizontally augment",
- action="store_true", )
+ action="store_true",
+ )
parser.add_argument(
"--flip_vertical",
dest="flip_vertical",
help="Whether to use flip vertically augment",
- action="store_true", )
+ action="store_true",
+ )
# sliding window prediction
parser.add_argument(
"--is_slide",
dest="is_slide",
help="Whether to prediction by sliding window",
- action="store_true", )
+ action="store_true",
+ )
parser.add_argument(
"--crop_size",
dest="crop_size",
nargs=2,
help="The crop size of sliding window, the first is width and the second is height.",
type=int,
- default=None, )
+ default=None,
+ )
parser.add_argument(
"--stride",
dest="stride",
nargs=2,
help="The stride of sliding window, the first is width and the second is height.",
type=int,
- default=None, )
+ default=None,
+ )
# custom color map
parser.add_argument(
@@ -105,7 +114,8 @@ def parse_args():
nargs="+",
help="Save images with a custom color map. Default: None, use paddleseg's default color map.",
type=int,
- default=None, )
+ default=None,
+ )
# set device
parser.add_argument(
@@ -113,7 +123,8 @@ def parse_args():
dest="device",
help="Device place to be set, which can be GPU, XPU, NPU, CPU",
default="gpu",
- type=str, )
+ type=str,
+ )
return parser.parse_args()
@@ -301,8 +312,7 @@ def get_test_config(cfg, args):
def main(args):
env_info = get_sys_env()
- if (args.device == "gpu" and env_info["Paddle compiled with cuda"] and
- env_info["GPUs used"]):
+ if args.device == "gpu" and env_info["Paddle compiled with cuda"] and env_info["GPUs used"]:
place = "gpu"
elif args.device == "xpu" and paddle.is_compiled_with_xpu():
place = "xpu"
@@ -337,24 +347,23 @@ def main(args):
image_list=image_list,
image_dir=image_dir,
save_dir=args.save_dir,
- **test_config, )
+ **test_config,
+ )
-checkpoint_file = "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams"
+checkpoint_file = (
+ "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams"
+)
class SegmenterDetector:
def __init__(self):
- segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path,
- "segmenter_model")
- modelpath = os.path.join(segmenter_annotator_ckpts_path,
- "model.pdparams")
+ segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segmenter_model")
+ modelpath = os.path.join(segmenter_annotator_ckpts_path, "model.pdparams")
if not os.path.exists(modelpath):
- from paddlenlp.utils.downloader import \
- get_path_from_url_with_filelock
+ from paddlenlp.utils.downloader import get_path_from_url_with_filelock
- get_path_from_url_with_filelock(
- checkpoint_file, root_dir=segmenter_annotator_ckpts_path)
+ get_path_from_url_with_filelock(checkpoint_file, root_dir=segmenter_annotator_ckpts_path)
self.model_path = modelpath
cfg = "annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml"
@@ -385,9 +394,9 @@ def __call__(self, img):
save_dir="output",
skip_save=True,
custom_color=custom_color_flatten,
- **self.test_config, )
- pred_mask = cv2.cvtColor(
- np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR)
+ **self.test_config,
+ )
+ pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR)
return pred_mask
diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py
index 6077f36175759..5e1850259a3f1 100644
--- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py
+++ b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py
@@ -33,7 +33,7 @@ def mkdir(path):
def partition_list(arr, m):
"""split the list 'arr' into m pieces"""
n = int(math.ceil(len(arr) / float(m)))
- return [arr[i:i + n] for i in range(0, len(arr), n)]
+ return [arr[i : i + n] for i in range(0, len(arr), n)]
def preprocess(im_path, transforms):
@@ -47,20 +47,21 @@ def preprocess(im_path, transforms):
def predict(
- model,
- model_path,
- transforms,
- image_list,
- image_dir=None,
- save_dir="output",
- aug_pred=False,
- scales=1.0,
- flip_horizontal=True,
- flip_vertical=False,
- is_slide=False,
- stride=None,
- crop_size=None,
- custom_color=None, ):
+ model,
+ model_path,
+ transforms,
+ image_list,
+ image_dir=None,
+ save_dir="output",
+ aug_pred=False,
+ scales=1.0,
+ flip_horizontal=True,
+ flip_vertical=False,
+ is_slide=False,
+ stride=None,
+ crop_size=None,
+ custom_color=None,
+):
"""
predict and visualize the image_list.
@@ -112,7 +113,8 @@ def predict(
flip_vertical=flip_vertical,
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
else:
pred, _ = infer.inference(
model,
@@ -120,7 +122,8 @@ def predict(
trans_info=data["trans_info"],
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
pred = paddle.squeeze(pred)
pred = pred.numpy().astype("uint8")
@@ -133,16 +136,14 @@ def predict(
im_file = im_file[1:]
# save added image
- added_image = utils.visualize.visualize(
- im_path, pred, color_map, weight=0.6)
+ added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6)
added_image_path = os.path.join(added_saved_dir, im_file)
mkdir(added_image_path)
cv2.imwrite(added_image_path, added_image)
# save pseudo color prediction
pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
- pred_saved_path = os.path.join(
- pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
+ pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
mkdir(pred_saved_path)
pred_mask.save(pred_saved_path)
@@ -151,21 +152,22 @@ def predict(
def quick_predict(
- model,
- model_path,
- transforms,
- image_list,
- image_dir=None,
- save_dir="output",
- aug_pred=False,
- scales=1.0,
- flip_horizontal=True,
- flip_vertical=False,
- is_slide=False,
- stride=None,
- crop_size=None,
- custom_color=None,
- skip_save=True, ):
+ model,
+ model_path,
+ transforms,
+ image_list,
+ image_dir=None,
+ save_dir="output",
+ aug_pred=False,
+ scales=1.0,
+ flip_horizontal=True,
+ flip_vertical=False,
+ is_slide=False,
+ stride=None,
+ crop_size=None,
+ custom_color=None,
+ skip_save=True,
+):
"""
predict and visualize the image_list.
@@ -218,7 +220,8 @@ def quick_predict(
flip_vertical=flip_vertical,
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
else:
pred, _ = infer.inference(
model,
@@ -226,7 +229,8 @@ def quick_predict(
trans_info=data["trans_info"],
is_slide=is_slide,
stride=stride,
- crop_size=crop_size, )
+ crop_size=crop_size,
+ )
pred = paddle.squeeze(pred)
pred = pred.numpy().astype("uint8")
@@ -241,8 +245,7 @@ def quick_predict(
# save added image
if not skip_save:
- added_image = utils.visualize.visualize(
- im_path, pred, color_map, weight=0.6)
+ added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6)
added_image_path = os.path.join(added_saved_dir, im_file)
mkdir(added_image_path)
cv2.imwrite(added_image_path, added_image)
@@ -250,8 +253,7 @@ def quick_predict(
# save pseudo color prediction
pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
if not skip_save:
- pred_saved_path = os.path.join(
- pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
+ pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
mkdir(pred_saved_path)
pred_mask.save(pred_saved_path)
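
For orientation, the `utils.visualize.visualize(im_path, pred, color_map, weight=0.6)` call reflowed above blends the input image with a colored class mask. The helper below is a hypothetical stand-in written for illustration only (PaddleSeg's actual implementation may differ), assuming `pred` is an HxW class-index array matching the image size:

import cv2
import numpy as np

def overlay_prediction(im_path, pred, color_map, weight=0.6):
    """Illustrative alpha blend of the source image with a colored mask;
    `weight` is the weight given to the original image."""
    image = cv2.imread(im_path)
    colors = np.asarray(color_map, dtype=np.uint8).reshape(-1, 3)
    colored_mask = colors[pred]  # (H, W, 3) per-pixel class colors
    return cv2.addWeighted(image, weight, colored_mask, 1 - weight, 0)
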
diff --git a/ppdiffusers/examples/controlnet/annotator/util.py b/ppdiffusers/examples/controlnet/annotator/util.py
index 069005f683d59..7231c67ac5507 100644
--- a/ppdiffusers/examples/controlnet/annotator/util.py
+++ b/ppdiffusers/examples/controlnet/annotator/util.py
@@ -53,16 +53,15 @@ def resize_image(input_image, resolution):
img = cv2.resize(
input_image,
(W, H),
- interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, )
+ interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA,
+ )
return img
def make_noise_disk(H, W, C, F):
- noise = np.random.uniform(
- low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
- noise = cv2.resize(
- noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
- noise = noise[F:F + H, F:F + W]
+ noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
+ noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
+ noise = noise[F : F + H, F : F + W]
noise -= np.min(noise)
noise /= np.max(noise)
if C == 1:
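
A small demo of the interpolation choice reflowed in resize_image above: LANCZOS4 is used when enlarging (k > 1) and AREA when shrinking. The scale factor and target size below are made up for the demo; the real function derives them from the requested resolution:

import cv2
import numpy as np

img = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
k = 512 / min(img.shape[:2])  # >1 here, so the upscaling filter is chosen
W, H = 683, 512               # hypothetical target size
resized = cv2.resize(img, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
print(resized.shape)  # (512, 683, 3)
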
diff --git a/ppdiffusers/examples/controlnet/control/control_args.py b/ppdiffusers/examples/controlnet/control/control_args.py
index 82e5c32ab1181..6a688687e1a27 100644
--- a/ppdiffusers/examples/controlnet/control/control_args.py
+++ b/ppdiffusers/examples/controlnet/control/control_args.py
@@ -22,44 +22,28 @@ class ModelArguments:
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
- vae_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "pretrained_vae_name_or_path"})
- text_encoder_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "text_encoder_name_or_path"})
- unet_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "unet_encoder_name_or_path"})
+ vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"})
+ text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"})
+ unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"})
tokenizer_name: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained tokenizer name or path if not the same as model_name"
- }, )
- model_max_length: Optional[int] = field(
- default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
- num_inference_steps: Optional[int] = field(
- default=50, metadata={"help": "num_inference_steps"})
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
+ )
+ model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
+ num_inference_steps: Optional[int] = field(default=50, metadata={"help": "num_inference_steps"})
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"})
pretrained_model_name_or_path: str = field(
default="runwayml/stable-diffusion-v1-5",
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
- image_logging_steps: Optional[int] = field(
- default=1000, metadata={"help": "Log image every X steps."})
- sd_locked: bool = field(
- default=True, metadata={"help": "lock unet output_blocks and out."})
- use_paddle_conv_init: bool = field(
- default=False,
- metadata={"help": "Whether or not use paddle conv2d init."})
- only_mid_control: bool = field(
- default=False, metadata={"help": "only_mid_control."})
- is_ldmbert: bool = field(
- default=False, metadata={"help": "Whether to use ldmbert."})
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
+ image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."})
+ sd_locked: bool = field(default=True, metadata={"help": "lock unet output_blocks and out."})
+ use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."})
+ only_mid_control: bool = field(default=False, metadata={"help": "only_mid_control."})
+ is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable_xformers_memory_efficient_attention."})
+ default=False, metadata={"help": "enable_xformers_memory_efficient_attention."}
+ )
@dataclass
@@ -71,8 +55,7 @@ class DataArguments:
resolution: int = field(
default=512,
metadata={
- "help":
- "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
- }, )
- file_path: str = field(
- default="./fill50k", metadata={"help": "The path to of the fill50k."})
+ "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
+ },
+ )
+ file_path: str = field(default="./fill50k", metadata={"help": "The path to of the fill50k."})
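
The dataclasses above are meant to be filled from the command line. A minimal entry-point sketch, assuming the usual paddlenlp PdArgumentParser pattern used by these examples and the `control` package layout of this directory (both assumptions, not part of this diff):

from paddlenlp.trainer import PdArgumentParser, TrainingArguments

from control.control_args import DataArguments, ModelArguments

# e.g. python train.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
#        --file_path ./fill50k --output_dir ./checkpoints
parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
print(model_args.sd_locked, data_args.resolution)
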
diff --git a/ppdiffusers/examples/controlnet/control/control_trainer.py b/ppdiffusers/examples/controlnet/control/control_trainer.py
index 506dfc88664cb..0b40903ded378 100644
--- a/ppdiffusers/examples/controlnet/control/control_trainer.py
+++ b/ppdiffusers/examples/controlnet/control/control_trainer.py
@@ -18,8 +18,11 @@
import paddle.amp.auto_cast as autocast
from paddlenlp.trainer import Trainer
-from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK,
- VisualDLCallback, rewrite_logs)
+from paddlenlp.trainer.integrations import (
+ INTEGRATION_TO_CALLBACK,
+ VisualDLCallback,
+ rewrite_logs,
+)
from paddlenlp.utils.log import logger
from ppdiffusers.training_utils import unwrap_model
@@ -36,19 +39,17 @@ def autocast_smart_context_manager(self, args):
"c_softmax_with_cross_entropy",
],
level=args.fp16_opt_level,
- dtype=amp_dtype, )
+ dtype=amp_dtype,
+ )
else:
- ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
return ctx_manager
def on_step_end(self, args, state, control, model=None, **kwargs):
if hasattr(model, "on_train_batch_end"):
model.on_train_batch_end()
- if (args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0:
control.should_log = True
def on_log(self, args, state, control, logs=None, **kwargs):
@@ -58,20 +59,22 @@ def on_log(self, args, state, control, logs=None, **kwargs):
inputs = kwargs.get("inputs", None)
model = kwargs.get("model", None)
image_logs = {}
- if (inputs is not None and model is not None and
- args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if (
+ inputs is not None
+ and model is not None
+ and args.image_logging_steps > 0
+ and state.global_step % args.image_logging_steps == 0
+ ):
with self.autocast_smart_context_manager(args):
- image_logs["reconstruction"] = model.decode_image(
- pixel_values=inputs["pixel_values"])
- image_logs["control"] = model.decode_control_image(
- controlnet_cond=inputs["controlnet_cond"])
+ image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"])
+ image_logs["control"] = model.decode_control_image(controlnet_cond=inputs["controlnet_cond"])
image_logs["ddim-samples-9.0"] = model.log_image(
input_ids=inputs["input_ids"],
controlnet_cond=inputs["controlnet_cond"],
guidance_scale=9.0,
height=args.resolution,
- width=args.resolution, )
+ width=args.resolution,
+ )
if self.vdl_writer is None:
self._init_summary_writer(args)
@@ -86,11 +89,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of VisualDL's writer.add_scalar() "
- "is incorrect so we dropped this attribute.")
+ "is incorrect so we dropped this attribute."
+ )
# log images
for k, v in image_logs.items():
- self.vdl_writer.add_image(
- k, v, state.global_step, dataformats="NHWC")
+ self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC")
self.vdl_writer.flush()
@@ -103,14 +106,11 @@ def compute_loss(self, model, inputs, return_outputs=False):
loss = model(**inputs)
return loss
- def _save(self,
- output_dir=None,
- state_dict=None,
- merge_tensor_parallel=False):
+ def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False):
super()._save(
output_dir=output_dir,
state_dict=state_dict,
- merge_tensor_parallel=merge_tensor_parallel, )
+ merge_tensor_parallel=merge_tensor_parallel,
+ )
output_dir = output_dir if output_dir is not None else self.args.output_dir
- unwrap_model(self.model).controlnet.save_pretrained(
- os.path.join(output_dir, "controlnet"))
+ unwrap_model(self.model).controlnet.save_pretrained(os.path.join(output_dir, "controlnet"))
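
The multi-line condition reflowed in on_log above gates image logging on four checks. A tiny standalone sketch of that gate (names here are illustrative, not the Trainer callback API):

def should_log_images(inputs, model, image_logging_steps, global_step):
    return (
        inputs is not None
        and model is not None
        and image_logging_steps > 0
        and global_step % image_logging_steps == 0
    )

assert should_log_images({"x": 1}, object(), 1000, 2000) is True
assert should_log_images({"x": 1}, object(), 1000, 1500) is False
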
diff --git a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py
index 78c3c2bfdbf84..c67eca10fb034 100644
--- a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py
+++ b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py
@@ -35,7 +35,8 @@ def __init__(self, tokenizer, file_path="./fill50k"):
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="np", ).input_ids[0]
+ return_tensors="np",
+ ).input_ids[0]
def __len__(self):
return len(self.data)
@@ -63,9 +64,7 @@ def __getitem__(self, idx):
input_ids = self.text_processing(prompt)
return dict(
- input_ids=paddle.to_tensor(
- input_ids, dtype=paddle.int64),
- pixel_values=paddle.to_tensor(
- target.transpose([2, 0, 1]), dtype=paddle.float32),
- controlnet_cond=paddle.to_tensor(
- source.transpose([2, 0, 1]), dtype=paddle.float32), )
+ input_ids=paddle.to_tensor(input_ids, dtype=paddle.int64),
+ pixel_values=paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32),
+ controlnet_cond=paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32),
+ )
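
For reference, the dict returned by __getitem__ above follows the usual fill50k conventions: int64 token ids plus two CHW float32 images (the value ranges of roughly [0, 1] for the conditioning image and [-1, 1] for the target are assumed here, as in the upstream ControlNet tutorial). A minimal sketch of one sample with dummy data:

import numpy as np
import paddle

source = np.random.rand(512, 512, 3).astype("float32")               # conditioning image, [0, 1]
target = (np.random.rand(512, 512, 3).astype("float32") * 2.0 - 1.0)  # target image, [-1, 1]
sample = dict(
    input_ids=paddle.to_tensor(np.zeros(77), dtype=paddle.int64),
    pixel_values=paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32),
    controlnet_cond=paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32),
)
print(sample["pixel_values"].shape)  # [3, 512, 512]
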
diff --git a/ppdiffusers/examples/controlnet/control/model.py b/ppdiffusers/examples/controlnet/control/model.py
index de2bfb4ee5d47..c0d86532d5021 100644
--- a/ppdiffusers/examples/controlnet/control/model.py
+++ b/ppdiffusers/examples/controlnet/control/model.py
@@ -22,9 +22,15 @@
from paddlenlp.transformers import AutoTokenizer, CLIPTextModel
from paddlenlp.utils.log import logger
-from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler,
- DDPMScheduler, LDMBertModel, UNet2DConditionModel,
- is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ DDPMScheduler,
+ LDMBertModel,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.initializer import reset_initialized_parameter
from ppdiffusers.models.ema import LitEma
from ppdiffusers.training_utils import freeze_params
@@ -42,18 +48,20 @@ def __init__(self, model_args):
# init tokenizer
tokenizer_name_or_path = (
model_args.tokenizer_name
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "tokenizer"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")
+ )
self.tokenizer = AutoTokenizer.from_pretrained(
- tokenizer_name_or_path,
- model_max_length=model_args.model_max_length)
+ tokenizer_name_or_path, model_max_length=model_args.model_max_length
+ )
vae_name = "vqvae" if model_args.is_ldmbert else "vae"
# init vae
vae_name_or_path = (
model_args.vae_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, vae_name))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, vae_name)
+ )
self.vae = AutoencoderKL.from_pretrained(vae_name_or_path)
freeze_params(self.vae.parameters())
@@ -62,55 +70,54 @@ def __init__(self, model_args):
if model_args.is_ldmbert:
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "bert"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "bert")
+ )
# init text_encoder
- self.text_encoder = LDMBertModel.from_pretrained(
- text_encoder_name_or_path)
+ self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path)
else:
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path,
- "text_encoder"))
- self.text_encoder = CLIPTextModel.from_pretrained(
- text_encoder_name_or_path)
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder")
+ )
+ self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path)
freeze_params(self.text_encoder.parameters())
logger.info("Freeze text_encoder parameters!")
unet_name_or_path = (
model_args.unet_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "unet"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "unet")
+ )
self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path)
freeze_params(self.unet.parameters())
logger.info("Freeze unet parameters!")
- self.controlnet = ControlNetModel.from_unet(
- self.unet, load_weights_from_unet=True)
+ self.controlnet = ControlNetModel.from_unet(self.unet, load_weights_from_unet=True)
if not model_args.use_paddle_conv_init:
# use torch conv2d init
- reset_initialized_parameter(
- self.controlnet.controlnet_cond_embedding.conv_in)
- reset_initialized_parameter(
- self.controlnet.controlnet_cond_embedding.blocks)
+ reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.conv_in)
+ reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.blocks)
self.noise_scheduler = DDPMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+ )
self.eval_scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
self.eval_scheduler.set_timesteps(model_args.num_inference_steps)
self.use_ema = model_args.use_ema
if self.use_ema:
@@ -118,15 +125,15 @@ def __init__(self, model_args):
self.control_scales = [1.0] * 13
self.only_mid_control = model_args.only_mid_control
- if (model_args.enable_xformers_memory_efficient_attention and
- is_ppxformers_available()):
+ if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
self.unet.enable_xformers_memory_efficient_attention()
self.controlnet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
@contextlib.contextmanager
def ema_scope(self, context=None):
@@ -147,11 +154,7 @@ def on_train_batch_end(self):
if self.use_ema:
self.model_ema(self.controlnet)
- def forward(self,
- input_ids=None,
- pixel_values=None,
- controlnet_cond=None,
- **kwargs):
+ def forward(self, input_ids=None, pixel_values=None, controlnet_cond=None, **kwargs):
self.train()
with paddle.amp.auto_cast(enable=False):
with paddle.no_grad():
@@ -160,11 +163,10 @@ def forward(self,
latents = self.vae.encode(pixel_values).latent_dist.sample()
latents = latents * 0.18215
noise = paddle.randn(latents.shape)
- timesteps = paddle.randint(
- 0, self.noise_scheduler.num_train_timesteps,
- (latents.shape[0], )).astype("int64")
- noisy_latents = self.noise_scheduler.add_noise(latents, noise,
- timesteps)
+ timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype(
+ "int64"
+ )
+ noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
encoder_hidden_states = self.text_encoder(input_ids)[0]
# control
down_block_res_samples, mid_block_res_sample = self.controlnet(
@@ -173,7 +175,8 @@ def forward(self,
encoder_hidden_states=encoder_hidden_states,
controlnet_cond=controlnet_cond,
conditioning_scale=self.control_scales,
- return_dict=False, )
+ return_dict=False,
+ )
# predict the noise residual
noise_pred = self.unet(
@@ -181,7 +184,8 @@ def forward(self,
timestep=timesteps,
encoder_hidden_states=encoder_hidden_states,
down_block_additional_residuals=down_block_res_samples,
- mid_block_additional_residual=mid_block_res_sample, ).sample
+ mid_block_additional_residual=mid_block_res_sample,
+ ).sample
loss = F.mse_loss(noise_pred, noise, reduction="mean")
return loss
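
The hunks above only reflow the training step; as a reference, here is a condensed sketch of the epsilon-prediction objective it implements (stand-in objects, per-block conditioning scales and autocast handling omitted for brevity):

import paddle
import paddle.nn.functional as F

def training_loss(vae, text_encoder, controlnet, unet, scheduler, input_ids, pixel_values, controlnet_cond):
    # encode image to latents, add noise at a random timestep, predict the noise
    latents = vae.encode(pixel_values).latent_dist.sample() * 0.18215
    noise = paddle.randn(latents.shape)
    timesteps = paddle.randint(0, scheduler.num_train_timesteps, (latents.shape[0],)).astype("int64")
    noisy_latents = scheduler.add_noise(latents, noise, timesteps)
    states = text_encoder(input_ids)[0]
    down_res, mid_res = controlnet(
        noisy_latents, timesteps, encoder_hidden_states=states,
        controlnet_cond=controlnet_cond, return_dict=False,
    )
    pred = unet(
        noisy_latents, timestep=timesteps, encoder_hidden_states=states,
        down_block_additional_residuals=down_res,
        mid_block_additional_residual=mid_res,
    ).sample
    return F.mse_loss(pred, noise, reduction="mean")
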
@@ -198,25 +202,23 @@ def decode_image(self, pixel_values=None, **kwargs):
@paddle.no_grad()
def decode_control_image(self, controlnet_cond=None, **kwargs):
- return ((255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32")
- .numpy().round())
+ return (255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round()
@paddle.no_grad()
def log_image(
- self,
- input_ids=None,
- controlnet_cond=None,
- height=512,
- width=512,
- eta=0.0,
- guidance_scale=7.5,
- **kwargs, ):
+ self,
+ input_ids=None,
+ controlnet_cond=None,
+ height=512,
+ width=512,
+ eta=0.0,
+ guidance_scale=7.5,
+ **kwargs,
+ ):
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log 8 image
if input_ids.shape[0] > 4:
input_ids = input_ids[:4]
@@ -230,34 +232,30 @@ def log_image(
padding="max_length",
truncation=True,
max_length=max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings], axis=0)
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0)
- latents = paddle.randn((input_ids.shape[0], self.unet.in_channels,
- height // 8, width // 8))
+ latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8))
# ddim donot use this
latents = latents * self.eval_scheduler.init_noise_sigma
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
- controlnet_cond_input = (paddle.concat([controlnet_cond] * 2)
- if do_classifier_free_guidance else
- controlnet_cond)
+ controlnet_cond_input = (
+ paddle.concat([controlnet_cond] * 2) if do_classifier_free_guidance else controlnet_cond
+ )
for t in self.eval_scheduler.timesteps:
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
# ddim donot use this
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
# ControlNet predict the noise residual
down_block_res_samples, mid_block_res_sample = self.controlnet(
@@ -266,7 +264,8 @@ def log_image(
encoder_hidden_states=text_embeddings,
controlnet_cond=controlnet_cond_input,
conditioning_scale=self.control_scales,
- return_dict=False, )
+ return_dict=False,
+ )
# predict the noise residual
noise_pred = self.unet(
@@ -274,17 +273,16 @@ def log_image(
t,
encoder_hidden_states=text_embeddings,
down_block_additional_residuals=down_block_res_samples,
- mid_block_additional_residual=mid_block_res_sample, ).sample
+ mid_block_additional_residual=mid_block_res_sample,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.eval_scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
latents = 1 / 0.18215 * latents
image = self.vae.decode(latents).sample
@@ -296,7 +294,6 @@ def set_recompute(self, value=False):
def fn(layer):
if hasattr(layer, "gradient_checkpointing"):
layer.gradient_checkpointing = value
- print("Set", layer.__class__, "recompute",
- layer.gradient_checkpointing)
+ print("Set", layer.__class__, "recompute", layer.gradient_checkpointing)
self.controlnet.apply(fn)
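
The guidance step reflowed in log_image above is plain classifier-free guidance: the unconditional and text-conditioned noise predictions are mixed with the guidance scale. A minimal sketch with dummy tensors:

import paddle

noise_pred = paddle.randn([2, 4, 64, 64])  # stacked [uncond, text] predictions
guidance_scale = 9.0
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(guided.shape)  # [1, 4, 64, 64]
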
diff --git a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py
index 0cb439f90dd2b..17582dd93e648 100644
--- a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py
+++ b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py
@@ -24,13 +24,11 @@ def extract_controlnet_ema_weights(model_path, output_path):
for k in state_dict.keys():
if k.startswith("controlnet."):
flat_ema_key = "model_ema." + "".join(k.split(".")[1:])
- ema_state_dict[k.replace("controlnet.", "")] = state_dict.get(
- flat_ema_key)
+ ema_state_dict[k.replace("controlnet.", "")] = state_dict.get(flat_ema_key)
if len(ema_state_dict) == 0:
raise ValueError("Can not extract ema weights!")
os.makedirs(output_path, exist_ok=True)
- paddle.save(ema_state_dict,
- os.path.join(output_path, "model_state.ema.pdparams"))
+ paddle.save(ema_state_dict, os.path.join(output_path, "model_state.ema.pdparams"))
print(f"Save EMA weights to {output_path} !")
@@ -40,11 +38,13 @@ def extract_controlnet_ema_weights(model_path, output_path):
"--model_path",
type=str,
default="./model_state.pdparams",
- help="model_state.", )
+ help="model_state.",
+ )
parser.add_argument(
"--output_path",
type=str,
default="ema_controlnet",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
extract_controlnet_ema_weights(args.model_path, args.output_path)
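
A toy illustration of the key mapping above: EMA copies live under "model_ema." with the dots of the original parameter name removed, and extraction re-keys them back to plain ControlNet parameter names.

# usage: python extract_controlnet_ema_weights.py --model_path ./model_state.pdparams --output_path ema_controlnet
state_dict = {
    "controlnet.conv_in.weight": "live-weight",
    "model_ema.conv_inweight": "ema-weight",
}
ema_state_dict = {}
for k in state_dict.keys():
    if k.startswith("controlnet."):
        flat_ema_key = "model_ema." + "".join(k.split(".")[1:])
        ema_state_dict[k.replace("controlnet.", "")] = state_dict.get(flat_ema_key)
print(ema_state_dict)  # {'conv_in.weight': 'ema-weight'}
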
diff --git a/ppdiffusers/examples/controlnet/gradio_canny2image.py b/ppdiffusers/examples/controlnet/gradio_canny2image.py
index 5dc43a6ca4f8e..5c0ad9e936299 100644
--- a/ppdiffusers/examples/controlnet/gradio_canny2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_canny2image.py
@@ -27,39 +27,37 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta,
- low_threshold,
- high_threshold, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+ low_threshold,
+ high_threshold,
+):
with paddle.no_grad():
img = resize_image(HWC3(input_image), image_resolution)
H, W, C = img.shape
detected_map = apply_canny(img, low_threshold, high_threshold)
detected_map = HWC3(detected_map)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -75,7 +73,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [255 - detected_map] + results
@@ -91,59 +90,55 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
low_threshold = gr.Slider(
label="Canny low threshold",
minimum=1,
maximum=255,
value=100,
- step=1, )
+ step=1,
+ )
high_threshold = gr.Slider(
label="Canny high threshold",
minimum=1,
maximum=255,
value=200,
- step=1, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=1,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
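
The guess-mode scales reflowed above give each of the 13 ControlNet residual blocks a geometrically decaying weight, while normal mode uses the same strength for all 13. A quick standalone check of the values:

strength = 1.0
guess_mode = True
control_scales = (
    [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
)
print(len(control_scales), round(control_scales[0], 3), round(control_scales[-1], 3))
# 13 blocks, roughly 0.099 for the deepest residual up to 1.0 for the last
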
diff --git a/ppdiffusers/examples/controlnet/gradio_depth2image.py b/ppdiffusers/examples/controlnet/gradio_depth2image.py
index 63b50704b9bff..67f33684cc947 100644
--- a/ppdiffusers/examples/controlnet/gradio_depth2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_depth2image.py
@@ -28,37 +28,34 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
- detected_map, _ = apply_midas(
- resize_image(input_image, detect_resolution))
+ detected_map, _ = apply_midas(resize_image(input_image, detect_resolution))
detected_map = HWC3(detected_map)
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
if seed == -1:
@@ -75,7 +72,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=1.0,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -91,53 +89,48 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="Depth Resolution",
minimum=128,
maximum=1024,
value=384,
- step=1, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=1,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
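
The control-map preprocessing shared by these gradio demos is also visible in the hunks above: the uint8 HWC detected map is scaled to [0, 1] and reshaped to a 1xCxHxW conditioning tensor. A minimal sketch with a dummy map:

import numpy as np
import paddle

detected_map = np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8)
control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
print(control.shape)  # [1, 3, 512, 512]
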
diff --git a/ppdiffusers/examples/controlnet/gradio_hed2image.py b/ppdiffusers/examples/controlnet/gradio_hed2image.py
index 87e37dccb3043..9394f85ba697d 100644
--- a/ppdiffusers/examples/controlnet/gradio_hed2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_hed2image.py
@@ -28,25 +28,25 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
detected_map = apply_hed(resize_image(input_image, detect_resolution))
@@ -54,16 +54,13 @@ def process(
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -79,7 +76,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -95,53 +93,42 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
- detect_resolution = gr.Slider(
- label="HED Resolution",
- minimum=128,
- maximum=1024,
- value=512,
- step=1)
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1)
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/gradio_hough2image.py b/ppdiffusers/examples/controlnet/gradio_hough2image.py
index eef44cc32f7b0..65ff6c1410769 100644
--- a/ppdiffusers/examples/controlnet/gradio_hough2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_hough2image.py
@@ -28,46 +28,44 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta,
- value_threshold,
- distance_threshold, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+ value_threshold,
+ distance_threshold,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
detected_map = apply_mlsd(
resize_image(input_image, detect_resolution),
value_threshold,
- distance_threshold, )
+ distance_threshold,
+ )
detected_map = HWC3(detected_map)
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -83,7 +81,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -99,65 +98,62 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="Hough Line Resolution",
minimum=128,
maximum=1024,
value=512,
- step=1, )
+ step=1,
+ )
value_threshold = gr.Slider(
label="Hough value threshold (MLSD)",
minimum=0.01,
maximum=2.0,
value=0.1,
- step=0.01, )
+ step=0.01,
+ )
distance_threshold = gr.Slider(
label="Hough distance threshold (MLSD)",
minimum=0.01,
maximum=20.0,
value=0.1,
- step=0.01, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=0.01,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py
index 0d23830f2c4be..7f164b57c63be 100644
--- a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py
@@ -23,41 +23,37 @@
from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline
-controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/control_v11e_sd15_ip2p")
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_ip2p")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
img = resize_image(HWC3(input_image), image_resolution)
detected_map = input_image.copy()
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -73,7 +69,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -89,47 +86,41 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/gradio_normal2image.py b/ppdiffusers/examples/controlnet/gradio_normal2image.py
index 6ce2e56d8ea3c..69bf238fe4521 100644
--- a/ppdiffusers/examples/controlnet/gradio_normal2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_normal2image.py
@@ -28,43 +28,39 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta,
- bg_threshold, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+ bg_threshold,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
- _, detected_map = apply_midas(
- resize_image(input_image, detect_resolution), bg_th=bg_threshold)
+ _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold)
detected_map = HWC3(detected_map)
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -80,7 +76,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -96,59 +93,55 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="Normal Resolution",
minimum=128,
maximum=1024,
value=384,
- step=1, )
+ step=1,
+ )
bg_threshold = gr.Slider(
label="Normal background threshold",
minimum=0.0,
maximum=1.0,
value=0.4,
- step=0.01, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=0.01,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py
index 07a52bcf286d3..e932854042b60 100644
--- a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py
+++ b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py
@@ -26,45 +26,41 @@
apply_openpose = OpenposePaddleDetector()
-controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-openpose")
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- hand,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ hand,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
detected_map, _ = apply_openpose(input_image, detect_resolution, hand)
detected_map = HWC3(detected_map)
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -80,7 +76,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -97,53 +94,48 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="OpenPose Resolution",
minimum=128,
maximum=1024,
value=512,
- step=1, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=1,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
hand,
diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py
index 51a713db003db..097bbd83516d3 100644
--- a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py
+++ b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py
@@ -26,45 +26,41 @@
apply_ppdetpose = PPDetDetector()
-controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-openpose")
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- hand,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ hand,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
detected_map, _ = apply_ppdetpose(input_image, detect_resolution, hand)
detected_map = HWC3(detected_map)
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -80,7 +76,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -97,53 +94,48 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="OpenPose Resolution",
minimum=128,
maximum=1024,
value=512,
- step=1, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=1,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
hand,
diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py
index 0d89c0899ecb4..1e8bd335f71a5 100644
--- a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py
+++ b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py
@@ -28,42 +28,38 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
- detected_map = apply_uniformer(
- resize_image(input_image, detect_resolution))
+ detected_map = apply_uniformer(resize_image(input_image, detect_resolution))
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -79,7 +75,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
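The tensor preparation reflowed above converts the detector output from an H x W x C uint8 image into the N x C x H x W float layout the pipeline expects. A NumPy-only sketch of the same shape and scale transformation (illustrative stand-in values, not the repo's code):

import numpy as np

detected_map = np.zeros((512, 512, 3), dtype=np.uint8)   # stand-in for the segmentation map
control = detected_map.astype(np.float32) / 255.0         # scale to [0, 1], like paddle.to_tensor(...) / 255.0
control = control[None].transpose(0, 3, 1, 2)             # unsqueeze(0) + transpose([0, 3, 1, 2])
print(control.shape)  # (1, 3, 512, 512)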
@@ -95,53 +92,48 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="Segmentation Resolution",
minimum=128,
maximum=1024,
value=512,
- step=1, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=1,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py
index b517ba3b94cc4..a99e82a4ea7e5 100644
--- a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py
+++ b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py
@@ -28,42 +28,38 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- detect_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ detect_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
input_image = HWC3(input_image)
- detected_map = apply_uniformer(
- resize_image(input_image, detect_resolution))
+ detected_map = apply_uniformer(resize_image(input_image, detect_resolution))
img = resize_image(input_image, image_resolution)
H, W, C = img.shape
- detected_map = cv2.resize(
- detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = (
- [strength * (0.825**float(12 - i)) for i in range(13)]
- if guess_mode else ([strength] * 13)
+ [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
if seed == -1:
seed = random.randint(0, 65535)
@@ -79,7 +75,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
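Nearly every hunk in these Gradio demos applies the same two mechanical rules, consistent with black-style formatting (the patch itself does not name the formatter or line length): a call that fits on one line is collapsed, and a call that does not fit keeps one argument per line, gains a trailing comma, and puts the closing parenthesis on its own line. A generic illustration of the rule, not a line from this patch:

def f(a, b, c):
    return a + b + c

# Collapsed when the call fits on one line:
short = f(1, 2, 3)

# Exploded with a "magic trailing comma" when it does not:
long = f(
    1,
    2,
    3,
)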
@@ -95,53 +92,48 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
detect_resolution = gr.Slider(
label="Segmentation Resolution",
minimum=128,
maximum=1024,
value=512,
- step=1, )
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ step=1,
+ )
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py
index da5f05c890081..0e6313d0d407c 100644
--- a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py
+++ b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py
@@ -25,34 +25,32 @@
apply_shuffle = ContentShuffleDetector()
-controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/control_v11e_sd15_shuffle")
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
def process(
- input_image,
- prompt,
- a_prompt,
- n_prompt,
- num_samples,
- image_resolution,
- ddim_steps,
- guess_mode,
- strength,
- scale,
- seed,
- eta, ):
+ input_image,
+ prompt,
+ a_prompt,
+ n_prompt,
+ num_samples,
+ image_resolution,
+ ddim_steps,
+ guess_mode,
+ strength,
+ scale,
+ seed,
+ eta,
+):
with paddle.no_grad():
img = resize_image(HWC3(input_image), image_resolution)
H, W, C = img.shape
detected_map = apply_shuffle(img, w=W, h=H, f=256)
- control = paddle.to_tensor(
- detected_map.copy(), dtype=paddle.float32) / 255.0
+ control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
control = control.unsqueeze(0).transpose([0, 3, 1, 2])
control_scales = [strength] * 13
@@ -70,7 +68,8 @@ def process(
width=W,
eta=eta,
controlnet_conditioning_scale=control_scales,
- guidance_scale=scale, ).images[0]
+ guidance_scale=scale,
+ ).images[0]
results.append(img)
return [detected_map] + results
@@ -86,47 +85,41 @@ def process(
prompt = gr.Textbox(label="Prompt")
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
- num_samples = gr.Slider(
- label="Images", minimum=1, maximum=12, value=1, step=1)
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
image_resolution = gr.Slider(
label="Image Resolution",
minimum=256,
maximum=768,
value=512,
- step=64, )
+ step=64,
+ )
strength = gr.Slider(
label="Control Strength",
minimum=0.0,
maximum=2.0,
value=1.0,
- step=0.01, )
+ step=0.01,
+ )
guess_mode = gr.Checkbox(label="Guess Mode", value=False)
- ddim_steps = gr.Slider(
- label="Steps", minimum=1, maximum=100, value=20, step=1)
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
scale = gr.Slider(
label="Guidance Scale",
minimum=0.1,
maximum=30.0,
value=9.0,
- step=0.1, )
- seed = gr.Slider(
- label="Seed",
- minimum=-1,
- maximum=2147483647,
- step=1,
- randomize=True)
+ step=0.1,
+ )
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
eta = gr.Number(label="eta (DDIM)", value=0.0)
- a_prompt = gr.Textbox(
- label="Added Prompt",
- value="best quality, extremely detailed")
+ a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed")
n_prompt = gr.Textbox(
label="Negative Prompt",
value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
)
with gr.Column():
- result_gallery = gr.Gallery(
- label="Output", show_label=False, elem_id="gallery").style(
- grid=2, height="auto")
+ result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style(
+ grid=2, height="auto"
+ )
ips = [
input_image,
prompt,
diff --git a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py
index f94a1bebdee43..34910428889af 100644
--- a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py
+++ b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py
@@ -16,10 +16,14 @@
import os
import paddle
-from control import (ControlNet, ControlNetTrainer, DataArguments,
- Fill50kDataset, ModelArguments)
-from paddlenlp.trainer import (PdArgumentParser, TrainingArguments,
- get_last_checkpoint)
+from control import (
+ ControlNet,
+ ControlNetTrainer,
+ DataArguments,
+ Fill50kDataset,
+ ModelArguments,
+)
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint
from paddlenlp.utils.log import logger
@@ -29,15 +33,14 @@ def unfreeze_params(params):
def main():
- parser = PdArgumentParser(
- (ModelArguments, DataArguments, TrainingArguments))
+ parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# report to custom_visualdl
training_args.report_to = ["custom_visualdl"]
training_args.resolution = data_args.resolution
training_args.image_logging_steps = model_args.image_logging_steps = (
- math.ceil(model_args.image_logging_steps / training_args.logging_steps)
- * training_args.logging_steps)
+ math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps
+ )
training_args.print_config(model_args, "Model")
training_args.print_config(data_args, "Data")
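The image_logging_steps adjustment reflowed above rounds the requested interval up to the nearest multiple of logging_steps, so image logging always lands on a scalar-logging step. A small worked example of that arithmetic with illustrative values:

import math

logging_steps = 50
requested_image_logging_steps = 120

# Same expression as in the trainer: round up to a multiple of logging_steps.
image_logging_steps = math.ceil(requested_image_logging_steps / logging_steps) * logging_steps
print(image_logging_steps)  # 150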
@@ -45,16 +48,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -67,7 +68,8 @@ def main():
model=model,
args=training_args,
train_dataset=train_dataset,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
# must set recompute after trainer init
trainer.model.set_recompute(training_args.recompute)
@@ -76,7 +78,8 @@ def main():
trainer.model.controlnet.parameters(),
trainer.model.unet.up_blocks.parameters(),
trainer.model.unet.conv_norm_out.parameters(),
- trainer.model.unet.conv_out.parameters(), )
+ trainer.model.unet.conv_out.parameters(),
+ )
unfreeze_params(params_to_train)
else:
params_to_train = trainer.model.controlnet.parameters()
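When the extra-parameter branch above is taken, the controlnet weights plus the unet's up_blocks, conv_norm_out and conv_out parameters are gathered (the enclosing aggregation call sits outside this hunk) and handed to unfreeze_params, whose body is also not part of the patch. The following is only an assumption about what such a helper does in Paddle, where clearing stop_gradient is the analogue of requires_grad = True; itertools.chain is likewise an assumption:

import itertools
import paddle

def unfreeze_params(params):
    # Hypothetical sketch: mark every parameter as trainable.
    for param in params:
        param.stop_gradient = False

linear_a = paddle.nn.Linear(4, 4)
linear_b = paddle.nn.Linear(4, 4)
unfreeze_params(itertools.chain(linear_a.parameters(), linear_b.parameters()))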
diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth.py b/ppdiffusers/examples/dreambooth/train_dreambooth.py
index f9e184fbf53a3..1aedbd3d57952 100644
--- a/ppdiffusers/examples/dreambooth/train_dreambooth.py
+++ b/ppdiffusers/examples/dreambooth/train_dreambooth.py
@@ -29,10 +29,10 @@
import paddle.nn as nn
import paddle.nn.functional as F
from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from paddle.distributed.fleet.utils.hybrid_parallel_util import \
- fused_allreduce_gradients
-from paddle.io import (BatchSampler, DataLoader, Dataset,
- DistributedBatchSampler)
+from paddle.distributed.fleet.utils.hybrid_parallel_util import (
+ fused_allreduce_gradients,
+)
+from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler
from paddle.optimizer import AdamW
from paddle.vision import BaseTransform, transforms
from paddlenlp.trainer import set_seed
@@ -41,8 +41,13 @@
from PIL import Image
from tqdm.auto import tqdm
-from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline,
- UNet2DConditionModel, is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDPMScheduler,
+ DiffusionPipeline,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.models.modeling_utils import freeze_params, unwrap_model
from ppdiffusers.optimization import get_scheduler
from ppdiffusers.utils import check_min_version
@@ -52,8 +57,7 @@
def url_or_path_join(*path_list):
- return (os.path.join(*path_list)
- if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list))
+ return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)
class Lambda(BaseTransform):
@@ -65,11 +69,11 @@ def _apply_image(self, img):
return self.fn(img)
-def import_model_class_from_model_name_or_path(
- pretrained_model_name_or_path: str):
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
try:
text_encoder_config = PretrainedConfig.from_pretrained(
- url_or_path_join(pretrained_model_name_or_path, "text_encoder"))
+ url_or_path_join(pretrained_model_name_or_path, "text_encoder")
+ )
model_class = text_encoder_config.architectures[0]
except Exception:
model_class = "LDMBertModel"
@@ -78,8 +82,9 @@ def import_model_class_from_model_name_or_path(
return CLIPTextModel
elif model_class == "RobertaSeriesModelWithTransformation":
- from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \
- RobertaSeriesModelWithTransformation
+ from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+ RobertaSeriesModelWithTransformation,
+ )
return RobertaSeriesModelWithTransformation
elif model_class == "BertModel":
@@ -87,8 +92,9 @@ def import_model_class_from_model_name_or_path(
return BertModel
elif model_class == "LDMBertModel":
- from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
+ from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import (
+ LDMBertModel,
+ )
return LDMBertModel
else:
@@ -104,8 +110,7 @@ def fn(layer):
# unet
if hasattr(layer, "gradient_checkpointing"):
layer.gradient_checkpointing = value
- print("Set", layer.__class__, "recompute",
- layer.gradient_checkpointing)
+ print("Set", layer.__class__, "recompute", layer.gradient_checkpointing)
model.apply(fn)
@@ -125,8 +130,7 @@ def get_report_to(args):
def parse_args(input_args=None):
- parser = argparse.ArgumentParser(
- description="Simple example of a training dreambooth script.")
+ parser = argparse.ArgumentParser(description="Simple example of a training dreambooth script.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -145,19 +149,22 @@ def parse_args(input_args=None):
type=str,
default=None,
required=True,
- help="A folder containing the training data of instance images.", )
+ help="A folder containing the training data of instance images.",
+ )
parser.add_argument(
"--class_data_dir",
type=str,
default=None,
required=False,
- help="A folder containing the training data of class images.", )
+ help="A folder containing the training data of class images.",
+ )
parser.add_argument(
"--instance_prompt",
type=str,
default=None,
required=True,
- help="The prompt with identifier specifying the instance", )
+ help="The prompt with identifier specifying the instance",
+ )
parser.add_argument(
"--class_prompt",
type=str,
@@ -168,12 +175,14 @@ def parse_args(input_args=None):
"--with_prior_preservation",
default=False,
action="store_true",
- help="Flag to add prior preservation loss.", )
+ help="Flag to add prior preservation loss.",
+ )
parser.add_argument(
"--prior_loss_weight",
type=float,
default=1.0,
- help="The weight of prior preservation loss.", )
+ help="The weight of prior preservation loss.",
+ )
parser.add_argument(
"--num_class_images",
type=int,
@@ -181,39 +190,42 @@ def parse_args(input_args=None):
help=(
"Minimal class images for prior preservation loss. If there are not enough images already present in"
" class_data_dir, additional images will be sampled with class_prompt."
- ), )
+ ),
+ )
parser.add_argument(
"--output_dir",
type=str,
default="./dreambooth-model",
help="The output directory where the model predictions and checkpoints will be written.",
)
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training.")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--height",
type=int,
default=None,
help=(
"The height for input images, all the images in the train/validation dataset will be resized to this"
- " height"), )
+ " height"
+ ),
+ )
parser.add_argument(
"--width",
type=int,
default=None,
help=(
"The width for input images, all the images in the train/validation dataset will be resized to this"
- " width"), )
+ " width"
+ ),
+ )
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--center_crop",
default=False,
@@ -221,11 +233,13 @@ def parse_args(input_args=None):
help=(
"Whether to center crop the input images to the resolution. If not set, the images will be randomly"
" cropped. The images will be resized to the resolution first before cropping."
- ), )
+ ),
+ )
parser.add_argument(
"--random_flip",
action="store_true",
- help="whether to randomly flip images horizontally", )
+ help="whether to randomly flip images horizontally",
+ )
parser.add_argument(
"--train_text_encoder",
action="store_true",
@@ -235,12 +249,14 @@ def parse_args(input_args=None):
"--train_batch_size",
type=int,
default=4,
- help="Batch size (per device) for the training dataloader.", )
+ help="Batch size (per device) for the training dataloader.",
+ )
parser.add_argument(
"--sample_batch_size",
type=int,
default=4,
- help="Batch size (per device) for sampling images.", )
+ help="Batch size (per device) for sampling images.",
+ )
parser.add_argument("--num_train_epochs", type=int, default=1)
parser.add_argument(
"--max_train_steps",
@@ -277,12 +293,15 @@ def parse_args(input_args=None):
default="constant",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'), )
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
parser.add_argument(
"--lr_warmup_steps",
type=int,
default=500,
- help="Number of steps for the warmup in the lr scheduler.", )
+ help="Number of steps for the warmup in the lr scheduler.",
+ )
parser.add_argument(
"--lr_num_cycles",
type=int,
@@ -293,45 +312,47 @@ def parse_args(input_args=None):
"--lr_power",
type=float,
default=1.0,
- help="Power factor of the polynomial scheduler.", )
+ help="Power factor of the polynomial scheduler.",
+ )
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help=(
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ), )
+ ),
+ )
parser.add_argument(
"--adam_beta1",
type=float,
default=0.9,
- help="The beta1 parameter for the Adam optimizer.", )
+ help="The beta1 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_beta2",
type=float,
default=0.999,
- help="The beta2 parameter for the Adam optimizer.", )
- parser.add_argument(
- "--adam_weight_decay",
- type=float,
- default=1e-2,
- help="Weight decay to use.")
+ help="The beta2 parameter for the Adam optimizer.",
+ )
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
parser.add_argument(
"--adam_epsilon",
type=float,
default=1e-08,
- help="Epsilon value for the Adam optimizer", )
- parser.add_argument(
- "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ help="Epsilon value for the Adam optimizer",
+ )
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--push_to_hub",
action="store_true",
- help="Whether or not to push the model to the Hub.", )
+ help="Whether or not to push the model to the Hub.",
+ )
parser.add_argument(
"--hub_token",
type=str,
default=None,
- help="The token to use to push to the Model Hub.", )
+ help="The token to use to push to the Model Hub.",
+ )
parser.add_argument(
"--hub_model_id",
type=str,
@@ -344,27 +365,28 @@ def parse_args(input_args=None):
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to"
- "*output_dir/logs"), )
+ "*output_dir/logs"
+ ),
+ )
parser.add_argument(
"--report_to",
type=str,
default="visualdl",
choices=["tensorboard", "visualdl"],
- help="Log writer type.", )
+ help="Log writer type.",
+ )
parser.add_argument(
"--checkpointing_steps",
type=int,
default=500,
- help=("Save a checkpoint of the training state every X updates."), )
+ help=("Save a checkpoint of the training state every X updates."),
+ )
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether or not to use xformers.", )
- parser.add_argument(
- "--noise_offset",
- type=float,
- default=1.0,
- help="The scale of noise offset.")
+ help="Whether or not to use xformers.",
+ )
+ parser.add_argument("--noise_offset", type=float, default=1.0, help="The scale of noise offset.")
if input_args is not None:
args = parser.parse_args(input_args)
@@ -376,20 +398,15 @@ def parse_args(input_args=None):
if args.with_prior_preservation:
if args.class_data_dir is None:
- raise ValueError(
- "You must specify a data directory for class images.")
+ raise ValueError("You must specify a data directory for class images.")
if args.class_prompt is None:
raise ValueError("You must specify prompt for class images.")
else:
# logger is not available yet
if args.class_data_dir is not None:
- warnings.warn(
- "You need not use --class_data_dir without --with_prior_preservation."
- )
+ warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
if args.class_prompt is not None:
- warnings.warn(
- "You need not use --class_prompt without --with_prior_preservation."
- )
+ warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
args.logging_dir = os.path.join(args.output_dir, args.logging_dir)
if args.height is None or args.width is None and args.resolution is not None:
@@ -405,18 +422,19 @@ class DreamBoothDataset(Dataset):
"""
def __init__(
- self,
- instance_data_root,
- instance_prompt,
- tokenizer,
- class_data_root=None,
- class_prompt=None,
- class_num=None,
- height=512,
- width=512,
- center_crop=False,
- interpolation="bilinear",
- random_flip=False, ):
+ self,
+ instance_data_root,
+ instance_prompt,
+ tokenizer,
+ class_data_root=None,
+ class_prompt=None,
+ class_num=None,
+ height=512,
+ width=512,
+ center_crop=False,
+ interpolation="bilinear",
+ random_flip=False,
+ ):
self.height = height
self.width = width
self.center_crop = center_crop
@@ -442,8 +460,7 @@ def __init__(
if any(suffix in p.name for suffix in ext):
self.class_images_path.append(p)
if class_num is not None:
- self.num_class_images = min(
- len(self.class_images_path), class_num)
+ self.num_class_images = min(len(self.class_images_path), class_num)
else:
self.num_class_images = len(self.class_images_path)
self._length = max(self.num_class_images, self.num_instance_images)
@@ -451,24 +468,22 @@ def __init__(
else:
self.class_data_root = None
- self.image_transforms = transforms.Compose([
- transforms.Resize(
- (height, width), interpolation=interpolation),
- transforms.CenterCrop((height, width))
- if center_crop else transforms.RandomCrop((height, width)),
- transforms.RandomHorizontalFlip()
- if random_flip else Lambda(lambda x: x),
- transforms.ToTensor(),
- transforms.Normalize([0.5], [0.5]),
- ])
+ self.image_transforms = transforms.Compose(
+ [
+ transforms.Resize((height, width), interpolation=interpolation),
+ transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)),
+ transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5], [0.5]),
+ ]
+ )
def __len__(self):
return self._length
def __getitem__(self, index):
example = {}
- instance_image = Image.open(self.instance_images_path[
- index % self.num_instance_images])
+ instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
@@ -477,11 +492,11 @@ def __getitem__(self, index):
padding="do_not_pad",
truncation=True,
max_length=self.tokenizer.model_max_length,
- return_attention_mask=False, ).input_ids
+ return_attention_mask=False,
+ ).input_ids
if self.class_data_root:
- class_image = Image.open(self.class_images_path[
- index % self.num_class_images])
+ class_image = Image.open(self.class_images_path[index % self.num_class_images])
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
@@ -490,7 +505,8 @@ def __getitem__(self, index):
padding="do_not_pad",
truncation=True,
max_length=self.tokenizer.model_max_length,
- return_attention_mask=False, ).input_ids
+ return_attention_mask=False,
+ ).input_ids
return example
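Because __len__ returns max(num_class_images, num_instance_images), the modulo indexing reflowed above lets the shorter image list wrap around while the longer one is traversed once per epoch. A tiny illustration of that cycling (plain Python, no dataset needed):

num_instance_images = 5    # e.g. a handful of subject photos
num_class_images = 200     # prior-preservation images
length = max(num_class_images, num_instance_images)

# Index pattern used by __getitem__ for the instance images:
instance_indices = [index % num_instance_images for index in range(length)]
print(instance_indices[:7])  # [0, 1, 2, 3, 4, 0, 1]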
@@ -512,9 +528,7 @@ def __getitem__(self, index):
return example
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -547,45 +561,43 @@ def main():
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
safety_checker=None,
- requires_safety_checker=False, )
- if (args.enable_xformers_memory_efficient_attention and
- is_ppxformers_available()):
+ requires_safety_checker=False,
+ )
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
pipeline.unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
pipeline.set_progress_bar_config(disable=True)
num_new_images = args.num_class_images - cur_class_images
logger.info(f"Number of class images to sample: {num_new_images}.")
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
- batch_sampler = (DistributedBatchSampler(
- sample_dataset,
- batch_size=args.sample_batch_size,
- shuffle=False) if num_processes > 1 else BatchSampler(
- sample_dataset,
- batch_size=args.sample_batch_size,
- shuffle=False))
+ batch_sampler = (
+ DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False)
+ if num_processes > 1
+ else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False)
+ )
sample_dataloader = DataLoader(
sample_dataset,
batch_sampler=batch_sampler,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
for example in tqdm(
- sample_dataloader,
- desc="Generating class images",
- disable=not is_main_process, ):
+ sample_dataloader,
+ desc="Generating class images",
+ disable=not is_main_process,
+ ):
images = pipeline(example["prompt"]).images
for i, image in enumerate(images):
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
- image_filename = (
- class_images_dir /
- f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
- )
+ image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
image.save(image_filename)
pipeline.to("cpu")
del pipeline
@@ -597,17 +609,14 @@ def main():
if args.push_to_hub:
if args.hub_model_id is None:
- repo_name = get_full_repo_name(
- Path(args.output_dir).name, token=args.hub_token)
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
create_repo(repo_name, exist_ok=True, token=args.hub_token)
- repo = Repository(
- args.output_dir, clone_from=repo_name, token=args.hub_token)
+ repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
- with open(os.path.join(args.output_dir, ".gitignore"),
- "w+") as gitignore:
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
@@ -617,30 +626,26 @@ def main():
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
elif args.pretrained_model_name_or_path:
- tokenizer = AutoTokenizer.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
+ tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
# import correct text encoder class
- text_encoder_cls = import_model_class_from_model_name_or_path(
- args.pretrained_model_name_or_path)
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
# Load scheduler and models
- noise_scheduler = DDPMScheduler.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="scheduler")
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
text_encoder = text_encoder_cls.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "text_encoder"))
- text_config = (text_encoder.config if isinstance(text_encoder.config, dict)
- else text_encoder.config.to_dict())
- if (text_config.get("use_attention_mask", None) is not None and
- text_config["use_attention_mask"]):
+ url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")
+ )
+ text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict()
+ if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]:
use_attention_mask = True
else:
use_attention_mask = False
- vae = AutoencoderKL.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="vae")
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path,
- subfolder="unet", )
+ subfolder="unet",
+ )
freeze_params(vae.parameters())
if not args.train_text_encoder:
@@ -650,21 +655,20 @@ def main():
if args.train_text_encoder:
set_recompute(text_encoder, True)
- if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(
- ):
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
# Dataset and DataLoaders creation:
train_dataset = DreamBoothDataset(
instance_data_root=args.instance_data_dir,
instance_prompt=args.instance_prompt,
- class_data_root=args.class_data_dir
- if args.with_prior_preservation else None,
+ class_data_root=args.class_data_dir if args.with_prior_preservation else None,
class_prompt=args.class_prompt,
class_num=args.num_class_images,
tokenizer=tokenizer,
@@ -672,7 +676,8 @@ def main():
width=args.width,
center_crop=args.center_crop,
interpolation="bilinear",
- random_flip=args.random_flip, )
+ random_flip=args.random_flip,
+ )
def collate_fn(examples):
input_ids = [example["instance_prompt_ids"] for example in examples]
@@ -687,38 +692,35 @@ def collate_fn(examples):
pixel_values = paddle.stack(pixel_values).astype("float32")
input_ids = tokenizer.pad(
- {
- "input_ids": input_ids
- },
+ {"input_ids": input_ids},
padding="max_length",
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
return {
"input_ids": input_ids,
"pixel_values": pixel_values,
}
- train_sampler = (DistributedBatchSampler(
- train_dataset, batch_size=args.train_batch_size, shuffle=True)
- if num_processes > 1 else BatchSampler(
- train_dataset,
- batch_size=args.train_batch_size,
- shuffle=True))
+ train_sampler = (
+ DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ if num_processes > 1
+ else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ )
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
# Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps)
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps /
- num_update_steps_per_epoch)
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
if num_processes > 1:
unet = paddle.DataParallel(unet)
@@ -726,23 +728,22 @@ def collate_fn(examples):
text_encoder = paddle.DataParallel(text_encoder)
params_to_optimize = (
- list(unet.parameters()) + list(text_encoder.parameters())
- if args.train_text_encoder else unet.parameters())
+ list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+ )
if args.scale_lr:
- args.learning_rate = (args.learning_rate *
- args.gradient_accumulation_steps *
- args.train_batch_size * num_processes)
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes
+ )
lr_scheduler = get_scheduler(
args.lr_scheduler,
learning_rate=args.learning_rate,
- num_warmup_steps=args.lr_warmup_steps *
- args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps *
- args.gradient_accumulation_steps,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
num_cycles=args.lr_num_cycles,
- power=args.lr_power, )
+ power=args.lr_power,
+ )
# Initialize the optimizer
optimizer = AdamW(
learning_rate=lr_scheduler,
@@ -751,8 +752,8 @@ def collate_fn(examples):
beta2=args.adam_beta2,
weight_decay=args.adam_weight_decay,
epsilon=args.adam_epsilon,
- grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
- if args.max_grad_norm > 0 else None, )
+ grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None,
+ )
if is_main_process:
logger.info("----------- Configuration Arguments -----------")
@@ -762,25 +763,19 @@ def collate_fn(examples):
writer = get_report_to(args)
# Train!
- total_batch_size = (args.train_batch_size * num_processes *
- args.gradient_accumulation_steps)
+ total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not is_main_process)
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process)
progress_bar.set_description("Train Steps")
global_step = 0
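The logging block above reports an effective batch size of train_batch_size * num_processes * gradient_accumulation_steps, and the earlier scale_lr branch multiplies the learning rate by the same three factors. A worked example with illustrative values:

train_batch_size = 4
num_processes = 2                 # e.g. two GPUs
gradient_accumulation_steps = 8

total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps
print(total_batch_size)  # 64

# With --scale_lr, a base learning rate is scaled by the same factors:
base_learning_rate = 5e-6
scaled = base_learning_rate * gradient_accumulation_steps * train_batch_size * num_processes
print(scaled)  # 0.00032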
@@ -803,22 +798,24 @@ def collate_fn(examples):
if args.noise_offset:
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
noise += args.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1),
- dtype=latents.dtype)
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype
+ )
batch_size = latents.shape[0]
# Sample a random timestep for each image
timesteps = paddle.randint(
0,
noise_scheduler.config.num_train_timesteps,
- (batch_size, ),
- dtype="int64", )
+ (batch_size,),
+ dtype="int64",
+ )
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
- if num_processes > 1 and (args.gradient_checkpointing or (
- (step + 1) % args.gradient_accumulation_steps != 0)):
+ if num_processes > 1 and (
+ args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0)
+ ):
# grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0:
# gradient_checkpointing, no_sync every where
# gradient_checkpointing + grad_acc, no_sync every where
@@ -826,55 +823,45 @@ def collate_fn(examples):
if args.train_text_encoder:
text_encoder_ctx_manager = text_encoder.no_sync()
else:
- text_encoder_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7)
- else contextlib.suppress())
+ text_encoder_ctx_manager = (
+ contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ )
else:
- unet_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
- text_encoder_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ text_encoder_ctx_manager = (
+ contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ )
with text_encoder_ctx_manager:
# Get the text embedding for conditioning
if use_attention_mask:
- attention_mask = (batch["input_ids"] !=
- tokenizer.pad_token_id).cast("int64")
+ attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
else:
attention_mask = None
- encoder_hidden_states = text_encoder(
- batch["input_ids"], attention_mask=attention_mask)[0]
+ encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0]
with unet_ctx_manager:
# Predict the noise residual / sample
- model_pred = unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise,
- timesteps)
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {noise_scheduler.config.prediction_type}"
- )
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
if args.with_prior_preservation:
# Chunk the noise and model_pred into two parts and compute the loss on each part separately.
- model_pred, model_pred_prior = model_pred.chunk(
- 2, axis=0)
+ model_pred, model_pred_prior = model_pred.chunk(2, axis=0)
target, target_prior = target.chunk(2, axis=0)
# Compute instance loss
loss = F.mse_loss(model_pred, target, reduction="mean")
# Compute prior loss
- prior_loss = F.mse_loss(
- model_pred_prior, target_prior, reduction="mean")
+ prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean")
# Add the prior loss to the instance loss.
loss = loss + args.prior_loss_weight * prior_loss
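With prior preservation enabled, each batch stacks instance and class examples, so the prediction and the target are split in half before the two MSE terms are combined as loss + prior_loss_weight * prior_loss. A minimal sketch of that split with random tensors (the shapes are illustrative, not taken from the script):

import paddle
import paddle.nn.functional as F

prior_loss_weight = 1.0
model_pred = paddle.randn([4, 4, 64, 64])  # first half: instance latents, second half: class latents
target = paddle.randn([4, 4, 64, 64])

model_pred, model_pred_prior = model_pred.chunk(2, axis=0)
target, target_prior = target.chunk(2, axis=0)

loss = F.mse_loss(model_pred, target, reduction="mean")
prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean")
loss = loss + prior_loss_weight * prior_loss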
@@ -908,13 +895,10 @@ def collate_fn(examples):
writer.add_scalar(f"train/{name}", val, global_step)
if global_step % args.checkpointing_steps == 0:
- save_path = os.path.join(args.output_dir,
- f"checkpoint-{global_step}")
- unwrap_model(unet).save_pretrained(
- os.path.join(save_path, "unet"))
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+ unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet"))
if args.train_text_encoder:
- unwrap_model(text_encoder).save_pretrained(
- os.path.join(save_path, "text_encoder"))
+ unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder"))
if global_step >= args.max_train_steps:
break
@@ -926,14 +910,12 @@ def collate_fn(examples):
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
unet=unwrap_model(unet),
- text_encoder=unwrap_model(text_encoder), )
+ text_encoder=unwrap_model(text_encoder),
+ )
pipeline.save_pretrained(args.output_dir)
if args.push_to_hub:
- repo.push_to_hub(
- commit_message="End of training",
- blocking=False,
- auto_lfs_prune=True)
+ repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
if __name__ == "__main__":
diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py
index b8837db5fb804..b36bc8b8f2130 100644
--- a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py
+++ b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py
@@ -32,10 +32,10 @@
import paddle.nn.functional as F
import requests
from huggingface_hub import HfFolder, create_repo, upload_folder, whoami
-from paddle.distributed.fleet.utils.hybrid_parallel_util import \
- fused_allreduce_gradients
-from paddle.io import (BatchSampler, DataLoader, Dataset,
- DistributedBatchSampler)
+from paddle.distributed.fleet.utils.hybrid_parallel_util import (
+ fused_allreduce_gradients,
+)
+from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler
from paddle.optimizer import AdamW
from paddle.vision import BaseTransform, transforms
from paddlenlp.trainer import set_seed
@@ -44,12 +44,21 @@
from PIL import Image
from tqdm.auto import tqdm
-from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline,
- DPMSolverMultistepScheduler, UNet2DConditionModel,
- is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDPMScheduler,
+ DiffusionPipeline,
+ DPMSolverMultistepScheduler,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin
from ppdiffusers.models.attention_processor import (
- AttnProcessor, AttnProcessor2_5, LoRAAttnProcessor, LoRAAttnProcessor2_5)
+ AttnProcessor,
+ AttnProcessor2_5,
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_5,
+)
from ppdiffusers.optimization import get_scheduler
from ppdiffusers.training_utils import freeze_params, unwrap_model
from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, check_min_version
@@ -62,14 +71,14 @@
def _retry(
- func,
- func_args: Optional[tuple]=None,
- func_kwargs: Optional[dict]=None,
- exceptions: Type[requests.exceptions.RequestException]=requests.
- exceptions.RequestException,
- max_retries: int=0,
- base_wait_time: float=0.5,
- max_wait_time: float=2, ):
+ func,
+ func_args: Optional[tuple] = None,
+ func_kwargs: Optional[dict] = None,
+ exceptions: Type[requests.exceptions.RequestException] = requests.exceptions.RequestException,
+ max_retries: int = 0,
+ base_wait_time: float = 0.5,
+ max_wait_time: float = 2,
+):
func_args = func_args or ()
func_kwargs = func_kwargs or {}
retry = 0
@@ -80,27 +89,24 @@ def _retry(
if retry >= max_retries:
raise err
else:
- sleep_time = min(max_wait_time, base_wait_time * 2
- **retry) # Exponential backoff
- logger.info(
- f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]"
- )
+ sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
+ logger.info(f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]")
time.sleep(sleep_time)
retry += 1
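The retry loop above sleeps for base_wait_time * 2**retry seconds, capped at max_wait_time, between attempts (and only retries at all if max_retries is raised above its default of 0). With the defaults in the signature, the successive waits would be:

base_wait_time = 0.5
max_wait_time = 2

waits = [min(max_wait_time, base_wait_time * 2**retry) for retry in range(5)]
print(waits)  # [0.5, 1.0, 2, 2, 2]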
def url_or_path_join(*path_list):
- return (os.path.join(*path_list)
- if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list))
+ return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)
def save_model_card(
- repo_id: str,
- images=None,
- base_model=str,
- train_text_encoder=False,
- prompt=str,
- repo_folder=None, ):
+ repo_id: str,
+ images=None,
+ base_model=str,
+ train_text_encoder=False,
+ prompt=str,
+ repo_folder=None,
+):
img_str = ""
for i, image in enumerate(images):
image.save(os.path.join(repo_folder, f"image_{i}.png"))
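url_or_path_join, reflowed earlier in this hunk, falls back to a plain "/" join whenever the combined path is not an existing local directory, which keeps Hub-style model ids usable as remote paths. For instance, with no such directory on disk (illustrative values):

import os

def url_or_path_join(*path_list):
    return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)

print(url_or_path_join("runwayml/stable-diffusion-v1-5", "text_encoder"))
# runwayml/stable-diffusion-v1-5/text_encoder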
@@ -133,11 +139,11 @@ def save_model_card(
f.write(yaml + model_card)
-def import_model_class_from_model_name_or_path(
- pretrained_model_name_or_path: str):
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
try:
text_encoder_config = PretrainedConfig.from_pretrained(
- url_or_path_join(pretrained_model_name_or_path, "text_encoder"))
+ url_or_path_join(pretrained_model_name_or_path, "text_encoder")
+ )
model_class = text_encoder_config.architectures[0]
except Exception:
model_class = "LDMBertModel"
@@ -146,8 +152,9 @@ def import_model_class_from_model_name_or_path(
return CLIPTextModel
elif model_class == "RobertaSeriesModelWithTransformation":
- from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \
- RobertaSeriesModelWithTransformation
+ from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+ RobertaSeriesModelWithTransformation,
+ )
return RobertaSeriesModelWithTransformation
elif model_class == "BertModel":
@@ -155,8 +162,9 @@ def import_model_class_from_model_name_or_path(
return BertModel
elif model_class == "LDMBertModel":
- from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
+ from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import (
+ LDMBertModel,
+ )
return LDMBertModel
else:
@@ -187,8 +195,7 @@ def get_report_to(args):
def parse_args(input_args=None):
- parser = argparse.ArgumentParser(
- description="Simple example of a training dreambooth lora script.")
+ parser = argparse.ArgumentParser(description="Simple example of a training dreambooth lora script.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -207,19 +214,22 @@ def parse_args(input_args=None):
type=str,
default=None,
required=True,
- help="A folder containing the training data of instance images.", )
+ help="A folder containing the training data of instance images.",
+ )
parser.add_argument(
"--class_data_dir",
type=str,
default=None,
required=False,
- help="A folder containing the training data of class images.", )
+ help="A folder containing the training data of class images.",
+ )
parser.add_argument(
"--instance_prompt",
type=str,
default=None,
required=True,
- help="The prompt with identifier specifying the instance", )
+ help="The prompt with identifier specifying the instance",
+ )
parser.add_argument(
"--class_prompt",
type=str,
@@ -230,7 +240,8 @@ def parse_args(input_args=None):
"--validation_prompt",
type=str,
default=None,
- help="A prompt that is sampled during training for inference.", )
+ help="A prompt that is sampled during training for inference.",
+ )
parser.add_argument(
"--num_validation_images",
type=int,
@@ -244,17 +255,20 @@ def parse_args(input_args=None):
help=(
"Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
" `args.validation_prompt` multiple times: `args.num_validation_images`."
- ), )
+ ),
+ )
parser.add_argument(
"--with_prior_preservation",
default=False,
action="store_true",
- help="Flag to add prior preservation loss.", )
+ help="Flag to add prior preservation loss.",
+ )
parser.add_argument(
"--prior_loss_weight",
type=float,
default=1.0,
- help="The weight of prior preservation loss.", )
+ help="The weight of prior preservation loss.",
+ )
parser.add_argument(
"--num_class_images",
type=int,
@@ -262,44 +276,48 @@ def parse_args(input_args=None):
help=(
"Minimal class images for prior preservation loss. If there are not enough images already present in"
" class_data_dir, additional images will be sampled with class_prompt."
- ), )
+ ),
+ )
parser.add_argument(
"--output_dir",
type=str,
default="lora-dreambooth-model",
help="The output directory where the model predictions and checkpoints will be written.",
)
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training.")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--height",
type=int,
default=None,
help=(
"The height for input images, all the images in the train/validation dataset will be resized to this"
- " height"), )
+ " height"
+ ),
+ )
parser.add_argument(
"--width",
type=int,
default=None,
help=(
"The width for input images, all the images in the train/validation dataset will be resized to this"
- " width"), )
+ " width"
+ ),
+ )
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--lora_rank",
type=int,
default=4,
- help="The rank of lora linear.", )
+ help="The rank of lora linear.",
+ )
parser.add_argument(
"--center_crop",
default=False,
@@ -307,16 +325,19 @@ def parse_args(input_args=None):
help=(
"Whether to center crop the input images to the resolution. If not set, the images will be randomly"
" cropped. The images will be resized to the resolution first before cropping."
- ), )
+ ),
+ )
parser.add_argument(
"--random_flip",
action="store_true",
- help="whether to randomly flip images horizontally", )
+ help="whether to randomly flip images horizontally",
+ )
parser.add_argument(
"--train_batch_size",
type=int,
default=4,
- help="Batch size (per device) for the training dataloader.", )
+ help="Batch size (per device) for the training dataloader.",
+ )
parser.add_argument(
"--train_text_encoder",
action="store_true",
@@ -326,7 +347,8 @@ def parse_args(input_args=None):
"--sample_batch_size",
type=int,
default=4,
- help="Batch size (per device) for sampling images.", )
+ help="Batch size (per device) for sampling images.",
+ )
parser.add_argument("--num_train_epochs", type=int, default=1)
parser.add_argument(
"--max_train_steps",
@@ -338,7 +360,8 @@ def parse_args(input_args=None):
"--checkpointing_steps",
type=int,
default=500,
- help=("Save a checkpoint of the training state every X updates."), )
+ help=("Save a checkpoint of the training state every X updates."),
+ )
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
@@ -368,12 +391,15 @@ def parse_args(input_args=None):
default="constant",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'), )
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
parser.add_argument(
"--lr_warmup_steps",
type=int,
default=500,
- help="Number of steps for the warmup in the lr scheduler.", )
+ help="Number of steps for the warmup in the lr scheduler.",
+ )
parser.add_argument(
"--lr_num_cycles",
type=int,
@@ -384,45 +410,47 @@ def parse_args(input_args=None):
"--lr_power",
type=float,
default=1.0,
- help="Power factor of the polynomial scheduler.", )
+ help="Power factor of the polynomial scheduler.",
+ )
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help=(
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ), )
+ ),
+ )
parser.add_argument(
"--adam_beta1",
type=float,
default=0.9,
- help="The beta1 parameter for the Adam optimizer.", )
+ help="The beta1 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_beta2",
type=float,
default=0.999,
- help="The beta2 parameter for the Adam optimizer.", )
- parser.add_argument(
- "--adam_weight_decay",
- type=float,
- default=1e-2,
- help="Weight decay to use.")
+ help="The beta2 parameter for the Adam optimizer.",
+ )
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
parser.add_argument(
"--adam_epsilon",
type=float,
default=1e-08,
- help="Epsilon value for the Adam optimizer", )
- parser.add_argument(
- "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ help="Epsilon value for the Adam optimizer",
+ )
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--push_to_hub",
action="store_true",
- help="Whether or not to push the model to the Hub.", )
+ help="Whether or not to push the model to the Hub.",
+ )
parser.add_argument(
"--hub_token",
type=str,
default=None,
- help="The token to use to push to the Model Hub.", )
+ help="The token to use to push to the Model Hub.",
+ )
parser.add_argument(
"--hub_model_id",
type=str,
@@ -435,22 +463,22 @@ def parse_args(input_args=None):
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to"
- "*output_dir/logs"), )
+ "*output_dir/logs"
+ ),
+ )
parser.add_argument(
"--report_to",
type=str,
default="visualdl",
choices=["tensorboard", "visualdl"],
- help="Log writer type.", )
+ help="Log writer type.",
+ )
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether or not to use xformers.", )
- parser.add_argument(
- "--noise_offset",
- type=float,
- default=0,
- help="The scale of noise offset.")
+ help="Whether or not to use xformers.",
+ )
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
if input_args is not None:
args = parser.parse_args(input_args)
@@ -462,20 +490,15 @@ def parse_args(input_args=None):
if args.with_prior_preservation:
if args.class_data_dir is None:
- raise ValueError(
- "You must specify a data directory for class images.")
+ raise ValueError("You must specify a data directory for class images.")
if args.class_prompt is None:
raise ValueError("You must specify prompt for class images.")
else:
# logger is not available yet
if args.class_data_dir is not None:
- warnings.warn(
- "You need not use --class_data_dir without --with_prior_preservation."
- )
+ warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
if args.class_prompt is not None:
- warnings.warn(
- "You need not use --class_prompt without --with_prior_preservation."
- )
+ warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
args.logging_dir = os.path.join(args.output_dir, args.logging_dir)
if args.height is None or args.width is None and args.resolution is not None:
@@ -491,18 +514,19 @@ class DreamBoothDataset(Dataset):
"""
def __init__(
- self,
- instance_data_root,
- instance_prompt,
- tokenizer,
- class_data_root=None,
- class_prompt=None,
- class_num=None,
- height=512,
- width=512,
- center_crop=False,
- interpolation="bilinear",
- random_flip=False, ):
+ self,
+ instance_data_root,
+ instance_prompt,
+ tokenizer,
+ class_data_root=None,
+ class_prompt=None,
+ class_num=None,
+ height=512,
+ width=512,
+ center_crop=False,
+ interpolation="bilinear",
+ random_flip=False,
+ ):
self.height = height
self.width = width
self.center_crop = center_crop
@@ -528,8 +552,7 @@ def __init__(
if any(suffix in p.name for suffix in ext):
self.class_images_path.append(p)
if class_num is not None:
- self.num_class_images = min(
- len(self.class_images_path), class_num)
+ self.num_class_images = min(len(self.class_images_path), class_num)
else:
self.num_class_images = len(self.class_images_path)
self._length = max(self.num_class_images, self.num_instance_images)
@@ -537,24 +560,22 @@ def __init__(
else:
self.class_data_root = None
- self.image_transforms = transforms.Compose([
- transforms.Resize(
- (height, width), interpolation=interpolation),
- transforms.CenterCrop((height, width))
- if center_crop else transforms.RandomCrop((height, width)),
- transforms.RandomHorizontalFlip()
- if random_flip else Lambda(lambda x: x),
- transforms.ToTensor(),
- transforms.Normalize([0.5], [0.5]),
- ])
+ self.image_transforms = transforms.Compose(
+ [
+ transforms.Resize((height, width), interpolation=interpolation),
+ transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)),
+ transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5], [0.5]),
+ ]
+ )
def __len__(self):
return self._length
def __getitem__(self, index):
example = {}
- instance_image = Image.open(self.instance_images_path[
- index % self.num_instance_images])
+ instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
@@ -563,11 +584,11 @@ def __getitem__(self, index):
padding="do_not_pad",
truncation=True,
max_length=self.tokenizer.model_max_length,
- return_attention_mask=False, ).input_ids
+ return_attention_mask=False,
+ ).input_ids
if self.class_data_root:
- class_image = Image.open(self.class_images_path[
- index % self.num_class_images])
+ class_image = Image.open(self.class_images_path[index % self.num_class_images])
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
@@ -576,7 +597,8 @@ def __getitem__(self, index):
padding="do_not_pad",
truncation=True,
max_length=self.tokenizer.model_max_length,
- return_attention_mask=False, ).input_ids
+ return_attention_mask=False,
+ ).input_ids
return example
@@ -598,9 +620,7 @@ def __getitem__(self, index):
return example
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -633,45 +653,43 @@ def main():
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
safety_checker=None,
- requires_safety_checker=False, )
- if (args.enable_xformers_memory_efficient_attention and
- is_ppxformers_available()):
+ requires_safety_checker=False,
+ )
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
pipeline.unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warning(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
pipeline.set_progress_bar_config(disable=True)
num_new_images = args.num_class_images - cur_class_images
logger.info(f"Number of class images to sample: {num_new_images}.")
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
- batch_sampler = (DistributedBatchSampler(
- sample_dataset,
- batch_size=args.sample_batch_size,
- shuffle=False) if num_processes > 1 else BatchSampler(
- sample_dataset,
- batch_size=args.sample_batch_size,
- shuffle=False))
+ batch_sampler = (
+ DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False)
+ if num_processes > 1
+ else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False)
+ )
sample_dataloader = DataLoader(
sample_dataset,
batch_sampler=batch_sampler,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
for example in tqdm(
- sample_dataloader,
- desc="Generating class images",
- disable=not is_main_process, ):
+ sample_dataloader,
+ desc="Generating class images",
+ disable=not is_main_process,
+ ):
images = pipeline(example["prompt"]).images
for i, image in enumerate(images):
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
- image_filename = (
- class_images_dir /
- f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
- )
+ image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
image.save(image_filename)
pipeline.to("cpu")
del pipeline
@@ -687,53 +705,50 @@ def main():
elif args.pretrained_model_name_or_path:
try:
tokenizer = AutoTokenizer.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path,
- "tokenizer"))
+ url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")
+ )
except KeyError as e:
if "XLMRobertaTokenizer" in str(e):
from paddlenlp.transformers import XLMRobertaTokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path,
- "tokenizer"))
+ url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")
+ )
else:
raise e
# import correct text encoder class
- text_encoder_cls = import_model_class_from_model_name_or_path(
- args.pretrained_model_name_or_path)
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
# Load scheduler and models
- noise_scheduler = DDPMScheduler.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="scheduler")
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
text_encoder = text_encoder_cls.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "text_encoder"))
- text_config = (text_encoder.config if isinstance(text_encoder.config, dict)
- else text_encoder.config.to_dict())
- if (text_config.get("use_attention_mask", None) is not None and
- text_config["use_attention_mask"]):
+ url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")
+ )
+ text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict()
+ if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]:
use_attention_mask = True
else:
use_attention_mask = False
- vae = AutoencoderKL.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="vae")
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path,
- subfolder="unet", )
+ subfolder="unet",
+ )
# We only train the additional adapter LoRA layers
freeze_params(vae.parameters())
freeze_params(text_encoder.parameters())
freeze_params(unet.parameters())
- if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(
- ):
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warning(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
# now we will add new LoRA weights to the attention layers
# It's important to realize here how many attention weights will be added and of which sizes
# The sizes of the attention layers consist only of two different variables:
@@ -750,14 +765,12 @@ def main():
# Set correct lora layers
unet_lora_attn_procs = {}
for name, attn_processor in unet.attn_processors.items():
- cross_attention_dim = (None if name.endswith("attn1.processor") else
- unet.config.cross_attention_dim)
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(unet.config.block_out_channels))[
- block_id]
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
@@ -767,14 +780,13 @@ def main():
elif isinstance(attn_processor, AttnProcessor2_5):
lora_attn_processor_class = LoRAAttnProcessor2_5
else:
- raise ValueError(
- f"Unknown attention processor type: {attn_processor.__class__.__name__}"
- )
+ raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}")
unet_lora_attn_procs[name] = lora_attn_processor_class(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
- rank=args.lora_rank, )
+ rank=args.lora_rank,
+ )
unet.set_attn_processor(unet_lora_attn_procs)
unet_lora_layers = AttnProcsLayers(unet.attn_processors)
@@ -790,10 +802,12 @@ def main():
text_lora_attn_procs[name] = LoRAAttnProcessor(
hidden_size=module.out_proj.weight.shape[1],
cross_attention_dim=None,
- rank=args.lora_rank, )
+ rank=args.lora_rank,
+ )
text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
temp_pipeline = DiffusionPipeline.from_pretrained(
- args.pretrained_model_name_or_path, text_encoder=text_encoder)
+ args.pretrained_model_name_or_path, text_encoder=text_encoder
+ )
temp_pipeline._modify_text_encoder(text_lora_attn_procs)
text_encoder = temp_pipeline.text_encoder
del temp_pipeline
@@ -802,8 +816,7 @@ def main():
train_dataset = DreamBoothDataset(
instance_data_root=args.instance_data_dir,
instance_prompt=args.instance_prompt,
- class_data_root=args.class_data_dir
- if args.with_prior_preservation else None,
+ class_data_root=args.class_data_dir if args.with_prior_preservation else None,
class_prompt=args.class_prompt,
class_num=args.num_class_images,
tokenizer=tokenizer,
@@ -811,7 +824,8 @@ def main():
width=args.width,
center_crop=args.center_crop,
interpolation="bilinear",
- random_flip=args.random_flip, )
+ random_flip=args.random_flip,
+ )
def collate_fn(examples):
input_ids = [example["instance_prompt_ids"] for example in examples]
@@ -826,58 +840,55 @@ def collate_fn(examples):
pixel_values = paddle.stack(pixel_values).astype("float32")
input_ids = tokenizer.pad(
- {
- "input_ids": input_ids
- },
+ {"input_ids": input_ids},
padding="max_length",
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
return {
"input_ids": input_ids,
"pixel_values": pixel_values,
}
- train_sampler = (DistributedBatchSampler(
- train_dataset, batch_size=args.train_batch_size, shuffle=True)
- if num_processes > 1 else BatchSampler(
- train_dataset,
- batch_size=args.train_batch_size,
- shuffle=True))
+ train_sampler = (
+ DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ if num_processes > 1
+ else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ )
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
# Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps)
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps /
- num_update_steps_per_epoch)
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
if args.scale_lr:
- args.learning_rate = (args.learning_rate *
- args.gradient_accumulation_steps *
- args.train_batch_size * num_processes)
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes
+ )
lr_scheduler = get_scheduler(
args.lr_scheduler,
learning_rate=args.learning_rate,
- num_warmup_steps=args.lr_warmup_steps *
- args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps *
- args.gradient_accumulation_steps,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
num_cycles=args.lr_num_cycles,
- power=args.lr_power, )
+ power=args.lr_power,
+ )
- params_to_optimize = (list(unet_lora_layers.parameters()) +
- list(text_encoder_lora_layers.parameters())
- if args.train_text_encoder else
- unet_lora_layers.parameters())
+ params_to_optimize = (
+ list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters())
+ if args.train_text_encoder
+ else unet_lora_layers.parameters()
+ )
# Optimizer creation
optimizer = AdamW(
learning_rate=lr_scheduler,
@@ -886,8 +897,8 @@ def collate_fn(examples):
beta2=args.adam_beta2,
weight_decay=args.adam_weight_decay,
epsilon=args.adam_epsilon,
- grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
- if args.max_grad_norm > 0 else None, )
+ grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None,
+ )
if num_processes > 1:
unet = paddle.DataParallel(unet)
@@ -902,25 +913,19 @@ def collate_fn(examples):
writer = get_report_to(args)
# Train!
- total_batch_size = (args.train_batch_size * num_processes *
- args.gradient_accumulation_steps)
+ total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not is_main_process)
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process)
progress_bar.set_description("Train Steps")
global_step = 0
vae.eval()
@@ -941,52 +946,43 @@ def collate_fn(examples):
if args.noise_offset:
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
noise += args.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1),
- dtype=latents.dtype)
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype
+ )
batch_size = latents.shape[0]
# Sample a random timestep for each image
- timesteps = paddle.randint(
- 0, noise_scheduler.config.num_train_timesteps,
- (batch_size, )).cast("int64")
+ timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64")
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
- if num_processes > 1 and (args.gradient_checkpointing or (
- (step + 1) % args.gradient_accumulation_steps != 0)):
+ if num_processes > 1 and (
+ args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0)
+ ):
# grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0:
# gradient_checkpointing, no_sync every where
# gradient_checkpointing + grad_acc, no_sync every where
unet_ctx_manager = unet.no_sync()
else:
- unet_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
if use_attention_mask:
- attention_mask = (
- batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
+ attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
else:
attention_mask = None
- encoder_hidden_states = text_encoder(
- batch["input_ids"], attention_mask=attention_mask)[0]
+ encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0]
with unet_ctx_manager:
# Predict the noise residual / sample
- model_pred = unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise,
- timesteps)
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {noise_scheduler.config.prediction_type}"
- )
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
if args.with_prior_preservation:
# Chunk the noise and model_pred into two parts and compute the loss on each part separately.
@@ -997,8 +993,7 @@ def collate_fn(examples):
loss = F.mse_loss(model_pred, target, reduction="mean")
# Compute prior loss
- prior_loss = F.mse_loss(
- model_pred_prior, target_prior, reduction="mean")
+ prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean")
# Add the prior loss to the instance loss.
loss = loss + args.prior_loss_weight * prior_loss
@@ -1032,54 +1027,52 @@ def collate_fn(examples):
writer.add_scalar(f"train/{name}", val, global_step)
if global_step % args.checkpointing_steps == 0:
- save_path = os.path.join(args.output_dir,
- f"checkpoint-{global_step}")
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
# We combine the text encoder and UNet LoRA parameters with a simple
# custom logic. So, use `LoraLoaderMixin.save_lora_weights()`.
LoraLoaderMixin.save_lora_weights(
save_directory=save_path,
unet_lora_layers=unet_lora_layers,
- text_encoder_lora_layers=text_encoder_lora_layers, )
+ text_encoder_lora_layers=text_encoder_lora_layers,
+ )
logger.info(f"Saved lora weights to {save_path}")
if global_step >= args.max_train_steps:
break
if is_main_process:
- if (args.validation_prompt is not None and
- epoch % args.validation_epochs == 0):
+ if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
logger.info(
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
- f" {args.validation_prompt}.")
+ f" {args.validation_prompt}."
+ )
# create pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
unet=unwrap_model(unet),
text_encoder=unwrap_model(text_encoder),
safety_checker=None,
- requires_safety_checker=False, )
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
- pipeline.scheduler.config)
+ requires_safety_checker=False,
+ )
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
pipeline.set_progress_bar_config(disable=True)
# run inference
- generator = (paddle.Generator().manual_seed(args.seed)
- if args.seed else None)
+ generator = paddle.Generator().manual_seed(args.seed) if args.seed else None
images = [
pipeline(
args.validation_prompt,
num_inference_steps=25,
- generator=generator, ).images[0]
+ generator=generator,
+ ).images[0]
for _ in range(args.num_validation_images)
]
np_images = np.stack([np.asarray(img) for img in images])
if args.report_to == "tensorboard":
- writer.add_images(
- "test", np_images, epoch, dataformats="NHWC")
+ writer.add_images("test", np_images, epoch, dataformats="NHWC")
else:
- writer.add_image(
- "test", np_images, epoch, dataformats="NHWC")
+ writer.add_image("test", np_images, epoch, dataformats="NHWC")
del pipeline
if args.train_text_encoder:
@@ -1092,28 +1085,25 @@ def collate_fn(examples):
LoraLoaderMixin.save_lora_weights(
save_directory=args.output_dir,
unet_lora_layers=unet_lora_layers,
- text_encoder_lora_layers=text_encoder_lora_layers, )
+ text_encoder_lora_layers=text_encoder_lora_layers,
+ )
# Final inference
# Load previous pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
safety_checker=None,
- requires_safety_checker=False, )
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
- pipeline.scheduler.config)
+ requires_safety_checker=False,
+ )
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
# load attention processors
pipeline.load_lora_weights(args.output_dir)
# run inference
if args.validation_prompt and args.num_validation_images > 0:
- generator = paddle.Generator().manual_seed(
- args.seed) if args.seed else None
+ generator = paddle.Generator().manual_seed(args.seed) if args.seed else None
images = [
- pipeline(
- args.validation_prompt,
- num_inference_steps=25,
- generator=generator).images[0]
+ pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
for _ in range(args.num_validation_images)
]
np_images = np.stack([np.asarray(img) for img in images])
@@ -1128,8 +1118,7 @@ def collate_fn(examples):
# logic to push to HF Hub
if args.push_to_hub:
if args.hub_model_id is None:
- repo_id = get_full_repo_name(
- Path(args.output_dir).name, token=args.hub_token)
+ repo_id = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_id = args.hub_model_id
@@ -1142,14 +1131,16 @@ def collate_fn(examples):
},
base_wait_time=1.0,
max_retries=5,
- max_wait_time=10.0, )
+ max_wait_time=10.0,
+ )
save_model_card(
repo_id,
images=images,
base_model=args.pretrained_model_name_or_path,
prompt=args.instance_prompt,
- repo_folder=args.output_dir, )
+ repo_folder=args.output_dir,
+ )
# Upload model
logger.info(f"Pushing to {repo_id}")
_retry(
@@ -1164,7 +1155,8 @@ def collate_fn(examples):
},
base_wait_time=1.0,
max_retries=5,
- max_wait_time=20.0, )
+ max_wait_time=20.0,
+ )
if __name__ == "__main__":
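For readers skimming the training hunks above: when --with_prior_preservation is set, instance and class samples go through the UNet in one concatenated batch, which is why the loss section chunks model_pred and target back into two halves before weighting the class term. A minimal paddle sketch of that step (the variable names and prior_loss_weight mirror the hunks above; the function itself is illustrative, not the script's code):

import paddle
import paddle.nn.functional as F

def dreambooth_loss(model_pred, target, prior_loss_weight, with_prior_preservation=True):
    if with_prior_preservation:
        # Instance and class samples were stacked along the batch axis, so split them back.
        model_pred, model_pred_prior = paddle.chunk(model_pred, 2, axis=0)
        target, target_prior = paddle.chunk(target, 2, axis=0)
        loss = F.mse_loss(model_pred, target, reduction="mean")
        prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean")
        return loss + prior_loss_weight * prior_loss
    return F.mse_loss(model_pred, target, reduction="mean")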
diff --git a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py
index 8da60623e57c1..fb7a20763c805 100644
--- a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py
+++ b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py
@@ -19,13 +19,9 @@
image = load_image(url)
text = "a red car in the sun"
-pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.remove_unused_weights()
text_to_image_strength = 0.75
-image = pipe(
- prompt=text, image=image,
- text_to_image_strength=text_to_image_strength).images[0]
-image.save(
- "dual_text_and_image_guided_generation-versatile_diffusion-result.png")
+image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0]
+image.save("dual_text_and_image_guided_generation-versatile_diffusion-result.png")
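Not part of the diff, but since text_to_image_strength is the only dual-guidance knob this example exercises, a small sweep that reuses only the objects already created above makes the trade-off between the text prompt and the reference image easy to see (the loop and output file names are illustrative):

for strength in (0.25, 0.5, 0.75):
    mixed = pipe(prompt=text, image=image, text_to_image_strength=strength).images[0]
    mixed.save(f"dual_guided_strength_{strength}.png")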
diff --git a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py
index 59d805fed60f1..99812e2bd2122 100644
--- a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py
+++ b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py
@@ -29,7 +29,5 @@
# Use fp16 to speed up generation
with paddle.amp.auto_cast(True):
- image = pipe(
- image=init_image, mask_image=mask_image,
- example_image=example_image).images[0]
+ image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
image.save("image_guided_image_inpainting-paint_by_example-result.png")
diff --git a/ppdiffusers/examples/inference/image_inpainting-repaint.py b/ppdiffusers/examples/inference/image_inpainting-repaint.py
index 4e3cf9d1270c2..3d4a971fd734b 100644
--- a/ppdiffusers/examples/inference/image_inpainting-repaint.py
+++ b/ppdiffusers/examples/inference/image_inpainting-repaint.py
@@ -15,19 +15,15 @@
from ppdiffusers import RePaintPipeline, RePaintScheduler
from ppdiffusers.utils import load_image
-img_url = (
- "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png"
-)
+img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png"
mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png"
# Load the original image and the mask as PIL images
original_image = load_image(img_url).resize((256, 256))
mask_image = load_image(mask_url).resize((256, 256))
-scheduler = RePaintScheduler.from_pretrained(
- "google/ddpm-ema-celebahq-256", subfolder="scheduler")
-pipe = RePaintPipeline.from_pretrained(
- "google/ddpm-ema-celebahq-256", scheduler=scheduler)
+scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler")
+pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
output = pipe(
image=original_image,
@@ -35,7 +31,8 @@
num_inference_steps=250,
eta=0.0,
jump_length=10,
- jump_n_sample=10, )
+ jump_n_sample=10,
+)
inpainted_image = output.images[0]
inpainted_image.save("image_inpainting-repaint-result.png")
diff --git a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py
index 4889b99839ad0..ea5294247238b 100644
--- a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py
+++ b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py
@@ -28,17 +28,16 @@ def download_image(url):
# Loading additional models
-feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
-clip_model = CLIPModel.from_pretrained(
- "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16)
+feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
+clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16)
mixing_pipeline = DiffusionPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
custom_pipeline="clip_guided_images_mixing_stable_diffusion",
clip_model=clip_model,
feature_extractor=feature_extractor,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+)
mixing_pipeline.enable_attention_slicing()
# Pipeline running
@@ -64,6 +63,7 @@ def download_image(url):
guidance_scale=9.0,
batch_size=1,
clip_guidance_scale=100,
- generator=generator, ).images
+ generator=generator,
+).images
pipe_images[0].save("clip_guided_images_mixing_stable_diffusion.png")
diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py
index 537dc6cf71437..1525fc680c2c2 100644
--- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py
+++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py
@@ -26,8 +26,6 @@
prompt = "奇幻的景观,以一种艺术的形式。"
# Use fp16 to speed up generation
with paddle.amp.auto_cast(True):
- image = pipe(
- prompt=prompt, image=init_image, strength=0.75,
- guidance_scale=7.5).images[0]
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
image.save("image_to_image_text_guided_generation-alt_diffusion-result.png")
diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py
index d1cf291ca57f0..b1d9267b2ac0d 100644
--- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py
+++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py
@@ -19,9 +19,8 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py
index 5f106c65341f3..bdd71eb35c00d 100644
--- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py
+++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py
@@ -24,7 +24,8 @@
pipe = IFImg2ImgPipeline.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0",
variant="fp16",
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+)
pipe.enable_xformers_memory_efficient_attention()
prompt = "A fantasy landscape in style minecraft"
prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
@@ -33,25 +34,26 @@
image=original_image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
- output_type="pd", ).images
+ output_type="pd",
+).images
pipe.to(paddle_device="cpu")
# save intermediate image
pil_image = pd_to_pil(image)
-pil_image[0].save(
- "./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png")
+pil_image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png")
super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0",
text_encoder=None,
variant="fp16",
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+)
super_res_1_pipe.enable_xformers_memory_efficient_attention()
image = super_res_1_pipe(
image=image,
original_image=original_image,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_embeds, ).images
-image[0].save(
- "./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png")
+ negative_prompt_embeds=negative_embeds,
+).images
+image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png")
diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py
index 8de116547d619..5b2d857d58b4a 100644
--- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py
+++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py
@@ -18,8 +18,7 @@
from ppdiffusers.utils import load_image
# Load the pipeline
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Download the initial image
url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
@@ -29,8 +28,6 @@
prompt = "A fantasy landscape, trending on artstation"
# Use fp16 to speed up generation
with paddle.amp.auto_cast(True):
- image = pipe(
- prompt=prompt, image=init_image, strength=0.75,
- guidance_scale=7.5).images[0]
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
image.save("image_to_image_text_guided_generation-stable_diffusion-result.png")
diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py
index 6103f2a54a722..67472607587b3 100644
--- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py
+++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py
@@ -17,8 +17,7 @@
from ppdiffusers import StableDiffusionImg2ImgPipeline
from ppdiffusers.utils import load_image
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2")
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2")
# Download the initial image
url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
@@ -28,9 +27,6 @@
prompt = "A fantasy landscape, trending on artstation"
# Use fp16 to speed up generation
with paddle.amp.auto_cast(True):
- image = pipe(
- prompt=prompt, image=init_image, strength=0.75,
- guidance_scale=7.5).images[0]
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
-image.save(
- "image_to_image_text_guided_generation-stable_diffusion_2-result.png")
+image.save("image_to_image_text_guided_generation-stable_diffusion_2-result.png")
diff --git a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py
index 1f7dc26f085bc..1c7678b55930c 100644
--- a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py
+++ b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py
@@ -16,8 +16,7 @@
from ppdiffusers.utils import load_image
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
-image = load_image(
- "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
+image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
result = pipe(mode="i2t", image=image, prompt=None)
text = result.texts[0]
with open("image_to_text_generation-unidiffuser-result.txt", "w") as f:
diff --git a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py
index 3d03fdb457501..a8478035c8c87 100644
--- a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py
+++ b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py
@@ -21,19 +21,21 @@
"lambdalabs/sd-image-variations-diffusers",
revision="v2.0",
from_diffusers=True,
- from_hf_hub=True, )
+ from_hf_hub=True,
+)
-im = load_image(
- "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
+im = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
-tform = transforms.Compose([
- transforms.ToTensor(),
- transforms.Resize(
- (224, 224),
- interpolation="bicubic", ),
- transforms.Normalize([0.48145466, 0.4578275, 0.40821073],
- [0.26862954, 0.26130258, 0.27577711]),
-])
+tform = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Resize(
+ (224, 224),
+ interpolation="bicubic",
+ ),
+ transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]),
+ ]
+)
inp = tform(im)
out = sd_pipe(im, guidance_scale=3)
diff --git a/ppdiffusers/examples/inference/image_variation-unidiffuser.py b/ppdiffusers/examples/inference/image_variation-unidiffuser.py
index d2bd06a9c5ec0..c334c673ff288 100644
--- a/ppdiffusers/examples/inference/image_variation-unidiffuser.py
+++ b/ppdiffusers/examples/inference/image_variation-unidiffuser.py
@@ -16,8 +16,7 @@
from ppdiffusers.utils import load_image
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
-image = load_image(
- "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
+image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
result = pipe(mode="i2t2i", image=image, prompt=None)
image = result.images[0]
image.save("image_variation-unidiffuser-result.png")
diff --git a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py
index 08c7fbfb6c409..3b2ec2596cbcb 100644
--- a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py
+++ b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py
@@ -18,8 +18,7 @@
url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
image = load_image(url)
-pipe = VersatileDiffusionImageVariationPipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
image = pipe(image).images[0]
image.save("image_variation-versatile_diffusion-result.png")
diff --git a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py
index 79f9528c1a741..a986de034bc05 100644
--- a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py
+++ b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py
@@ -18,8 +18,7 @@
from ppdiffusers.utils import load_image
# Load the pipeline
-pipe = LDMSuperResolutionPipeline.from_pretrained(
- "CompVis/ldm-super-resolution-4x-openimages")
+pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
# Download the initial image
url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
diff --git a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py
index b5d317b5abcce..b6b29f140e86d 100644
--- a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py
+++ b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py
@@ -16,8 +16,7 @@
from ppdiffusers import SemanticStableDiffusionPipeline
-pipe = SemanticStableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.set_progress_bar_config(disable=None)
prompt = "a photo of a cat"
edit = {
@@ -38,6 +37,7 @@
guidance_scale=guidance_scale,
num_inference_steps=50,
width=512,
- height=512, )
+ height=512,
+)
image = output.images[0]
image.save("text_guided_generation-semantic_stable_diffusion-result.png")
diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py
index 013eeec9b316f..26115f88d6506 100644
--- a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py
+++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py
@@ -14,8 +14,7 @@
import paddle
-from ppdiffusers import (IFInpaintingPipeline,
- IFInpaintingSuperResolutionPipeline)
+from ppdiffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline
from ppdiffusers.utils import load_image, pd_to_pil
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png"
@@ -24,8 +23,7 @@
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png"
mask_image = load_image(url)
-pipe = IFInpaintingPipeline.from_pretrained(
- "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
+pipe = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
pipe.enable_xformers_memory_efficient_attention()
prompt = "blue sunglasses"
prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
@@ -35,7 +33,8 @@
mask_image=mask_image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
- output_type="pd", ).images
+ output_type="pd",
+).images
pipe.to(paddle_device="cpu")
# save intermediate image
pil_image = pd_to_pil(image)
@@ -45,7 +44,8 @@
"DeepFloyd/IF-II-L-v1.0",
text_encoder=None,
variant="fp16",
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+)
super_res_1_pipe.enable_xformers_memory_efficient_attention()
image = super_res_1_pipe(
@@ -53,5 +53,6 @@
mask_image=mask_image,
original_image=original_image,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_embeds, ).images
+ negative_prompt_embeds=negative_embeds,
+).images
image[0].save("./text_guided_image_inpainting-deepfloyd_if-if_stage_II.png")
diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py
index dd2dde2fe504c..0fdfe1946a84f 100644
--- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py
+++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py
@@ -23,13 +23,10 @@
init_image = load_image(img_url).resize((512, 512))
mask_image = load_image(mask_url).resize((512, 512))
-pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5")
prompt = "a cat sitting on a bench"
with paddle.amp.auto_cast(True):
- image = pipe(
- prompt=prompt, image=init_image, mask_image=mask_image,
- strength=0.75).images[0]
+ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images[0]
image.save("text_guided_image_inpainting-stable_diffusion-result.png")
diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py
index c89ecf9f8de59..6b27f9a60cf88 100644
--- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py
+++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py
@@ -21,8 +21,7 @@
init_image = load_image(img_url).resize((512, 512))
mask_image = load_image(mask_url).resize((512, 512))
-pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-inpainting")
+pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
# image and mask_image should be PIL images.
diff --git a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py
index 736b2a2d09f37..de2298e710d3c 100644
--- a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py
+++ b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py
@@ -15,8 +15,7 @@
from ppdiffusers import StableDiffusionUpscalePipeline
from ppdiffusers.utils import load_image
-pipe = StableDiffusionUpscalePipeline.from_pretrained(
- "stabilityai/stable-diffusion-x4-upscaler")
+pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
low_res_img = load_image(url).resize((128, 128))
diff --git a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py
index 511f0f55ac93b..2b4c1b1330a97 100644
--- a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py
+++ b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py
@@ -18,8 +18,7 @@
from ppdiffusers import AudioLDMPipeline
-pipe = AudioLDMPipeline.from_pretrained(
- "cvssp/audioldm", paddle_dtype=paddle.float16)
+pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16)
prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
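The hunk ends before the example writes the waveform to disk; purely as an illustration (the 16 kHz sample rate is an assumption about the cvssp/audioldm checkpoint, not something the diff states), the numpy array in audio could be saved with scipy:

import numpy as np
from scipy.io.wavfile import write

write("text_to_audio_generation-audio_ldm-example.wav", 16000, np.asarray(audio, dtype="float32"))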
diff --git a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py
index f6863fe8f4f8c..fccaff284995e 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py
@@ -14,10 +14,8 @@
from ppdiffusers import AltDiffusionPipeline, DPMSolverMultistepScheduler
-scheduler = DPMSolverMultistepScheduler.from_pretrained(
- "BAAI/AltDiffusion", subfolder="scheduler")
-pipe = AltDiffusionPipeline.from_pretrained(
- "BAAI/AltDiffusion", scheduler=scheduler)
+scheduler = DPMSolverMultistepScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler")
+pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler)
prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图"
# or in English:
diff --git a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py
index a016bbfbe1019..9b420b5aa57ba 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py
@@ -21,15 +21,13 @@
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- controlnet=controlnet,
- safety_checker=None)
+ "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
+)
resolution = 512
image = np.array(
- load_image(
- "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
- ))
+ load_image("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png")
+)
image = cv2.Canny(image, 100, 200)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
@@ -43,5 +41,6 @@
num_inference_steps=50,
height=resolution,
width=resolution,
- controlnet_conditioning_scale=1.0, ).images[0]
+ controlnet_conditioning_scale=1.0,
+).images[0]
image.save("text_to_image_generation-controlnet-result-bird_canny.png")
diff --git a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py
index b060557c4a7cb..f55ded139341f 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py
@@ -14,20 +14,19 @@
import paddle
-from ppdiffusers import (DiffusionPipeline, IFPipeline,
- IFSuperResolutionPipeline)
+from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline
from ppdiffusers.utils import pd_to_pil
# Stage 1: generate images
-pipe = IFPipeline.from_pretrained(
- "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
+pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
pipe.enable_xformers_memory_efficient_attention()
prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
- output_type="pd", ).images
+ output_type="pd",
+).images
# save intermediate image
pil_image = pd_to_pil(image)
@@ -40,27 +39,30 @@
"DeepFloyd/IF-II-L-v1.0",
text_encoder=None,
variant="fp16",
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+)
super_res_1_pipe.enable_xformers_memory_efficient_attention()
image = super_res_1_pipe(
image=image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
- output_type="pd", ).images
+ output_type="pd",
+).images
# save intermediate image
pil_image = pd_to_pil(image)
-pil_image[0].save(
- "text_to_image_generation-deepfloyd_if-result-if_stage_II.png")
+pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png")
# save gpu memory
super_res_1_pipe.to(paddle_device="cpu")
# Stage 3: super resolution stage2
super_res_2_pipe = DiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16)
+ "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16
+)
super_res_2_pipe.enable_xformers_memory_efficient_attention()
image = super_res_2_pipe(
prompt=prompt,
- image=image, ).images
+ image=image,
+).images
image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_III.png")
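As a quick orientation on the three stages chained above: output_type="pd" keeps the intermediate images as paddle tensors so they can feed the next pipeline directly, and pd_to_pil converts them only for saving. The resolution ladder below is an assumption based on the default DeepFloyd IF checkpoints, not something the diff states:

stage_1 = 64            # IF-I-XL base resolution (assumed)
stage_2 = stage_1 * 4   # IF-II-L super resolution -> 256
stage_3 = stage_2 * 4   # stable-diffusion-x4-upscaler -> 1024
print(stage_1, stage_2, stage_3)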
diff --git a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py
index 89ebf5ee3570d..4a71ac1a6b273 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py
@@ -15,10 +15,8 @@
from ppdiffusers import StableDiffusionPipelineSafe
from ppdiffusers.pipelines.stable_diffusion_safe import SafetyConfig
-pipe = StableDiffusionPipelineSafe.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+pipe = StableDiffusionPipelineSafe.from_pretrained("runwayml/stable-diffusion-v1-5")
print(pipe.safety_concept)
prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker"
out = pipe(prompt=prompt, **SafetyConfig.MAX)
-out.images[0].save(
- "text_to_image_generation-stable_diffusion_safe-result.png.png")
+out.images[0].save("text_to_image_generation-stable_diffusion_safe-result.png.png")
diff --git a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py
index 0c00344a7f602..0d0ef4e6ce819 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py
@@ -16,21 +16,20 @@
from ppdiffusers import StableDiffusionAdapterPipeline, T2IAdapter
from ppdiffusers.utils import PIL_INTERPOLATION, load_image
-input_image = load_image(
- "https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png"
-)
+input_image = load_image("https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png")
color_palette = input_image.resize((8, 8))
-color_palette = color_palette.resize(
- (512, 512), resample=PIL_INTERPOLATION["nearest"])
+color_palette = color_palette.resize((512, 512), resample=PIL_INTERPOLATION["nearest"])
adapter = T2IAdapter.from_pretrained("westfish/sd-v1-4-adapter-color")
pipe = StableDiffusionAdapterPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
adapter=adapter,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+)
image = pipe(
prompt="At night, glowing cubes in front of the beach",
- image=color_palette, ).images[0]
+ image=color_palette,
+).images[0]
image.save("text_to_image_generation-t2i-adapter-result-color_adapter.png")
diff --git a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py
index db8d5261d101a..d777a8ce31db3 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py
@@ -14,8 +14,7 @@
from ppdiffusers import VersatileDiffusionTextToImagePipeline
-pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.remove_unused_weights()
image = pipe("an astronaut riding on a horse on mars").images[0]
diff --git a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py
index cb4171be41abc..fd93408658d48 100644
--- a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py
+++ b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py
@@ -19,25 +19,30 @@
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+)
pipeline = DiffusionPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
scheduler=scheduler,
- custom_pipeline="mixture_tiling.py", )
+ custom_pipeline="mixture_tiling.py",
+)
pipeline
# Mixture of Diffusers generation
image = pipeline(
- prompt=[[
- "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
- "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
- "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
- ]],
+ prompt=[
+ [
+ "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+ "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+ "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
+ ]
+ ],
tile_height=640,
tile_width=640,
tile_row_overlap=0,
tile_col_overlap=256,
guidance_scale=8,
seed=7178915308,
- num_inference_steps=50, )["images"][0]
+ num_inference_steps=50,
+)["images"][0]
image.save("mixture_tiling" + ".png")
diff --git a/ppdiffusers/examples/inference/text_to_video_generation-synth.py b/ppdiffusers/examples/inference/text_to_video_generation-synth.py
index 9fd346c0f5bc1..e197cb41f426d 100644
--- a/ppdiffusers/examples/inference/text_to_video_generation-synth.py
+++ b/ppdiffusers/examples/inference/text_to_video_generation-synth.py
@@ -24,4 +24,5 @@
imageio.mimsave(
"text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4",
video_frames,
- fps=8, )
+ fps=8,
+)
diff --git a/ppdiffusers/examples/inference/text_to_video_generation-zero.py b/ppdiffusers/examples/inference/text_to_video_generation-zero.py
index b26103c3f32e2..0e4efb3563d50 100644
--- a/ppdiffusers/examples/inference/text_to_video_generation-zero.py
+++ b/ppdiffusers/examples/inference/text_to_video_generation-zero.py
@@ -13,14 +13,14 @@
# limitations under the License.
import imageio
+
# pip install imageio[ffmpeg]
import paddle
from ppdiffusers import TextToVideoZeroPipeline
model_id = "runwayml/stable-diffusion-v1-5"
-pipe = TextToVideoZeroPipeline.from_pretrained(
- model_id, paddle_dtype=paddle.float16)
+pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16)
prompt = "A panda is playing guitar on times square"
result = pipe(prompt=prompt).images
diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py
index 92557d8d6e2f4..e1914bab67daa 100644
--- a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py
+++ b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py
@@ -18,8 +18,7 @@
from ppdiffusers import AudioDiffusionPipeline
# Load the model and scheduler
-pipe = AudioDiffusionPipeline.from_pretrained(
- "teticio/audio-diffusion-ddim-256")
+pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(42)
@@ -29,8 +28,7 @@
# Save the audio locally
for i, audio in enumerate(audio):
- write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate,
- audio.transpose())
+ write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, audio.transpose())
# Save the image
image.save("unconditional_audio_generation-audio_diffusion-result.png")
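A note on the write(...) call kept in this hunk: the audio array is transposed before saving because scipy.io.wavfile.write expects data shaped (n_samples, n_channels), while the pipeline appears to yield channel-first arrays. A minimal standalone sketch with dummy data (the sample rate and shape below are placeholders; the real rate is pipe.mel.sample_rate):

import numpy as np
from scipy.io.wavfile import write

sample_rate = 22050
audio = np.random.uniform(-1, 1, (1, sample_rate)).astype(np.float32)  # (channels, samples)
write("audio_diffusion_example.wav", sample_rate, audio.transpose())   # (samples, channels)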
diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py
index 051f61f892230..9114555e75a38 100644
--- a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py
+++ b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py
@@ -27,4 +27,5 @@
write(
f"unconditional_audio_generation-dance_diffusion-result_{i}.wav",
pipe.unet.sample_rate,
- audio.transpose(), )
+ audio.transpose(),
+ )
diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py
index d498dfbd88225..fe99d89347981 100644
--- a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py
+++ b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py
@@ -22,9 +22,9 @@
# Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid
mid_file_path = ppdiffusers_url_download(
"https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid",
- cache_dir=".", )
-pipe = SpectrogramDiffusionPipeline.from_pretrained(
- "google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
+ cache_dir=".",
+)
+pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
processor = MidiProcessor()
output = pipe(processor(mid_file_path))
audio = output.audios[0]
diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py
index 2e22c143e2271..90f93ac299ed4 100644
--- a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py
+++ b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py
@@ -16,8 +16,7 @@
scheduler = KarrasVeScheduler()
# Load the model and scheduler
-pipe = KarrasVePipeline.from_pretrained(
- "google/ncsnpp-celebahq-256", scheduler=scheduler)
+pipe = KarrasVePipeline.from_pretrained("google/ncsnpp-celebahq-256", scheduler=scheduler)
# Run the pipeline for inference
image = pipe().images
diff --git a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py
index fec274338d9ad..38aed057ce167 100644
--- a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py
+++ b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py
@@ -19,6 +19,5 @@
image = result.images[0]
image.save("unconditional_image_text_generation-unidiffuser-result.png")
text = result.texts[0]
-with open("unconditional_image_text_generation-unidiffuser-result.txt",
- "w") as f:
+with open("unconditional_image_text_generation-unidiffuser-result.txt", "w") as f:
print("{}\n".format(text), file=f)
diff --git a/ppdiffusers/examples/stable_diffusion/generate_images.py b/ppdiffusers/examples/stable_diffusion/generate_images.py
index e20424e75e4ee..933fd0b771040 100644
--- a/ppdiffusers/examples/stable_diffusion/generate_images.py
+++ b/ppdiffusers/examples/stable_diffusion/generate_images.py
@@ -22,9 +22,14 @@
from paddlenlp.transformers import CLIPTextModel
from tqdm.auto import tqdm
-from ppdiffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler,
- LMSDiscreteScheduler, PNDMScheduler,
- StableDiffusionPipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import DOWNLOAD_SERVER, PPDIFFUSERS_CACHE
base_url = DOWNLOAD_SERVER + "/CompVis/data/"
@@ -43,32 +48,30 @@ def batchify(data, batch_size=16):
def generate_images(
- unet_model_name_or_path,
- text_encoder_model_name_or_path=None,
- batch_size=16,
- file="coco30k.csv",
- save_path="output",
- seed=42,
- scheduler_type="ddim",
- eta=0.0,
- num_inference_steps=50,
- guidance_scales=[3, 4, 5, 6, 7, 8],
- height=256,
- width=256,
- device="gpu",
- variant="bf16", ):
+ unet_model_name_or_path,
+ text_encoder_model_name_or_path=None,
+ batch_size=16,
+ file="coco30k.csv",
+ save_path="output",
+ seed=42,
+ scheduler_type="ddim",
+ eta=0.0,
+ num_inference_steps=50,
+ guidance_scales=[3, 4, 5, 6, 7, 8],
+ height=256,
+ width=256,
+ device="gpu",
+ variant="bf16",
+):
paddle.set_device(device)
if variant == "fp32":
variant = None
- unet = UNet2DConditionModel.from_pretrained(
- unet_model_name_or_path, variant=variant)
+ unet = UNet2DConditionModel.from_pretrained(unet_model_name_or_path, variant=variant)
kwargs = {"safety_checker": None, "unet": unet}
if text_encoder_model_name_or_path is not None:
- text_encoder = CLIPTextModel.from_pretrained(
- text_encoder_model_name_or_path, variant=variant)
+ text_encoder = CLIPTextModel.from_pretrained(text_encoder_model_name_or_path, variant=variant)
kwargs["text_encoder"] = text_encoder
- pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", **kwargs)
+ pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", **kwargs)
pipe.set_progress_bar_config(disable=True)
beta_start = pipe.scheduler.beta_start
beta_end = pipe.scheduler.beta_end
@@ -80,17 +83,14 @@ def generate_images(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler is compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif scheduler_type == "euler-ancestral":
scheduler = EulerAncestralDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+ )
elif scheduler_type == "ddim":
scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -99,7 +99,8 @@ def generate_images(
# Make sure the scheduler is compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
pipe.scheduler = scheduler
@@ -122,7 +123,8 @@ def generate_images(
eta=eta,
height=height,
width=width,
- num_inference_steps=num_inference_steps, )[0]
+ num_inference_steps=num_inference_steps,
+ )[0]
for image in images:
path = os.path.join(new_save_path, "{:05d}_000.png".format(i))
image.save(path)
@@ -136,28 +138,33 @@ def generate_images(
default=None,
type=str,
required=True,
- help="unet_model_name_or_path.", )
+ help="unet_model_name_or_path.",
+ )
parser.add_argument(
"--text_encoder_model_name_or_path",
default=None,
type=str,
- help="text_encoder_model_name_or_path.", )
+ help="text_encoder_model_name_or_path.",
+ )
parser.add_argument(
"--file",
default="coco30k",
type=str,
- help="eval file.", )
+ help="eval file.",
+ )
parser.add_argument(
"--variant",
default="fp32",
type=str,
choices=["fp32", "bf16"],
- help="eval file.", )
+ help="eval file.",
+ )
parser.add_argument(
"--seed",
default=42,
type=int,
- help="random seed.", )
+ help="random seed.",
+ )
parser.add_argument(
"--scheduler_type",
default="ddim",
@@ -167,22 +174,15 @@ def generate_images(
)
parser.add_argument("--device", default="gpu", type=str, help="device")
parser.add_argument("--batch_size", default=16, type=int, help="batch_size")
- parser.add_argument(
- "--num_inference_steps",
- default=50,
- type=int,
- help="num_inference_steps")
- parser.add_argument(
- "--save_path",
- default="outputs",
- type=str,
- help="Path to the output file.")
+ parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps")
+ parser.add_argument("--save_path", default="outputs", type=str, help="Path to the output file.")
parser.add_argument(
"--guidance_scales",
default=[1.5, 2, 3, 4, 5, 6, 7, 8],
nargs="+",
type=str,
- help="guidance_scales list.", )
+ help="guidance_scales list.",
+ )
parser.add_argument("--height", default=256, type=int, help="height.")
parser.add_argument("--width", default=256, type=int, help="width.")
args = parser.parse_args()
@@ -210,4 +210,5 @@ def generate_images(
height=args.height,
width=args.width,
device=args.device,
- variant=args.variant, )
+ variant=args.variant,
+ )
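For readers skimming the reformatted scheduler branches above, the dispatch in generate_images.py can be summarized as a small factory. This is only a sketch that mirrors the if/elif logic visible in the hunks; it is not a change proposed in this diff, and the "pndm" branch name is inferred from the PNDM comment rather than shown explicitly:

from ppdiffusers import (
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
)

def build_scheduler(scheduler_type: str, beta_start: float, beta_end: float):
    # Mirrors the if/elif chain in generate_images.py.
    common = dict(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
    if scheduler_type == "pndm":
        return PNDMScheduler(**common, set_alpha_to_one=False, steps_offset=1, skip_prk_steps=True)
    if scheduler_type == "lms":
        return LMSDiscreteScheduler(**common)
    if scheduler_type == "euler-ancestral":
        return EulerAncestralDiscreteScheduler(**common)
    if scheduler_type == "ddim":
        return DDIMScheduler(**common, clip_sample=False, set_alpha_to_one=False, steps_offset=1)
    raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")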
diff --git a/ppdiffusers/examples/stable_diffusion/sd/model.py b/ppdiffusers/examples/stable_diffusion/sd/model.py
index 449a74df28ff4..bd0df892a83b1 100644
--- a/ppdiffusers/examples/stable_diffusion/sd/model.py
+++ b/ppdiffusers/examples/stable_diffusion/sd/model.py
@@ -21,8 +21,13 @@
from paddlenlp.transformers import AutoTokenizer, CLIPTextModel
from paddlenlp.utils.log import logger
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- UNet2DConditionModel, is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.initializer import reset_initialized_parameter, zeros_
from ppdiffusers.models.attention import AttentionBlock
from ppdiffusers.models.ema import LitEma
@@ -37,30 +42,31 @@ def __init__(self, model_args):
self.model_args = model_args
tokenizer_name_or_path = (
model_args.tokenizer_name
- if model_args.tokenizer_name is not None else
- os.path.join(model_args.pretrained_model_name_or_path, "tokenizer"))
+ if model_args.tokenizer_name is not None
+ else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")
+ )
vae_name_or_path = (
model_args.vae_name_or_path
- if model_args.vae_name_or_path is not None else
- os.path.join(model_args.pretrained_model_name_or_path, "vae"))
+ if model_args.vae_name_or_path is not None
+ else os.path.join(model_args.pretrained_model_name_or_path, "vae")
+ )
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.text_encoder_name_or_path is not None else
- os.path.join(model_args.pretrained_model_name_or_path,
- "text_encoder"))
+ if model_args.text_encoder_name_or_path is not None
+ else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder")
+ )
unet_name_or_path = (
model_args.unet_name_or_path
- if model_args.unet_name_or_path is not None else
- os.path.join(model_args.pretrained_model_name_or_path, "unet"))
+ if model_args.unet_name_or_path is not None
+ else os.path.join(model_args.pretrained_model_name_or_path, "unet")
+ )
# init model and tokenizer
tokenizer_kwargs = {}
if model_args.model_max_length is not None:
tokenizer_kwargs["model_max_length"] = model_args.model_max_length
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
- **tokenizer_kwargs)
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs)
self.vae = AutoencoderKL.from_pretrained(vae_name_or_path)
- self.text_encoder = CLIPTextModel.from_pretrained(
- text_encoder_name_or_path)
+ self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path)
try:
self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path)
except Exception:
@@ -88,9 +94,9 @@ def __init__(self, model_args):
beta_end=0.012,
beta_schedule="scaled_linear",
num_train_timesteps=1000,
- prediction_type=self.model_args.prediction_type, )
- self.register_buffer("alphas_cumprod",
- self.noise_scheduler.alphas_cumprod)
+ prediction_type=self.model_args.prediction_type,
+ )
+ self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod)
self.eval_scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
@@ -99,7 +105,8 @@ def __init__(self, model_args):
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1,
- prediction_type=self.model_args.prediction_type, )
+ prediction_type=self.model_args.prediction_type,
+ )
self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps)
self.use_ema = False
self.model_ema = None
@@ -109,7 +116,7 @@ def compute_snr(self, timesteps):
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
"""
sqrt_alphas_cumprod = self.alphas_cumprod**0.5
- sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod)**0.5
+ sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5
# Expand the tensors.
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
@@ -118,15 +125,13 @@ def compute_snr(self, timesteps):
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[
- timesteps].cast("float32")
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32")
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[...,
- None]
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
# Compute SNR.
- snr = (alpha / sigma)**2
+ snr = (alpha / sigma) ** 2
return snr
def forward(self, input_ids=None, pixel_values=None, **kwargs):
@@ -143,14 +148,14 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
if self.model_args.noise_offset:
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
noise += self.model_args.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype)
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype
+ )
if self.model_args.input_perturbation:
- new_noise = noise + self.model_args.input_perturbation * paddle.randn(
- noise.shape, dtype=noise.dtype)
+ new_noise = noise + self.model_args.input_perturbation * paddle.randn(noise.shape, dtype=noise.dtype)
- timesteps = paddle.randint(
- 0, self.noise_scheduler.config.num_train_timesteps,
- (latents.shape[0], )).cast("int64")
+ timesteps = paddle.randint(0, self.noise_scheduler.config.num_train_timesteps, (latents.shape[0],)).cast(
+ "int64"
+ )
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
if self.model_args.input_perturbation:
@@ -165,7 +170,8 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
model_pred = self.unet(
sample=noisy_latents,
timestep=timesteps,
- encoder_hidden_states=encoder_hidden_states, ).sample
+ encoder_hidden_states=encoder_hidden_states,
+ ).sample
# Get the target for loss depending on the prediction type
if self.model_args.prediction_type == "epsilon":
@@ -173,64 +179,58 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
elif self.model_args.prediction_type == "v_prediction":
target = self.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {self.model_args.prediction_type}")
+ raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}")
# compute loss
if self.model_args.snr_gamma is None:
- loss = (F.mse_loss(
- model_pred.cast("float32"),
- target.cast("float32"),
- reduction="none").mean([1, 2, 3]).mean())
+ loss = (
+ F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean()
+ )
else:
# Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
# Since we predict the noise instead of x_0, the original formulation is slightly changed.
# This is discussed in Section 4.2 of the same paper.
snr = self.compute_snr(timesteps)
- mse_loss_weights = (paddle.stack(
- [snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)],
- axis=1, ).min(axis=1)[0] / snr)
+ mse_loss_weights = (
+ paddle.stack([snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)], axis=1,).min(
+ axis=1
+ )[0]
+ / snr
+ )
# We first calculate the original loss. Then we mean over the non-batch dimensions and
# rebalance the sample-wise losses with their respective loss weights.
# Finally, we take the mean of the rebalanced loss.
- loss = F.mse_loss(
- model_pred.cast("float32"),
- target.cast("float32"),
- reduction="none")
+ loss = F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none")
loss = loss.mean(list(range(1, len(loss.shape)))) * mse_loss_weights
loss = loss.mean()
return loss
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
- sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
- def get_velocity(self,
- sample: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor) -> paddle.Tensor:
- sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+ def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(sample.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
@@ -285,20 +285,19 @@ def decode_image(self, pixel_values=None, max_batch=8, **kwargs):
@paddle.no_grad()
def log_image(
- self,
- input_ids=None,
- height=256,
- width=256,
- eta=0.0,
- guidance_scale=7.5,
- max_batch=8,
- **kwargs, ):
+ self,
+ input_ids=None,
+ height=256,
+ width=256,
+ eta=0.0,
+ guidance_scale=7.5,
+ max_batch=8,
+ **kwargs,
+ ):
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log max_batch image
if input_ids.shape[0] > max_batch:
input_ids = input_ids[:max_batch]
@@ -311,34 +310,25 @@ def log_image(
padding="max_length",
truncation=True,
max_length=max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings], axis=0)
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0)
- latents = paddle.randn((input_ids.shape[0], self.unet.in_channels,
- height // 8, width // 8))
+ latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8))
latents = latents * self.eval_scheduler.init_noise_sigma
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for t in self.eval_scheduler.timesteps:
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings).sample
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- latents = self.eval_scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0
@@ -347,8 +337,7 @@ def log_image(
def set_recompute(self, use_recompute=False):
if use_recompute:
self.unet.enable_gradient_checkpointing()
- if self.model_args.train_text_encoder and hasattr(
- self.text_encoder, "gradient_checkpointing_enable"):
+ if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"):
self.text_encoder.gradient_checkpointing_enable()
def gradient_checkpointing_enable(self):
@@ -362,26 +351,21 @@ def set_xformers(self, use_xformers=False):
)
else:
try:
- attention_op = os.getenv("FLAG_XFORMERS_ATTENTION_OP",
- "none").lower()
+ attention_op = os.getenv("FLAG_XFORMERS_ATTENTION_OP", "none").lower()
if attention_op == "none":
attention_op = None
- self.unet.enable_xformers_memory_efficient_attention(
- attention_op)
- if hasattr(self.vae,
- "enable_xformers_memory_efficient_attention"):
- self.vae.enable_xformers_memory_efficient_attention(
- attention_op)
- if hasattr(self.text_encoder,
- "enable_xformers_memory_efficient_attention"):
- self.text_encoder.enable_xformers_memory_efficient_attention(
- attention_op)
+ self.unet.enable_xformers_memory_efficient_attention(attention_op)
+ if hasattr(self.vae, "enable_xformers_memory_efficient_attention"):
+ self.vae.enable_xformers_memory_efficient_attention(attention_op)
+ if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"):
+ self.text_encoder.enable_xformers_memory_efficient_attention(attention_op)
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
def set_ema(self, use_ema=False):
self.use_ema = use_ema
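add_noise, get_velocity, compute_snr, and the Min-SNR loss weighting touched above are all closed-form functions of the same alphas_cumprod schedule. As a framework-agnostic illustration (NumPy instead of paddle, toy shapes and variable names ours), the relations are:

import numpy as np

# scaled_linear schedule as configured above: betas = linspace(sqrt(b0), sqrt(b1)) ** 2
betas = np.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000) ** 2
alphas_cumprod = np.cumprod(1.0 - betas)

t = 500
x0 = np.random.randn(4, 4)    # stand-in latent
eps = np.random.randn(4, 4)   # sampled noise

sqrt_a = alphas_cumprod[t] ** 0.5
sqrt_1ma = (1.0 - alphas_cumprod[t]) ** 0.5

x_t = sqrt_a * x0 + sqrt_1ma * eps   # add_noise: forward diffusion at step t
v = sqrt_a * eps - sqrt_1ma * x0     # get_velocity: target for v_prediction
snr = (sqrt_a / sqrt_1ma) ** 2       # compute_snr

# Min-SNR loss weight (the snr_gamma branch above), e.g. with snr_gamma = 5.0:
mse_loss_weight = min(snr, 5.0) / snr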
diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py
index d15e6b0894fe4..4ca34e749fc3f 100644
--- a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py
+++ b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py
@@ -34,39 +34,34 @@
@dataclass
class SDTrainingArguments(TrainingArguments):
- image_logging_steps: int = field(
- default=1000, metadata={"help": "Log image every X steps."})
- to_static: bool = field(
- default=False, metadata={"help": "Whether or not to_static"})
+ image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."})
+ to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"})
benchmark: bool = field(
default=False,
- metadata={"help": "Whether or not run benchmark."}, )
+ metadata={"help": "Whether or not run benchmark."},
+ )
profiler_options: Optional[str] = field(
default=None,
- metadata={"help": "profiler_options."}, )
+ metadata={"help": "profiler_options."},
+ )
report_to: Optional[List[str]] = field(
default_factory=lambda: ["custom_visualdl"],
- metadata={
- "help":
- "The list of integrations to report the results and logs to."
- }, )
+ metadata={"help": "The list of integrations to report the results and logs to."},
+ )
resolution: int = field(
default=512,
metadata={
- "help":
- "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
- }, )
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
+ },
+ )
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable_xformers_memory_efficient_attention."})
+ default=False, metadata={"help": "enable_xformers_memory_efficient_attention."}
+ )
only_save_updated_model: bool = field(
- default=True,
- metadata={"help": "Whether or not save only_save_updated_model"})
- unet_learning_rate: float = field(
- default=None,
- metadata={"help": "The initial learning rate for Unet Model."})
+ default=True, metadata={"help": "Whether or not save only_save_updated_model"}
+ )
+ unet_learning_rate: float = field(default=None, metadata={"help": "The initial learning rate for Unet Model."})
text_encoder_learning_rate: float = field(
default=None,
metadata={"help": "The initial learning rate for Text Encoder Model."},
@@ -75,19 +70,17 @@ class SDTrainingArguments(TrainingArguments):
def __post_init__(self):
super().__post_init__()
self.image_logging_steps = (
- (math.ceil(self.image_logging_steps / self.logging_steps) *
- self.logging_steps) if self.image_logging_steps > 0 else -1)
- self.use_ema = str2bool(os.getenv("FLAG_USE_EMA",
- "False")) or self.use_ema
+ (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps)
+ if self.image_logging_steps > 0
+ else -1
+ )
+ self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema
self.enable_xformers_memory_efficient_attention = (
- str2bool(os.getenv("FLAG_XFORMERS", "False")) or
- self.enable_xformers_memory_efficient_attention)
- self.recompute = (str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or
- self.recompute)
- self.benchmark = (str2bool(os.getenv("FLAG_BENCHMARK", "False")) or
- self.benchmark)
- self.to_static = (str2bool(os.getenv("FLAG_TO_STATIC", "False")) or
- self.to_static)
+ str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention
+ )
+ self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute
+ self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark
+ self.to_static = str2bool(os.getenv("FLAG_TO_STATIC", "False")) or self.to_static
if self.text_encoder_learning_rate is None:
self.text_encoder_learning_rate = self.learning_rate
@@ -105,45 +98,34 @@ def __post_init__(self):
@dataclass
class SDModelArguments:
- vae_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "vae_name_or_path"})
- text_encoder_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "text_encoder_name_or_path"})
- unet_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "unet_name_or_path"})
+ vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"})
+ text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"})
+ unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"})
tokenizer_name: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"
- }, )
+ metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"},
+ )
pretrained_model_name_or_path: str = field(
default="CompVis/stable-diffusion-v1-4",
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
- model_max_length: int = field(
- default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
+ model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
prediction_type: str = field(
default="epsilon",
metadata={
- "help":
- "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)"
- }, )
- num_inference_steps: int = field(
- default=50, metadata={"help": "num_inference_steps"})
- train_text_encoder: bool = field(
- default=False, metadata={"help": "Whether or not train text encoder"})
-
- noise_offset: float = field(
- default=0, metadata={"help": "The scale of noise offset."})
+ "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)"
+ },
+ )
+ num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"})
+ train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"})
+
+ noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."})
snr_gamma: Optional[float] = field(
default=None,
metadata={
- "help":
- "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556."
- }, )
+ "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556."
+ },
+ )
input_perturbation: Optional[float] = field(
default=0,
metadata={"help": "The scale of input perturbation. Recommended 0.1."},
@@ -158,14 +140,18 @@ class SDDataArguments:
file_list: str = field(
default="./data/filelist/train.filelist.list",
- metadata={"help": "The name of the file_list."}, )
+ metadata={"help": "The name of the file_list."},
+ )
num_records: int = field(default=10000000, metadata={"help": "num_records"})
buffer_size: int = field(
default=100,
- metadata={"help": "Buffer size"}, )
+ metadata={"help": "Buffer size"},
+ )
shuffle_every_n_samples: int = field(
default=5,
- metadata={"help": "shuffle_every_n_samples."}, )
+ metadata={"help": "shuffle_every_n_samples."},
+ )
interpolation: str = field(
default="lanczos",
- metadata={"help": "interpolation method"}, )
+ metadata={"help": "interpolation method"},
+ )
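Two behaviours in SDTrainingArguments.__post_init__ above are easy to miss once the lines are joined: image_logging_steps is rounded up to a multiple of logging_steps, and several booleans can be switched on via FLAG_* environment variables. A minimal standalone sketch (str2bool is re-implemented here only for illustration; the training code imports it from paddlenlp):

import math
import os

def str2bool(v):  # simplified stand-in for the paddlenlp helper
    return str(v).lower() in ("1", "true", "t", "yes", "y")

logging_steps = 50
image_logging_steps = 120
# Rounded up to the nearest multiple of logging_steps: ceil(120 / 50) * 50 == 150
image_logging_steps = (
    math.ceil(image_logging_steps / logging_steps) * logging_steps if image_logging_steps > 0 else -1
)

# CLI flag or environment variable, whichever turns the feature on.
use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or False
recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or False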
diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py
index 042f4f9410724..0ef65c15cac26 100644
--- a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py
+++ b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py
@@ -22,7 +22,11 @@
from paddle.io import DataLoader
from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer
from paddlenlp.trainer.integrations import (
- INTEGRATION_TO_CALLBACK, TrainerCallback, VisualDLCallback, rewrite_logs)
+ INTEGRATION_TO_CALLBACK,
+ TrainerCallback,
+ VisualDLCallback,
+ rewrite_logs,
+)
from paddlenlp.transformers.model_utils import _add_variant
from paddlenlp.utils import profiler
from paddlenlp.utils.log import logger
@@ -58,19 +62,17 @@ def autocast_smart_context_manager(self, args):
custom_black_list=set(custom_black_list),
custom_white_list=set(custom_white_list),
level=args.fp16_opt_level,
- dtype=amp_dtype, )
+ dtype=amp_dtype,
+ )
else:
- ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
return ctx_manager
def on_step_end(self, args, state, control, model=None, **kwargs):
if hasattr(model, "on_train_batch_end"):
model.on_train_batch_end()
- if (args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0:
control.should_log = True
def on_log(self, args, state, control, logs=None, **kwargs):
@@ -78,26 +80,32 @@ def on_log(self, args, state, control, logs=None, **kwargs):
inputs = kwargs.get("inputs", None)
model = kwargs.get("model", None)
image_logs = {}
- if (inputs is not None and model is not None and
- args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if (
+ inputs is not None
+ and model is not None
+ and args.image_logging_steps > 0
+ and state.global_step % args.image_logging_steps == 0
+ ):
with self.autocast_smart_context_manager(args):
max_batch = 4 if args.resolution > 256 else 8
image_logs["reconstruction"] = model.decode_image(
- pixel_values=inputs["pixel_values"], max_batch=max_batch)
+ pixel_values=inputs["pixel_values"], max_batch=max_batch
+ )
image_logs["ddim-samples-1.0"] = model.log_image(
input_ids=inputs["input_ids"],
guidance_scale=1.0,
height=args.resolution,
width=args.resolution,
- max_batch=max_batch, )
+ max_batch=max_batch,
+ )
image_logs["ddim-samples-7.5"] = model.log_image(
input_ids=inputs["input_ids"],
guidance_scale=7.5,
height=args.resolution,
width=args.resolution,
- max_batch=max_batch, )
+ max_batch=max_batch,
+ )
if not state.is_world_process_zero:
return
@@ -110,10 +118,8 @@ def on_log(self, args, state, control, logs=None, **kwargs):
logs["unet_lr"] = base_learning_rate
if model.train_text_encoder:
if args.text_encoder_learning_rate != args.unet_learning_rate:
- logs[
- "unet_lr"] = base_learning_rate * args.unet_learning_rate
- logs["text_encoder_lr"] = (base_learning_rate *
- args.text_encoder_learning_rate)
+ logs["unet_lr"] = base_learning_rate * args.unet_learning_rate
+ logs["text_encoder_lr"] = base_learning_rate * args.text_encoder_learning_rate
else:
logs["text_encoder_lr"] = base_learning_rate
@@ -127,11 +133,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of VisualDL's writer.add_scalar() "
- "is incorrect so we dropped this attribute.")
+ "is incorrect so we dropped this attribute."
+ )
# log images
for k, v in image_logs.items():
- self.vdl_writer.add_image(
- k, v, state.global_step, dataformats="NHWC")
+ self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC")
self.vdl_writer.flush()
@@ -172,8 +178,7 @@ def __init__(self, benchmark=True, profiler_options=None):
self.profiler_options = profiler_options
def on_train_begin(self, args, state, control, **kwargs):
- assert (args.gradient_accumulation_steps == 1 and not args.do_eval and
- not args.do_predict)
+ assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict
if self.benchmark:
self.reader_cost_avg = AverageStatistical()
@@ -198,8 +203,7 @@ def on_step_end(self, args, state, control, **kwargs):
def on_log(self, args, state, control, logs=None, **kwargs):
if self.benchmark:
if logs is not None and "interval_steps_per_second" in logs:
- self.batch_start = self.batch_start + (
- time.time() - self.maybe_log_save_evaluate_start)
+ self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start)
ips = logs["interval_steps_per_second"] * args.train_batch_size
avg_batch_cost = 1 / logs["interval_steps_per_second"]
logger.info(
@@ -211,14 +215,15 @@ def on_log(self, args, state, control, logs=None, **kwargs):
self.reader_cost_avg.get_average(),
avg_batch_cost,
args.train_batch_size,
- ips, ))
+ ips,
+ )
+ )
self.reader_cost_avg.reset()
def on_epoch_end(self, args, state, control, **kwargs):
if self.benchmark:
train_epoch_cost = time.time() - self.epoch_start
- logger.info("train epoch: %d, epoch_cost: %.5f s" %
- (state.epoch, train_epoch_cost))
+ logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost))
# register visualdl_with_image
@@ -232,7 +237,9 @@ def __init__(self, **kwargs):
self.add_callback(
BenchmarkCallback(
benchmark=self.args.benchmark,
- profiler_options=self.args.profiler_options, ))
+ profiler_options=self.args.profiler_options,
+ )
+ )
if self.args.benchmark:
if self.args.disable_tqdm:
self.pop_callback(PrinterCallback)
@@ -251,34 +258,27 @@ def get_train_dataloader(self):
self.train_dataset,
batch_size=self.args.train_batch_size,
num_workers=self.args.dataloader_num_workers,
- worker_init_fn=worker_init_fn, )
+ worker_init_fn=worker_init_fn,
+ )
else:
return super().get_train_dataloader()
- def _save(self,
- output_dir=None,
- state_dict=None,
- merge_tensor_parallel=False):
+ def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
if self.args.only_save_updated_model:
unwraped_model = unwrap_model(self.model)
logger.info(f"Saving unet checkpoint to {output_dir}/unet")
- unwraped_model.unet.save_pretrained(
- os.path.join(output_dir, "unet"))
+ unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"))
if unwraped_model.use_ema:
logger.info(f"Saving ema unet checkpoint to {output_dir}/unet")
with unwraped_model.ema_scope():
- unwraped_model.unet.save_pretrained(
- os.path.join(output_dir, "unet"), variant="ema")
+ unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema")
if unwraped_model.train_text_encoder:
- logger.info(
- f"Saving text encoder checkpoint to {output_dir}/text_encoder"
- )
- unwraped_model.text_encoder.save_pretrained(
- os.path.join(output_dir, "text_encoder"))
+ logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder")
+ unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder"))
else:
logger.info(f"Saving model checkpoint to {output_dir}")
if state_dict is None:
@@ -287,10 +287,10 @@ def _save(self,
state_dict,
os.path.join(
output_dir,
- _add_variant(PADDLE_WEIGHTS_NAME,
- self.args.weight_name_suffix), ), )
+ _add_variant(PADDLE_WEIGHTS_NAME, self.args.weight_name_suffix),
+ ),
+ )
if self.args.should_save:
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
- paddle.save(self.args,
- os.path.join(output_dir, TRAINING_ARGS_NAME))
+ paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
diff --git a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py
index 82d71e6c5f816..b41f0b799469f 100644
--- a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py
+++ b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py
@@ -46,8 +46,7 @@ def parse_src(filename):
elif data_source == "laion_aes":
text_json = json.loads(vec[2])
img_b64 = vec[5]
- caption = text_json.get("caption_en",
- text_json.get("blip_caption_en", ""))
+ caption = text_json.get("caption_en", text_json.get("blip_caption_en", ""))
else:
_, captions, _, _, _, img_b64 = vec[:6]
caption = random.sample(captions.split("|"), 1)[0].replace("\1", "")
@@ -77,23 +76,26 @@ def _get_param(self, img, output_size):
class TextImagePair(IterableDataset):
def __init__(
- self,
- file_list,
- size,
- num_records,
- image_processing=None,
- buffer_size=1000,
- shuffle_every_n_samples=5,
- interpolation="lanczos",
- tokenizer=None, ):
+ self,
+ file_list,
+ size,
+ num_records,
+ image_processing=None,
+ buffer_size=1000,
+ shuffle_every_n_samples=5,
+ interpolation="lanczos",
+ tokenizer=None,
+ ):
self.size = size
if image_processing is None:
- self.image_processing = transforms.Compose([
- transforms.Resize(int(size / 0.9), interpolation),
- RandomCrop(size),
- transforms.ToTensor(),
- transforms.Normalize(0.5, 0.5),
- ])
+ self.image_processing = transforms.Compose(
+ [
+ transforms.Resize(int(size / 0.9), interpolation),
+ RandomCrop(size),
+ transforms.ToTensor(),
+ transforms.Normalize(0.5, 0.5),
+ ]
+ )
else:
self.image_processing = image_processing
self.text_processing = lambda caption: tokenizer(
@@ -101,7 +103,8 @@ def __init__(
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids[0]
+ return_tensors="pd",
+ ).input_ids[0]
self.file_list = []
file_weights = []
with open(file_list, "r") as f:
@@ -122,19 +125,14 @@ def __init__(
file_weights = file_weights / file_weight_sum
print(f"sample weights of files: {file_weights}")
self.file_weights_cumsum = np.cumsum(file_weights)
- self.file_weights_cumsum = np.concatenate(
- [[0.0], self.file_weights_cumsum])
+ self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum])
else:
print("sample each file list with same probabiliy")
self.file_weights_cumsum = None
self.num_records = num_records
- self.file_ids = [
- np.arange(len(filelist)) for filelist in self.file_list
- ]
- print(
- f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}"
- )
+ self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list]
+ print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}")
self.buffer_size = buffer_size
self.shuffle_every_n_samples = shuffle_every_n_samples
@@ -143,9 +141,7 @@ def sample_loader(self, file_ids, filenames):
random.shuffle(file_ids)
for i in file_ids:
filename = filenames[i].strip("\n")
- with gzip.open(filename,
- "rb") if filename.endswith(".gz") else open(
- filename, "rb") as f:
+ with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f:
# retry = 0
while True:
line = f.readline()
@@ -171,19 +167,14 @@ def sample_loader(self, file_ids, filenames):
if w < self.size or h < self.size:
continue
yield {
- "pixel_values":
- self.image_processing(data["image"]),
- "input_ids":
- self.text_processing(data["caption"]),
+ "pixel_values": self.image_processing(data["image"]),
+ "input_ids": self.text_processing(data["caption"]),
}
def random_load_from_multi_dataset(self):
- print(
- f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}"
- )
+ print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}")
sample_loader_per_dataset = [
- iter(self.sample_loader(self.file_ids[i], self.file_list[i]))
- for i in range(len(self.file_ids))
+ iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids))
]
while True:
@@ -192,8 +183,7 @@ def random_load_from_multi_dataset(self):
else:
rand_num = random.random()
for i in range(len(self.file_list)):
- if (self.file_weights_cumsum[i] <= rand_num <
- self.file_weights_cumsum[i + 1]):
+ if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]:
break
sample_loader = sample_loader_per_dataset[i]
# debug
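The weighted dataset selection in random_load_from_multi_dataset above amounts to inverse-CDF sampling over the per-file-list weights. A self-contained sketch of that selection rule (the weights and names here are illustrative):

import random

import numpy as np

file_weights = np.array([2.0, 1.0, 1.0])
file_weights = file_weights / file_weights.sum()                  # [0.5, 0.25, 0.25]
cumsum = np.concatenate([[0.0], np.cumsum(file_weights)])         # [0.0, 0.5, 0.75, 1.0]

def pick_file_list() -> int:
    # Same rule as the loop above: find i with cumsum[i] <= r < cumsum[i + 1].
    r = random.random()
    for i in range(len(file_weights)):
        if cumsum[i] <= r < cumsum[i + 1]:
            return i
    return len(file_weights) - 1  # guard for floating-point edge cases

counts = [0, 0, 0]
for _ in range(10_000):
    counts[pick_file_list()] += 1
# counts end up roughly proportional to [0.5, 0.25, 0.25]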
diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py
index a50d56e2b5b11..a7afb1ddf6c41 100644
--- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py
+++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py
@@ -17,8 +17,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
-from diffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- UNet2DConditionModel)
+from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, UNet2DConditionModel
from transformers import AutoTokenizer, CLIPTextModel
from transformers.utils.logging import get_logger
@@ -35,9 +34,8 @@ def __init__(self, model, decay=0.9999, use_num_upates=True):
self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32))
self.register_buffer(
"num_updates",
- torch.tensor(
- 0, dtype=torch.int) if use_num_upates else torch.tensor(
- -1, dtype=torch.int), )
+ torch.tensor(0, dtype=torch.int) if use_num_upates else torch.tensor(-1, dtype=torch.int),
+ )
for name, p in model.named_parameters():
if p.requires_grad:
@@ -53,8 +51,7 @@ def forward(self, model):
if self.num_updates >= 0:
self.num_updates += 1
- decay = min(self.decay,
- (1 + self.num_updates) / (10 + self.num_updates))
+ decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
one_minus_decay = 1.0 - decay
@@ -65,10 +62,8 @@ def forward(self, model):
for key in m_param:
if m_param[key].requires_grad:
sname = self.m_name2s_name[key]
- shadow_params[sname] = shadow_params[sname].type_as(m_param[
- key])
- shadow_params[sname].sub_(
- one_minus_decay * (shadow_params[sname] - m_param[key]))
+ shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
+ shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
else:
assert key not in self.m_name2s_name
@@ -77,8 +72,7 @@ def copy_to(self, model):
shadow_params = dict(self.named_buffers())
for key in m_param:
if m_param[key].requires_grad:
- m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]]
- .data)
+ m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
else:
assert key not in self.m_name2s_name
@@ -89,9 +83,7 @@ def store(self, parameters):
parameters: Iterable of `torch.nn.Parameter`; the parameters to be
temporarily stored.
"""
- self.collected_params = [
- param.detach().cpu().clone() for param in parameters
- ]
+ self.collected_params = [param.detach().cpu().clone() for param in parameters]
def restore(self, parameters):
"""
@@ -113,19 +105,26 @@ class StableDiffusionModel(nn.Module):
def __init__(self, model_args):
super().__init__()
self.model_args = model_args
- tokenizer_name_or_path = (model_args.tokenizer_name
- if model_args.tokenizer_name is not None else
- model_args.pretrained_model_name_or_path)
- vae_name_or_path = (model_args.vae_name_or_path
- if model_args.vae_name_or_path is not None else
- model_args.pretrained_model_name_or_path)
+ tokenizer_name_or_path = (
+ model_args.tokenizer_name
+ if model_args.tokenizer_name is not None
+ else model_args.pretrained_model_name_or_path
+ )
+ vae_name_or_path = (
+ model_args.vae_name_or_path
+ if model_args.vae_name_or_path is not None
+ else model_args.pretrained_model_name_or_path
+ )
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.text_encoder_name_or_path is not None else
- model_args.pretrained_model_name_or_path)
- unet_name_or_path = (model_args.unet_name_or_path
- if model_args.unet_name_or_path is not None else
- model_args.pretrained_model_name_or_path)
+ if model_args.text_encoder_name_or_path is not None
+ else model_args.pretrained_model_name_or_path
+ )
+ unet_name_or_path = (
+ model_args.unet_name_or_path
+ if model_args.unet_name_or_path is not None
+ else model_args.pretrained_model_name_or_path
+ )
# init model and tokenizer
tokenizer_kwargs = {}
if model_args.model_max_length is not None:
@@ -134,14 +133,12 @@ def __init__(self, model_args):
tokenizer_name_or_path,
**tokenizer_kwargs,
subfolder="tokenizer",
- use_fast=False, )
- self.vae = AutoencoderKL.from_pretrained(
- vae_name_or_path, subfolder="vae")
- self.text_encoder = CLIPTextModel.from_pretrained(
- text_encoder_name_or_path, subfolder="text_encoder")
+ use_fast=False,
+ )
+ self.vae = AutoencoderKL.from_pretrained(vae_name_or_path, subfolder="vae")
+ self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path, subfolder="text_encoder")
try:
- self.unet = UNet2DConditionModel.from_pretrained(
- unet_name_or_path, subfolder="unet")
+ self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path, subfolder="unet")
except Exception:
self.unet = UNet2DConditionModel.from_config(unet_name_or_path)
logger.info("Init unet model from scratch!")
@@ -166,9 +163,9 @@ def __init__(self, model_args):
beta_end=0.012,
beta_schedule="scaled_linear",
num_train_timesteps=1000,
- prediction_type=self.model_args.prediction_type, )
- self.register_buffer("alphas_cumprod",
- self.noise_scheduler.alphas_cumprod)
+ prediction_type=self.model_args.prediction_type,
+ )
+ self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod)
self.eval_scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
@@ -177,7 +174,8 @@ def __init__(self, model_args):
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1,
- prediction_type=self.model_args.prediction_type, )
+ prediction_type=self.model_args.prediction_type,
+ )
self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps)
self.use_ema = False
self.model_ema = None
@@ -187,25 +185,22 @@ def compute_snr(self, timesteps):
Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
"""
sqrt_alphas_cumprod = self.alphas_cumprod**0.5
- sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod)**0.5
+ sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5
# Expand the tensors.
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
- sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(
- device=timesteps.device)[timesteps].float()
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
- device=timesteps.device)[timesteps].float()
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[...,
- None]
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
# Compute SNR.
- snr = (alpha / sigma)**2
+ snr = (alpha / sigma) ** 2
return snr
def forward(self, input_ids=None, pixel_values=None, **kwargs):
@@ -220,17 +215,18 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
noise += self.model_args.noise_offset * torch.randn(
(latents.shape[0], latents.shape[1], 1, 1),
dtype=noise.dtype,
- device=noise.device, )
+ device=noise.device,
+ )
if self.model_args.input_perturbation:
- new_noise = noise + self.model_args.input_perturbation * torch.randn_like(
- noise)
+ new_noise = noise + self.model_args.input_perturbation * torch.randn_like(noise)
timesteps = torch.randint(
0,
self.noise_scheduler.config.num_train_timesteps,
- (latents.shape[0], ),
+ (latents.shape[0],),
dtype=torch.long,
- device=latents.device, )
+ device=latents.device,
+ )
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
if self.model_args.input_perturbation:
@@ -239,15 +235,15 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
noisy_latents = self.add_noise(latents, noise, timesteps)
# text encode
- encoder_hidden_states = self.text_encoder(
- input_ids, return_dict=False)[0]
+ encoder_hidden_states = self.text_encoder(input_ids, return_dict=False)[0]
# unet
model_pred = self.unet(
sample=noisy_latents,
timestep=timesteps,
encoder_hidden_states=encoder_hidden_states,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# Get the target for loss depending on the prediction type
if self.model_args.prediction_type == "epsilon":
@@ -255,62 +251,53 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
elif self.model_args.prediction_type == "v_prediction":
target = self.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {self.model_args.prediction_type}")
+ raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}")
# compute loss
if self.model_args.snr_gamma is None:
- loss = (F.mse_loss(
- model_pred.float(), target.float(), reduction="none")
- .mean([1, 2, 3]).mean())
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
else:
# Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
# Since we predict the noise instead of x_0, the original formulation is slightly changed.
# This is discussed in Section 4.2 of the same paper.
snr = self.compute_snr(timesteps)
- mse_loss_weights = (torch.stack(
- [snr, self.model_args.snr_gamma * torch.ones_like(timesteps)],
- dim=1).min(dim=1)[0] / snr)
+ mse_loss_weights = (
+ torch.stack([snr, self.model_args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+ )
# We first calculate the original loss. Then we mean over the non-batch dimensions and
# rebalance the sample-wise losses with their respective loss weights.
# Finally, we take the mean of the rebalanced loss.
- loss = F.mse_loss(
- model_pred.float(), target.float(), reduction="none")
- loss = loss.mean(
- dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
loss = loss.mean()
return loss
def add_noise(
- self,
- original_samples: torch.Tensor,
- noise: torch.Tensor,
- timesteps: torch.Tensor, ) -> torch.Tensor:
- sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+ self,
+ original_samples: torch.Tensor,
+ noise: torch.Tensor,
+ timesteps: torch.Tensor,
+ ) -> torch.Tensor:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
- def get_velocity(self,
- sample: torch.Tensor,
- noise: torch.Tensor,
- timesteps: torch.Tensor) -> torch.Tensor:
- sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+ def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(sample.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
@@ -350,20 +337,19 @@ def decode_image(self, pixel_values=None, max_batch=8, **kwargs):
@torch.no_grad()
def log_image(
- self,
- input_ids=None,
- height=256,
- width=256,
- eta=0.0,
- guidance_scale=7.5,
- max_batch=8,
- **kwargs, ):
+ self,
+ input_ids=None,
+ height=256,
+ width=256,
+ eta=0.0,
+ guidance_scale=7.5,
+ max_batch=8,
+ **kwargs,
+ ):
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log max_batch image
if input_ids.shape[0] > max_batch:
input_ids = input_ids[:max_batch]
@@ -376,44 +362,40 @@ def log_image(
padding="max_length",
truncation=True,
max_length=max_length,
- return_tensors="pt", )
+ return_tensors="pt",
+ )
uncond_embeddings = self.text_encoder(
uncond_input.input_ids.to(device=input_ids.device),
- return_dict=False, )[0]
- text_embeddings = torch.cat(
- [uncond_embeddings, text_embeddings], dim=0)
-
- latents = torch.randn((
- input_ids.shape[0],
- self.unet.config.in_channels,
- height // 8,
- width // 8, )).to(device=input_ids.device)
+ return_dict=False,
+ )[0]
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings], dim=0)
+
+ latents = torch.randn(
+ (
+ input_ids.shape[0],
+ self.unet.config.in_channels,
+ height // 8,
+ width // 8,
+ )
+ ).to(device=input_ids.device)
latents = latents * self.eval_scheduler.init_noise_sigma
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for t in self.eval_scheduler.timesteps:
- latent_model_input = (torch.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = torch.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=text_embeddings,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- latents = self.eval_scheduler.step(
- noise_pred,
- t,
- latents,
- **extra_step_kwargs,
- return_dict=False)[0]
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1).permute(0, 2, 3, 1) * 255.0
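Aside on the sampling loop reformatted in the hunk above: classifier-free guidance runs the UNet once on a doubled latent batch (unconditional embeddings stacked with the text embeddings) and then extrapolates between the two predictions. A minimal sketch of that combine step, matching the chunk/extrapolate lines in the hunk:

import torch

def apply_cfg(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # First half of the batch holds the unconditional prediction, second half the text-conditional one.
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    # Push the result away from the unconditional prediction toward the conditional one.
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)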
@@ -422,8 +404,7 @@ def log_image(
def set_recompute(self, use_recompute=False):
if use_recompute:
self.unet.enable_gradient_checkpointing()
- if self.model_args.train_text_encoder and hasattr(
- self.text_encoder, "gradient_checkpointing_enable"):
+ if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"):
self.text_encoder.gradient_checkpointing_enable()
def gradient_checkpointing_enable(self):
@@ -433,17 +414,15 @@ def set_xformers(self, use_xformers=False):
if use_xformers:
try:
self.unet.enable_xformers_memory_efficient_attention()
- if hasattr(self.vae,
- "enable_xformers_memory_efficient_attention"):
+ if hasattr(self.vae, "enable_xformers_memory_efficient_attention"):
self.vae.enable_xformers_memory_efficient_attention()
- if hasattr(self.text_encoder,
- "enable_xformers_memory_efficient_attention"):
- self.text_encoder.enable_xformers_memory_efficient_attention(
- )
+ if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"):
+ self.text_encoder.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop torchtorch is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
def set_ema(self, use_ema=False):
self.use_ema = use_ema
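For readers skimming the loss hunk above: the snr_gamma branch implements Min-SNR loss weighting (https://arxiv.org/abs/2303.09556), rescaling each sample's MSE by min(SNR(t), gamma) / SNR(t). A minimal PyTorch sketch of that weighting, where snr stands in for the value returned by self.compute_snr(timesteps):

import torch
import torch.nn.functional as F

def min_snr_weighted_mse(model_pred, target, snr, snr_gamma=5.0):
    # Per-sample MSE, averaged over all non-batch dimensions.
    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
    loss = loss.mean(dim=list(range(1, loss.ndim)))
    # Clamp the weight at snr_gamma; equivalent to the stack(...).min(dim=1)[0] / snr form in the hunk.
    weights = torch.minimum(snr, torch.full_like(snr, snr_gamma)) / snr
    return (loss * weights).mean()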
diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py
index 4efe98bed8a65..b49d994418a77 100644
--- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py
+++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py
@@ -46,63 +46,58 @@ def str2bool(v):
if not str2bool(os.getenv("FLAG_SDP", "True")):
if hasattr(torch.nn.functional, "scaled_dot_product_attention"):
- torch.nn.functional.scaled_dot_product_attention_ = (
- torch.nn.functional.scaled_dot_product_attention)
+ torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
del torch.nn.functional.scaled_dot_product_attention
- print(
- "Removed `torch.nn.functional.scaled_dot_product_attention`, we will use default attention implement."
- )
+ print("Removed `torch.nn.functional.scaled_dot_product_attention`, we will use default attention implement.")
@dataclass
class SDTrainingArguments(TrainingArguments):
- image_logging_steps: int = field(
- default=1000, metadata={"help": "Log image every X steps."})
+ image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."})
recompute: bool = field(
default=False,
- metadata={"help": "Whether or not run recompute."}, )
+ metadata={"help": "Whether or not run recompute."},
+ )
benchmark: bool = field(
default=False,
- metadata={"help": "Whether or not run benchmark."}, )
+ metadata={"help": "Whether or not run benchmark."},
+ )
report_to: Optional[List[str]] = field(
default_factory=lambda: ["custom_visualdl"],
- metadata={
- "help":
- "The list of integrations to report the results and logs to."
- }, )
+ metadata={"help": "The list of integrations to report the results and logs to."},
+ )
resolution: int = field(
default=512,
metadata={
- "help":
- "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
- }, )
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
+ },
+ )
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable_xformers_memory_efficient_attention."})
+ default=False, metadata={"help": "enable_xformers_memory_efficient_attention."}
+ )
only_save_updated_model: bool = field(
- default=True,
- metadata={"help": "Whether or not save only_save_updated_model"})
+ default=True, metadata={"help": "Whether or not save only_save_updated_model"}
+ )
log_level: str = field(
default="info",
- metadata={"help": "log_level."}, )
+ metadata={"help": "log_level."},
+ )
def __post_init__(self):
super().__post_init__()
self.image_logging_steps = (
- (math.ceil(self.image_logging_steps / self.logging_steps) *
- self.logging_steps) if self.image_logging_steps > 0 else -1)
- self.use_ema = str2bool(os.getenv("FLAG_USE_EMA",
- "False")) or self.use_ema
+ (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps)
+ if self.image_logging_steps > 0
+ else -1
+ )
+ self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema
self.enable_xformers_memory_efficient_attention = (
- str2bool(os.getenv("FLAG_XFORMERS", "False")) or
- self.enable_xformers_memory_efficient_attention)
- self.recompute = (str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or
- self.recompute)
+ str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention
+ )
+ self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute
self.gradient_checkpointing = self.gradient_checkpointing or self.recompute
- self.benchmark = (str2bool(os.getenv("FLAG_BENCHMARK", "False")) or
- self.benchmark)
+ self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark
def print_config(self, args=None, key=""):
"""
@@ -115,8 +110,7 @@ def print_config(self, args=None, key=""):
logger.info("{:^40}".format("{} Configuration Arguments".format(key)))
logger.info("{:30}: {}".format("torch version", torch.__version__))
- logger.info("{:30}: {}".format("torch commit id",
- torch.version.git_version))
+ logger.info("{:30}: {}".format("torch commit id", torch.version.git_version))
for a in dir(args):
if a[:2] != "__": # don't print double underscore methods
@@ -129,45 +123,34 @@ def print_config(self, args=None, key=""):
@dataclass
class SDModelArguments:
- vae_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "vae_name_or_path"})
- text_encoder_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "text_encoder_name_or_path"})
- unet_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "unet_name_or_path"})
+ vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"})
+ text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"})
+ unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"})
tokenizer_name: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"
- }, )
+ metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"},
+ )
pretrained_model_name_or_path: str = field(
default="CompVis/stable-diffusion-v1-4",
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
- model_max_length: int = field(
- default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
+ model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
prediction_type: str = field(
default="epsilon",
metadata={
- "help":
- "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)"
- }, )
- num_inference_steps: int = field(
- default=50, metadata={"help": "num_inference_steps"})
- train_text_encoder: bool = field(
- default=False, metadata={"help": "Whether or not train text encoder"})
-
- noise_offset: float = field(
- default=0, metadata={"help": "The scale of noise offset."})
+ "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)"
+ },
+ )
+ num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"})
+ train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"})
+
+ noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."})
snr_gamma: Optional[float] = field(
default=None,
metadata={
- "help":
- "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556."
- }, )
+ "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556."
+ },
+ )
input_perturbation: Optional[float] = field(
default=0,
metadata={"help": "The scale of input perturbation. Recommended 0.1."},
@@ -182,14 +165,18 @@ class SDDataArguments:
file_list: str = field(
default="./data/filelist/train.filelist.list",
- metadata={"help": "The name of the file_list."}, )
+ metadata={"help": "The name of the file_list."},
+ )
num_records: int = field(default=10000000, metadata={"help": "num_records"})
buffer_size: int = field(
default=100,
- metadata={"help": "Buffer size"}, )
+ metadata={"help": "Buffer size"},
+ )
shuffle_every_n_samples: int = field(
default=5,
- metadata={"help": "shuffle_every_n_samples."}, )
+ metadata={"help": "shuffle_every_n_samples."},
+ )
interpolation: str = field(
default="lanczos",
- metadata={"help": "interpolation method"}, )
+ metadata={"help": "interpolation method"},
+ )
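The __post_init__ reformatted above lets environment variables (FLAG_USE_EMA, FLAG_XFORMERS, FLAG_RECOMPUTE, FLAG_BENCHMARK) force-enable the corresponding options and rounds image_logging_steps up to a multiple of logging_steps, so image logging always coincides with a regular logging step. A condensed sketch of the pattern; the body of str2bool is not shown in this hunk, so the version below is an assumption:

import math
import os

def str2bool(v):  # assumed helper: treat common truthy spellings as True
    return str(v).lower() in ("yes", "true", "t", "y", "1")

def resolve_training_flags(use_ema, image_logging_steps, logging_steps):
    # An env flag set to a truthy value wins over the CLI/dataclass default.
    use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or use_ema
    # Round up so image logging lands on a logging step; -1 disables it.
    if image_logging_steps > 0:
        image_logging_steps = math.ceil(image_logging_steps / logging_steps) * logging_steps
    else:
        image_logging_steps = -1
    return use_ema, image_logging_steps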
diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py
index 6420971caadf8..5338a0c72d142 100644
--- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py
+++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py
@@ -29,12 +29,13 @@
def on_log(
- self,
- args: TrainingArguments,
- state: TrainerState,
- control: TrainerControl,
- logs,
- **kwargs, ):
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ logs,
+ **kwargs,
+):
control.should_log = False
return self.call_event("on_log", args, state, control, logs=logs, **kwargs)
@@ -64,9 +65,7 @@ def __init__(self, vdl_writer=None):
visualdl
has_visualdl = False
if not has_visualdl:
- raise RuntimeError(
- "VisualDLWithImageCallback requires visualdl to be installed. Please install visualdl."
- )
+ raise RuntimeError("VisualDLWithImageCallback requires visualdl to be installed. Please install visualdl.")
if has_visualdl:
try:
from visualdl import LogWriter
@@ -81,8 +80,7 @@ def __init__(self, vdl_writer=None):
def on_step_end(self, args, state, control, model=None, **kwargs):
if hasattr(model, "on_train_batch_end"):
model.on_train_batch_end()
- if (args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0:
control.should_log = True
def _init_summary_writer(self, args, log_dir=None):
@@ -108,34 +106,38 @@ def on_train_begin(self, args, state, control, **kwargs):
self.vdl_writer.add_text("model_config", model_config_json)
if hasattr(self.vdl_writer, "add_hparams"):
- self.vdl_writer.add_hparams(
- args.to_sanitized_dict(), metrics_list=[])
+ self.vdl_writer.add_hparams(args.to_sanitized_dict(), metrics_list=[])
def on_log(self, args, state, control, logs=None, **kwargs):
# log image on each node
inputs = kwargs.get("inputs", None)
model = kwargs.get("model", None)
image_logs = {}
- if (inputs is not None and model is not None and
- args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if (
+ inputs is not None
+ and model is not None
+ and args.image_logging_steps > 0
+ and state.global_step % args.image_logging_steps == 0
+ ):
max_batch = 4 if args.resolution > 256 else 8
image_logs["reconstruction"] = model.decode_image(
- pixel_values=inputs["pixel_values"].to(args.device),
- max_batch=max_batch)
+ pixel_values=inputs["pixel_values"].to(args.device), max_batch=max_batch
+ )
image_logs["ddim-samples-1.0"] = model.log_image(
input_ids=inputs["input_ids"].to(args.device),
guidance_scale=1.0,
height=args.resolution,
width=args.resolution,
- max_batch=max_batch, )
+ max_batch=max_batch,
+ )
image_logs["ddim-samples-7.5"] = model.log_image(
input_ids=inputs["input_ids"].to(args.device),
guidance_scale=7.5,
height=args.resolution,
width=args.resolution,
- max_batch=max_batch, )
+ max_batch=max_batch,
+ )
if not state.is_world_process_zero:
return
@@ -153,11 +155,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of VisualDL's writer.add_scalar() "
- "is incorrect so we dropped this attribute.")
+ "is incorrect so we dropped this attribute."
+ )
# log images
for k, v in image_logs.items():
- self.vdl_writer.add_image(
- k, v, state.global_step, dataformats="NHWC")
+ self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC")
self.vdl_writer.flush()
def on_train_end(self, args, state, control, **kwargs):
@@ -202,8 +204,7 @@ def __init__(self, benchmark=True, **kwargs):
self.benchmark = benchmark
def on_train_begin(self, args, state, control, **kwargs):
- assert (args.gradient_accumulation_steps == 1 and not args.do_eval and
- not args.do_predict)
+ assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict
if self.benchmark:
self.reader_cost_avg = AverageStatistical()
@@ -225,8 +226,7 @@ def on_step_end(self, args, state, control, **kwargs):
def on_log(self, args, state, control, logs=None, **kwargs):
if self.benchmark:
if logs is not None and "interval_steps_per_second" in logs:
- self.batch_start = self.batch_start + (
- time.time() - self.maybe_log_save_evaluate_start)
+ self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start)
ips = logs["interval_steps_per_second"] * args.train_batch_size
avg_batch_cost = 1 / logs["interval_steps_per_second"]
logger.info(
@@ -238,14 +238,15 @@ def on_log(self, args, state, control, logs=None, **kwargs):
self.reader_cost_avg.get_average(),
avg_batch_cost,
args.train_batch_size,
- ips, ))
+ ips,
+ )
+ )
self.reader_cost_avg.reset()
def on_epoch_end(self, args, state, control, **kwargs):
if self.benchmark:
train_epoch_cost = time.time() - self.epoch_start
- logger.info("train epoch: %d, epoch_cost: %.5f s" %
- (state.epoch, train_epoch_cost))
+ logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost))
# register visualdl_with_image
@@ -280,22 +281,22 @@ def get_train_dataloader(self):
self.train_dataset,
batch_size=self._train_batch_size,
num_workers=self.args.dataloader_num_workers,
- worker_init_fn=None
- if self.args.world_size <= 1 else worker_init_fn, )
+ worker_init_fn=None if self.args.world_size <= 1 else worker_init_fn,
+ )
else:
return super().get_train_dataloader()
def _inner_training_loop(
- self,
- batch_size=None,
- args=None,
- resume_from_checkpoint=None,
- trial=None,
- ignore_keys_for_eval=None, ):
+ self,
+ batch_size=None,
+ args=None,
+ resume_from_checkpoint=None,
+ trial=None,
+ ignore_keys_for_eval=None,
+ ):
self.accelerator.free_memory()
self._train_batch_size = batch_size
- logger.debug(
- f"Currently training with a batch size of: {self._train_batch_size}")
+ logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
# Data loader and number of training steps
train_dataloader = self.get_train_dataloader()
@@ -303,32 +304,27 @@ def _inner_training_loop(
# number of training epochs: num_train_epochs
# number of training steps per epoch: num_update_steps_per_epoch
# total number of training steps to execute: max_steps
- total_train_batch_size = (args.train_batch_size *
- args.gradient_accumulation_steps *
- args.world_size)
+ total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size
len_dataloader = None
if has_length(train_dataloader):
len_dataloader = len(train_dataloader)
- num_update_steps_per_epoch = (len_dataloader //
- args.gradient_accumulation_steps)
+ num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps
num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
num_examples = self.num_examples(train_dataloader)
if args.max_steps > 0:
max_steps = args.max_steps
num_train_epochs = args.max_steps // num_update_steps_per_epoch + int(
- args.max_steps % num_update_steps_per_epoch > 0)
+ args.max_steps % num_update_steps_per_epoch > 0
+ )
# May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
# the best we can do.
num_train_samples = args.max_steps * total_train_batch_size
else:
- max_steps = math.ceil(args.num_train_epochs *
- num_update_steps_per_epoch)
+ max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
num_train_epochs = math.ceil(args.num_train_epochs)
- num_train_samples = (self.num_examples(train_dataloader) *
- args.num_train_epochs)
- elif (args.max_steps >
- 0): # Rely on max_steps when dataloader does not have a working size
+ num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs
+ elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size
max_steps = args.max_steps
# Setting a very large number of epochs so we go as many times as necessary over the iterator.
num_train_epochs = sys.maxsize
@@ -338,7 +334,8 @@ def _inner_training_loop(
else:
raise ValueError(
"args.max_steps must be set to a positive value if dataloader does not have a length, was"
- f" {args.max_steps}")
+ f" {args.max_steps}"
+ )
# Compute absolute values for logging, eval, and save if given as ratio
if args.logging_steps and args.logging_steps < 1:
@@ -354,18 +351,20 @@ def _inner_training_loop(
# references registered here no longer work on other gpus, breaking the module
raise ValueError(
"Currently --debug underflow_overflow is not supported under DP. Please use DDP"
- " (torch.distributed.launch).")
+ " (torch.distributed.launch)."
+ )
else:
debug_overflow = DebugUnderflowOverflow(self.model) # noqa
delay_optimizer_creation = (
- self.sharded_ddp is not None and
- self.sharded_ddp != ShardedDDPOption.SIMPLE or
- is_sagemaker_mp_enabled() or self.fsdp is not None)
+ self.sharded_ddp is not None
+ and self.sharded_ddp != ShardedDDPOption.SIMPLE
+ or is_sagemaker_mp_enabled()
+ or self.fsdp is not None
+ )
if self.is_deepspeed_enabled:
- self.optimizer, self.lr_scheduler = deepspeed_init(
- self, num_training_steps=max_steps)
+ self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
if not delay_optimizer_creation:
self.create_optimizer_and_scheduler(num_training_steps=max_steps)
@@ -396,12 +395,12 @@ def _inner_training_loop(
if self.use_apex:
model = self.accelerator.prepare(self.model)
else:
- model, self.optimizer = self.accelerator.prepare(
- self.model, self.optimizer)
+ model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
else:
# to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
- self.model, self.optimizer, self.lr_scheduler)
+ self.model, self.optimizer, self.lr_scheduler
+ )
if self.is_fsdp_enabled:
self.model = model
@@ -417,8 +416,7 @@ def _inner_training_loop(
self._globalstep_last_start_time = time.time()
# deepspeed ckpt loading
if resume_from_checkpoint is not None and self.is_deepspeed_enabled:
- deepspeed_load_checkpoint(self.model_wrapped,
- resume_from_checkpoint)
+ deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint)
# Check if saved optimizer or scheduler states exist
self._load_optimizer_and_scheduler(resume_from_checkpoint)
@@ -431,19 +429,11 @@ def _inner_training_loop(
logger.info("***** Running training *****")
logger.info(f" Num examples = {num_examples:,}")
logger.info(f" Num Epochs = {num_train_epochs:,}")
- logger.info(
- f" Instantaneous batch size per device = {self._train_batch_size:,}"
- )
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}"
- )
+ logger.info(f" Instantaneous batch size per device = {self._train_batch_size:,}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {max_steps:,}")
- logger.info(
- f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}"
- )
+ logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")
self.state.epoch = 0
start_time = time.time()
@@ -453,24 +443,19 @@ def _inner_training_loop(
# Check if continuing training from a checkpoint
if resume_from_checkpoint is not None and os.path.isfile(
- os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)):
- self.state = TrainerState.load_from_json(
- os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
+ os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
+ ):
+ self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
epochs_trained = self.state.global_step // num_update_steps_per_epoch
if not args.ignore_data_skip:
- steps_trained_in_current_epoch = self.state.global_step % (
- num_update_steps_per_epoch)
+ steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
steps_trained_in_current_epoch *= args.gradient_accumulation_steps
else:
steps_trained_in_current_epoch = 0
- logger.info(
- " Continuing training from checkpoint, will skip to saved global_step"
- )
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(f" Continuing training from epoch {epochs_trained}")
- logger.info(
- f" Continuing training from global step {self.state.global_step}"
- )
+ logger.info(f" Continuing training from global step {self.state.global_step}")
if not args.ignore_data_skip:
if skip_first_batches is None:
logger.info(
@@ -478,18 +463,16 @@ def _inner_training_loop(
f" {steps_trained_in_current_epoch} batches in the first epoch. If this takes a lot of time,"
" you can install the latest version of Accelerate with `pip install -U accelerate`.You can"
" also add the `--ignore_data_skip` flag to your launch command, but you will resume the"
- " training on data already seen by your model.")
+ " training on data already seen by your model."
+ )
else:
logger.info(
f" Will skip the first {epochs_trained} epochs then the first"
f" {steps_trained_in_current_epoch} batches in the first epoch."
)
- if (self.is_local_process_zero() and not args.disable_tqdm and
- skip_first_batches is None):
- steps_trained_progress_bar = tqdm(
- total=steps_trained_in_current_epoch)
- steps_trained_progress_bar.set_description(
- "Skipping the first batches")
+ if self.is_local_process_zero() and not args.disable_tqdm and skip_first_batches is None:
+ steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch)
+ steps_trained_progress_bar.set_description("Skipping the first batches")
# Update the references
self.callback_handler.model = self.model
@@ -501,9 +484,7 @@ def _inner_training_loop(
# parameter to Train when using DDP.
self.state.trial_name = self.hp_name(self._trial)
if trial is not None:
- assignments = (trial.assignments
- if self.hp_search_backend == HPSearchBackend.SIGOPT
- else trial)
+ assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial
self.state.trial_params = hp_params(assignments)
else:
self.state.trial_params = None
@@ -521,15 +502,14 @@ def _inner_training_loop(
self._globalstep_last_logged = self.state.global_step
model.zero_grad()
- self.control = self.callback_handler.on_train_begin(args, self.state,
- self.control)
+ self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
# Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
if not args.ignore_data_skip:
for epoch in range(epochs_trained):
- is_random_sampler = hasattr(
- train_dataloader, "sampler") and isinstance(
- train_dataloader.sampler, RandomSampler)
+ is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance(
+ train_dataloader.sampler, RandomSampler
+ )
if is_torch_less_than_1_11 or not is_random_sampler:
# We just need to begin an iteration to create the randomization of the sampler.
# That was before PyTorch 1.11 however...
@@ -542,17 +522,13 @@ def _inner_training_loop(
total_batched_samples = 0
for epoch in range(epochs_trained, num_train_epochs):
- if isinstance(train_dataloader, DataLoader) and isinstance(
- train_dataloader.sampler, DistributedSampler):
+ if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
train_dataloader.sampler.set_epoch(epoch)
- elif hasattr(train_dataloader, "dataset") and isinstance(
- train_dataloader.dataset, IterableDatasetShard):
+ elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard):
train_dataloader.dataset.set_epoch(epoch)
if is_torch_tpu_available():
- parallel_loader = pl.ParallelLoader(
- train_dataloader,
- [args.device]).per_device_loader(args.device)
+ parallel_loader = pl.ParallelLoader(train_dataloader, [args.device]).per_device_loader(args.device)
epoch_iterator = parallel_loader
else:
epoch_iterator = train_dataloader
@@ -561,22 +537,20 @@ def _inner_training_loop(
if args.past_index >= 0:
self._past = None
- steps_in_epoch = (len(epoch_iterator)
- if len_dataloader is not None else
- args.max_steps * args.gradient_accumulation_steps)
- self.control = self.callback_handler.on_epoch_begin(
- args, self.state, self.control)
+ steps_in_epoch = (
+ len(epoch_iterator)
+ if len_dataloader is not None
+ else args.max_steps * args.gradient_accumulation_steps
+ )
+ self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
- if (epoch == epochs_trained and
- resume_from_checkpoint is not None and
- steps_trained_in_current_epoch == 0):
+ if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0:
self._load_rng_state(resume_from_checkpoint)
rng_to_sync = False
steps_skipped = 0
if skip_first_batches is not None and steps_trained_in_current_epoch > 0:
- epoch_iterator = skip_first_batches(
- epoch_iterator, steps_trained_in_current_epoch)
+ epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch)
steps_skipped = steps_trained_in_current_epoch
steps_trained_in_current_epoch = 0
rng_to_sync = True
@@ -601,18 +575,18 @@ def _inner_training_loop(
steps_trained_progress_bar = None
if step % args.gradient_accumulation_steps == 0:
- self.control = self.callback_handler.on_step_begin(
- args, self.state, self.control)
+ self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
with self.accelerator.accumulate(model):
tr_loss_step = self.training_step(model, inputs)
- if (args.logging_nan_inf_filter and
- not is_torch_tpu_available() and
- (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))):
+ if (
+ args.logging_nan_inf_filter
+ and not is_torch_tpu_available()
+ and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
+ ):
# if loss is nan or inf simply add the average of previous logged losses
- tr_loss += tr_loss / (1 + self.state.global_step -
- self._globalstep_last_logged)
+ tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
else:
tr_loss += tr_loss_step
@@ -622,9 +596,10 @@ def _inner_training_loop(
# the `or` condition of `steps_in_epoch <= args.gradient_accumulation_steps` is not covered
# in accelerate
if total_batched_samples % args.gradient_accumulation_steps == 0 or (
- # last step in epoch but step is always smaller than gradient_accumulation_steps
- steps_in_epoch <= args.gradient_accumulation_steps and
- (step + 1) == steps_in_epoch):
+ # last step in epoch but step is always smaller than gradient_accumulation_steps
+ steps_in_epoch <= args.gradient_accumulation_steps
+ and (step + 1) == steps_in_epoch
+ ):
# Gradient clipping
if args.max_grad_norm is not None and args.max_grad_norm > 0:
# deepspeed does its own clipping
@@ -633,10 +608,7 @@ def _inner_training_loop(
# Reduce gradients first for XLA
if is_torch_tpu_available():
gradients = xm._fetch_gradients(self.optimizer)
- xm.all_reduce(
- "sum",
- gradients,
- scale=1.0 / xm.xrt_world_size())
+ xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
# AMP: gradients need unscaling
self.scaler.unscale_(self.optimizer)
@@ -652,11 +624,13 @@ def _inner_training_loop(
# Revert to normal clipping otherwise, handling Apex or full precision
nn.utils.clip_grad_norm_(
amp.master_params(self.optimizer),
- args.max_grad_norm, )
+ args.max_grad_norm,
+ )
else:
self.accelerator.clip_grad_norm_(
model.parameters(),
- args.max_grad_norm, )
+ args.max_grad_norm,
+ )
# Optimizer step
optimizer_was_run = True
@@ -674,22 +648,20 @@ def _inner_training_loop(
optimizer_was_run = scale_before <= scale_after
else:
self.optimizer.step()
- optimizer_was_run = (
- not self.accelerator.optimizer_step_was_skipped)
+ optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
if optimizer_was_run:
# Delay optimizer scheduling until metrics are generated
if not isinstance(
- self.lr_scheduler,
- torch.optim.lr_scheduler.ReduceLROnPlateau, ):
+ self.lr_scheduler,
+ torch.optim.lr_scheduler.ReduceLROnPlateau,
+ ):
self.lr_scheduler.step()
model.zero_grad()
self.state.global_step += 1
- self.state.epoch = (
- epoch + (step + 1 + steps_skipped) / steps_in_epoch)
- self.control = self.callback_handler.on_step_end(
- args, self.state, self.control)
+ self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
+ self.control = self.callback_handler.on_step_end(args, self.state, self.control)
self._maybe_log_save_evaluate(
tr_loss,
@@ -697,10 +669,10 @@ def _inner_training_loop(
trial,
epoch,
ignore_keys_for_eval,
- inputs=inputs, )
+ inputs=inputs,
+ )
else:
- self.control = self.callback_handler.on_substep_end(
- args, self.state, self.control)
+ self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
if self.control.should_epoch_stop or self.control.should_training_stop:
break
@@ -712,15 +684,8 @@ def _inner_training_loop(
)
self.control.should_training_stop = True
- self.control = self.callback_handler.on_epoch_end(args, self.state,
- self.control)
- self._maybe_log_save_evaluate(
- tr_loss,
- model,
- trial,
- epoch,
- ignore_keys_for_eval,
- inputs=inputs)
+ self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
+ self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval, inputs=inputs)
if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
if is_torch_tpu_available():
@@ -738,9 +703,7 @@ def _inner_training_loop(
# Clean the state at the end of training
delattr(self, "_past")
- logger.info(
- "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
- )
+ logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
            # Wait for everyone to get here so we are sure the model has been saved by process 0.
if is_torch_tpu_available():
@@ -760,7 +723,8 @@ def _inner_training_loop(
"train",
start_time,
num_samples=num_train_samples,
- num_steps=self.state.max_steps, )
+ num_steps=self.state.max_steps,
+ )
self.store_flos()
metrics["total_flos"] = self.state.total_flos
metrics["train_loss"] = train_loss
@@ -772,27 +736,20 @@ def _inner_training_loop(
self.log(metrics)
run_dir = self._get_output_dir(trial)
- checkpoints_sorted = self._sorted_checkpoints(
- use_mtime=False, output_dir=run_dir)
+ checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
# Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
- if (self.args.should_save and
- self.state.best_model_checkpoint is not None and
- self.args.save_total_limit == 1):
+ if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
for checkpoint in checkpoints_sorted:
if checkpoint != self.state.best_model_checkpoint:
- logger.info(
- f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit"
- )
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
shutil.rmtree(checkpoint)
- self.control = self.callback_handler.on_train_end(args, self.state,
- self.control)
+ self.control = self.callback_handler.on_train_end(args, self.state, self.control)
return TrainOutput(self.state.global_step, train_loss, metrics)
- def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch,
- ignore_keys_for_eval, **kwargs):
+ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval, **kwargs):
if self.control.should_log:
if is_torch_tpu_available():
xm.mark_step()
@@ -806,15 +763,15 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch,
tr_loss -= tr_loss
logs["loss"] = round(
- tr_loss_scalar /
- (self.state.global_step - self._globalstep_last_logged),
- 4, )
+ tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged),
+ 4,
+ )
logs["learning_rate"] = self._get_learning_rate()
logs["global_step"] = int(self.state.global_step)
- total_train_batch_size = (self.args.train_batch_size *
- self.args.gradient_accumulation_steps *
- self.args.world_size)
+ total_train_batch_size = (
+ self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.world_size
+ )
num_steps = self.state.global_step - self._globalstep_last_logged
self.store_flos()
logs.update(
@@ -822,7 +779,9 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch,
"interval",
self._globalstep_last_start_time,
num_samples=total_train_batch_size * num_steps,
- num_steps=num_steps, ))
+ num_steps=num_steps,
+ )
+ )
self._total_loss_scalar += tr_loss_scalar
self._globalstep_last_logged = self.state.global_step
@@ -834,20 +793,19 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch,
if self.control.should_evaluate:
if isinstance(self.eval_dataset, dict):
metrics = {}
- for eval_dataset_name, eval_dataset in self.eval_dataset.items(
- ):
+ for eval_dataset_name, eval_dataset in self.eval_dataset.items():
dataset_metrics = self.evaluate(
eval_dataset=eval_dataset,
ignore_keys=ignore_keys_for_eval,
- metric_key_prefix=f"eval_{eval_dataset_name}", )
+ metric_key_prefix=f"eval_{eval_dataset_name}",
+ )
metrics.update(dataset_metrics)
else:
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
self._report_to_hp_search(trial, self.state.global_step, metrics)
# Run delayed LR scheduler now that metrics are populated
- if isinstance(self.lr_scheduler,
- torch.optim.lr_scheduler.ReduceLROnPlateau):
+ if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
metric_to_check = self.args.metric_for_best_model
if not metric_to_check.startswith("eval_"):
metric_to_check = f"eval_{metric_to_check}"
@@ -855,17 +813,15 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch,
if self.control.should_save:
self._save_checkpoint(model, trial, metrics=metrics)
- self.control = self.callback_handler.on_save(self.args, self.state,
- self.control)
+ self.control = self.callback_handler.on_save(self.args, self.state, self.control)
def log(self, logs: Dict[str, float], **kwargs) -> None:
if self.state.epoch is not None:
logs["epoch"] = round(self.state.epoch, 2)
- output = { ** logs, ** {"step": self.state.global_step}}
+ output = {**logs, **{"step": self.state.global_step}}
self.state.log_history.append(output)
- self.control = self.callback_handler.on_log(
- self.args, self.state, self.control, logs, **kwargs)
+ self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs, **kwargs)
def _save(self, output_dir=None, state_dict=None):
output_dir = output_dir if output_dir is not None else self.args.output_dir
@@ -873,34 +829,26 @@ def _save(self, output_dir=None, state_dict=None):
if self.args.only_save_updated_model:
unwraped_model = unwrap_model(self.model)
logger.info(f"Saving unet checkpoint to {output_dir}/unet")
- unwraped_model.unet.save_pretrained(
- os.path.join(output_dir, "unet"))
+ unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"))
if unwraped_model.use_ema:
logger.info(f"Saving ema unet checkpoint to {output_dir}/unet")
with unwraped_model.ema_scope():
- unwraped_model.unet.save_pretrained(
- os.path.join(output_dir, "unet"), variant="ema")
+ unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema")
if unwraped_model.train_text_encoder:
- logger.info(
- f"Saving text encoder checkpoint to {output_dir}/text_encoder"
- )
- unwraped_model.text_encoder.save_pretrained(
- os.path.join(output_dir, "text_encoder"))
+ logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder")
+ unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder"))
else:
logger.info(f"Saving model checkpoint to {output_dir}")
if state_dict is None:
state_dict = self.model.state_dict()
- logger.info(
- "Trainer.model is not a `PreTrainedModel`, only saving its state dict."
- )
+ logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
if self.args.save_safetensors:
import safetensors
- safetensors.torch.save_file(
- state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME))
+ safetensors.torch.save_file(state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME))
else:
torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
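One piece of the resume logic in _inner_training_loop above, spelled out: the saved global_step is decomposed into whole epochs plus optimizer steps inside the current epoch, and the latter is converted back into dataloader batches to skip. A small illustrative sketch (the function name is hypothetical):

def resume_position(global_step, num_update_steps_per_epoch, gradient_accumulation_steps):
    # Whole epochs already completed.
    epochs_trained = global_step // num_update_steps_per_epoch
    # Optimizer steps already taken inside the current epoch...
    steps_in_current_epoch = global_step % num_update_steps_per_epoch
    # ...expressed as raw batches, since each optimizer step consumes
    # gradient_accumulation_steps batches.
    batches_to_skip = steps_in_current_epoch * gradient_accumulation_steps
    return epochs_trained, batches_to_skip

# e.g. global_step=2500 with 1000 update steps per epoch and accumulation of 2
# resumes at epoch 2, skipping the first 1000 batches of that epoch.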
diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py
index 23507e6820cf0..6cbf69c57a1d4 100644
--- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py
+++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py
@@ -45,8 +45,7 @@ def parse_src(filename):
elif data_source == "laion_aes":
text_json = json.loads(vec[2])
img_b64 = vec[5]
- caption = text_json.get("caption_en",
- text_json.get("blip_caption_en", ""))
+ caption = text_json.get("caption_en", text_json.get("blip_caption_en", ""))
else:
_, captions, _, _, _, img_b64 = vec[:6]
caption = random.sample(captions.split("|"), 1)[0].replace("\1", "")
@@ -63,24 +62,27 @@ def parse_src(filename):
class TextImagePair(IterableDataset):
def __init__(
- self,
- file_list,
- size,
- num_records,
- image_processing=None,
- buffer_size=1000,
- shuffle_every_n_samples=5,
- interpolation="lanczos",
- tokenizer=None, ):
+ self,
+ file_list,
+ size,
+ num_records,
+ image_processing=None,
+ buffer_size=1000,
+ shuffle_every_n_samples=5,
+ interpolation="lanczos",
+ tokenizer=None,
+ ):
self.size = size
assert interpolation == "lanczos"
if image_processing is None:
- self.image_processing = transforms.Compose([
- transforms.Resize(int(size / 0.9), InterpolationMode.LANCZOS),
- transforms.RandomCrop(size),
- transforms.ToTensor(),
- transforms.Normalize(0.5, 0.5),
- ])
+ self.image_processing = transforms.Compose(
+ [
+ transforms.Resize(int(size / 0.9), InterpolationMode.LANCZOS),
+ transforms.RandomCrop(size),
+ transforms.ToTensor(),
+ transforms.Normalize(0.5, 0.5),
+ ]
+ )
else:
self.image_processing = image_processing
self.text_processing = lambda caption: tokenizer(
@@ -88,7 +90,8 @@ def __init__(
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="pt", ).input_ids[0]
+ return_tensors="pt",
+ ).input_ids[0]
self.file_list = []
file_weights = []
with open(file_list, "r") as f:
@@ -109,19 +112,14 @@ def __init__(
file_weights = file_weights / file_weight_sum
print(f"sample weights of files: {file_weights}")
self.file_weights_cumsum = np.cumsum(file_weights)
- self.file_weights_cumsum = np.concatenate(
- [[0.0], self.file_weights_cumsum])
+ self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum])
else:
print("sample each file list with same probabiliy")
self.file_weights_cumsum = None
self.num_records = num_records
- self.file_ids = [
- np.arange(len(filelist)) for filelist in self.file_list
- ]
- print(
- f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}"
- )
+ self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list]
+ print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}")
self.buffer_size = buffer_size
self.shuffle_every_n_samples = shuffle_every_n_samples
@@ -130,9 +128,7 @@ def sample_loader(self, file_ids, filenames):
random.shuffle(file_ids)
for i in file_ids:
filename = filenames[i].strip("\n")
- with gzip.open(filename,
- "rb") if filename.endswith(".gz") else open(
- filename, "rb") as f:
+ with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f:
# retry = 0
while True:
line = f.readline()
@@ -158,19 +154,14 @@ def sample_loader(self, file_ids, filenames):
if w < self.size or h < self.size:
continue
yield {
- "pixel_values":
- self.image_processing(data["image"]),
- "input_ids":
- self.text_processing(data["caption"]),
+ "pixel_values": self.image_processing(data["image"]),
+ "input_ids": self.text_processing(data["caption"]),
}
def random_load_from_multi_dataset(self):
- print(
- f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}"
- )
+ print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}")
sample_loader_per_dataset = [
- iter(self.sample_loader(self.file_ids[i], self.file_list[i]))
- for i in range(len(self.file_ids))
+ iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids))
]
while True:
@@ -179,8 +170,7 @@ def random_load_from_multi_dataset(self):
else:
rand_num = random.random()
for i in range(len(self.file_list)):
- if (self.file_weights_cumsum[i] <= rand_num <
- self.file_weights_cumsum[i + 1]):
+ if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]:
break
sample_loader = sample_loader_per_dataset[i]
# debug
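The weighted multi-file sampling in random_load_from_multi_dataset above builds cumulative probability bins from the per-file weights and picks whichever bin a uniform random draw lands in. A minimal NumPy sketch of that selection (the helper name is illustrative):

import random

import numpy as np

def pick_file_index(file_weights):
    # Normalize to probabilities and build cumulative bin edges [0, w1, w1+w2, ..., 1].
    weights = np.asarray(file_weights, dtype=float)
    weights = weights / weights.sum()
    cumsum = np.concatenate([[0.0], np.cumsum(weights)])
    rand_num = random.random()
    for i in range(len(weights)):
        if cumsum[i] <= rand_num < cumsum[i + 1]:
            return i
    return len(weights) - 1  # floating-point guard

# e.g. pick_file_index([7, 3]) returns index 0 roughly 70% of the time.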
diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py
index 0f6ad8874e14d..668ad3aae54a9 100644
--- a/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py
+++ b/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py
@@ -13,16 +13,20 @@
# limitations under the License.
import os
-import torch
import transformers
-from sd import (SDDataArguments, SDModelArguments, SDTrainingArguments,
- StableDiffusionModel, StableDiffusionTrainer, TextImagePair)
+from sd import (
+ SDDataArguments,
+ SDModelArguments,
+ SDTrainingArguments,
+ StableDiffusionModel,
+ StableDiffusionTrainer,
+ TextImagePair,
+)
from transformers.trainer import get_last_checkpoint, set_seed
def main():
- parser = transformers.HfArgumentParser(
- (SDModelArguments, SDDataArguments, SDTrainingArguments))
+ parser = transformers.HfArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
log_level = training_args.get_process_log_level()
@@ -37,16 +41,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
print(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -65,13 +67,15 @@ def main():
buffer_size=data_args.buffer_size,
shuffle_every_n_samples=data_args.shuffle_every_n_samples,
interpolation=data_args.interpolation,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
trainer = StableDiffusionTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
checkpoint = None
if training_args.resume_from_checkpoint is not None:
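The checkpoint handling at the top of main() above (and in the Paddle script that follows) has three outcomes: fail fast when the output directory is non-empty but holds no checkpoint, auto-resume when a checkpoint exists and --resume_from_checkpoint was not given, and otherwise start fresh. A condensed sketch of that decision, assuming get_last_checkpoint returns the newest checkpoint-* subdirectory or None:

import os

from transformers.trainer import get_last_checkpoint

def detect_last_checkpoint(output_dir, do_train, overwrite_output_dir, resume_from_checkpoint):
    last_checkpoint = None
    if os.path.isdir(output_dir) and do_train and not overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(output_dir)
        if last_checkpoint is None and len(os.listdir(output_dir)) > 0:
            # Non-empty output dir without a checkpoint: refuse to overwrite it silently.
            raise ValueError(f"Output directory ({output_dir}) already exists and is not empty.")
        if last_checkpoint is not None and resume_from_checkpoint is None:
            print(f"Checkpoint detected, resuming training at {last_checkpoint}.")
    return last_checkpoint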
diff --git a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py
index 7e0b5e6488085..4f4cd63ceb164 100644
--- a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py
+++ b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py
@@ -17,13 +17,18 @@
import paddle
from paddlenlp.trainer import PdArgumentParser, get_last_checkpoint, set_seed
from paddlenlp.utils.log import logger
-from sd import (SDDataArguments, SDModelArguments, SDTrainingArguments,
- StableDiffusionModel, StableDiffusionTrainer, TextImagePair)
+from sd import (
+ SDDataArguments,
+ SDModelArguments,
+ SDTrainingArguments,
+ StableDiffusionModel,
+ StableDiffusionTrainer,
+ TextImagePair,
+)
def main():
- parser = PdArgumentParser(
- (SDModelArguments, SDDataArguments, SDTrainingArguments))
+ parser = PdArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
training_args.print_config(model_args, "Model")
training_args.print_config(data_args, "Data")
@@ -32,16 +37,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -55,19 +58,16 @@ def main():
model.set_ema(training_args.use_ema)
if training_args.to_static:
- input_ids = paddle.static.InputSpec(
- name="input_ids",
- shape=[-1, model_args.model_max_length],
- dtype="int64")
+ input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64")
pixel_values = paddle.static.InputSpec(
name="pixel_values",
shape=[-1, 3, training_args.resolution, training_args.resolution],
- dtype="float32", )
+ dtype="float32",
+ )
specs = [input_ids, pixel_values]
paddle.jit.ignore_module([os])
model = paddle.jit.to_static(model, input_spec=specs)
- logger.info("Successfully to apply @to_static with specs: {}".format(
- specs))
+ logger.info("Successfully to apply @to_static with specs: {}".format(specs))
train_dataset = TextImagePair(
file_list=data_args.file_list,
@@ -76,18 +76,19 @@ def main():
buffer_size=data_args.buffer_size,
shuffle_every_n_samples=data_args.shuffle_every_n_samples,
interpolation=data_args.interpolation,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
trainer = StableDiffusionTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
if model_args.train_text_encoder:
if training_args.text_encoder_learning_rate == training_args.unet_learning_rate:
- params_to_train = itertools.chain(model.text_encoder.parameters(),
- model.unet.parameters())
+ params_to_train = itertools.chain(model.text_encoder.parameters(), model.unet.parameters())
else:
# overwrite default learning rate with 1.0
training_args.learning_rate = 1.0
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py
index aee1fac6ac23b..857c78b0ae1a9 100644
--- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py
+++ b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py
@@ -24,60 +24,46 @@ class ModelArguments:
adapter_config_file: Optional[str] = field(
default="./config/openpose_adapter.json",
- metadata={"help": "adapter_config_file"}, )
- vae_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "pretrained_vae_name_or_path"})
- text_encoder_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "text_encoder_name_or_path"})
- unet_name_or_path: Optional[str] = field(
- default=None, metadata={"help": "unet_encoder_name_or_path"})
+ metadata={"help": "adapter_config_file"},
+ )
+ vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"})
+ text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"})
+ unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"})
tokenizer_name: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained tokenizer name or path if not the same as model_name"
- }, )
- model_max_length: Optional[int] = field(
- default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
- num_inference_steps: Optional[int] = field(
- default=50, metadata={"help": "num_inference_steps"})
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
+ )
+ model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
+ num_inference_steps: Optional[int] = field(default=50, metadata={"help": "num_inference_steps"})
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"})
pretrained_model_name_or_path: str = field(
default="runwayml/stable-diffusion-v1-5",
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
pretrained_adapter_name_or_path: str = field(
default=None,
metadata={
- "help":
- "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training."
- }, )
- image_logging_steps: Optional[int] = field(
- default=1000, metadata={"help": "Log image every X steps."})
- use_paddle_conv_init: bool = field(
- default=False,
- metadata={"help": "Whether or not use paddle conv2d init."})
- is_ldmbert: bool = field(
- default=False, metadata={"help": "Whether to use ldmbert."})
+ "help": "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training."
+ },
+ )
+ image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."})
+ use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."})
+ is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable_xformers_memory_efficient_attention."})
- control_type: Optional[str] = field(
- default="canny", metadata={"help": "The type of control"})
+ default=False, metadata={"help": "enable_xformers_memory_efficient_attention."}
+ )
+ control_type: Optional[str] = field(default="canny", metadata={"help": "The type of control"})
latents_path: str = field(
default=None,
- metadata={"help": "Path to latents, used for alignment."}, )
- random_alignment: bool = field(
- default=False, metadata={"help": "Whether to align random."})
+ metadata={"help": "Path to latents, used for alignment."},
+ )
+ random_alignment: bool = field(default=False, metadata={"help": "Whether to align random."})
timestep_sample_schedule: Optional[str] = field(
default="linear",
metadata={
- "help":
- "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']."
- }, )
+ "help": "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']."
+ },
+ )
@dataclass
@@ -88,26 +74,29 @@ class DataArguments:
file_list: str = field(
default="./data/filelist/train.filelist.list",
- metadata={"help": "The name of the file_list."}, )
+ metadata={"help": "The name of the file_list."},
+ )
resolution: int = field(
default=512,
metadata={
- "help":
- "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
- }, )
+ "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
+ },
+ )
num_records: int = field(default=10000000, metadata={"help": "num_records"})
buffer_size: int = field(
default=100,
- metadata={"help": "Buffer size"}, )
+ metadata={"help": "Buffer size"},
+ )
shuffle_every_n_samples: int = field(
default=5,
- metadata={"help": "shuffle_every_n_samples."}, )
+ metadata={"help": "shuffle_every_n_samples."},
+ )
data_format: str = field(
default="default",
metadata={
- "help":
- "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image."
- }, )
+ "help": "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image."
+ },
+ )
@dataclass
@@ -116,45 +105,28 @@ class GenerateArguments:
Arguments pertaining to specify the model generation settings.
"""
- use_controlnet: bool = field(
- default=False, metadata={"help": "Whether or not use text condition"})
- use_dumpy_dataset: bool = field(
- default=False, metadata={"help": "Whether or not use dummpy dataset"})
- adapter_model_name_or_path: str = field(
- default=None, metadata={"help": "adapter model name or path."})
- sd_model_name_or_path: str = field(
- default=None, metadata={"help": "sd model name or path."})
- file: str = field(
- default="data/test.openpose.filelist", metadata={"help": "eval file."})
+ use_controlnet: bool = field(default=False, metadata={"help": "Whether or not to use ControlNet"})
+ use_dumpy_dataset: bool = field(default=False, metadata={"help": "Whether or not to use the dummy dataset"})
+ adapter_model_name_or_path: str = field(default=None, metadata={"help": "adapter model name or path."})
+ sd_model_name_or_path: str = field(default=None, metadata={"help": "sd model name or path."})
+ file: str = field(default="data/test.openpose.filelist", metadata={"help": "eval file."})
seed: int = field(default=42, metadata={"help": "random seed."})
scheduler_type: str = field(
default="ddim",
- metadata={
- "help":
- "Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']"
- }, )
+ metadata={"help": "Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']"},
+ )
device: str = field(default="gpu", metadata={"help": "device"})
batch_size: int = field(default=16, metadata={"help": "batch_size"})
- num_inference_steps: int = field(
- default=50, metadata={"help": "num_inference_steps"})
- save_path: str = field(
- default="output/adapter/",
- metadata={"help": "Path to the output file."})
- guidance_scales: str = field(
- default_factory=lambda: [5, 7, 9],
- metadata={"help": "guidance_scales list."})
+ num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"})
+ save_path: str = field(default="output/adapter/", metadata={"help": "Path to the output file."})
+ guidance_scales: str = field(default_factory=lambda: [5, 7, 9], metadata={"help": "guidance_scales list."})
height: int = field(default=512, metadata={"help": "height."})
width: int = field(default=512, metadata={"help": "width."})
- max_generation_limits: int = field(
- default=1000, metadata={"help": "max generation limits."})
- use_text_cond: bool = field(
- default=True, metadata={"help": "Whether or not use text condition"})
+ max_generation_limits: int = field(default=1000, metadata={"help": "max generation limits."})
+ use_text_cond: bool = field(default=True, metadata={"help": "Whether or not use text condition"})
use_default_neg_text_cond: bool = field(
default=True,
- metadata={
- "help": "Whether or not use default negative text condition"
- }, )
- generate_data_format: str = field(
- default="img2img", metadata={"help": "Generate data format."})
- generate_control_image_processor_type: str = field(
- default="openpose", metadata={"help": "Generate data format."})
+ metadata={"help": "Whether or not use default negative text condition"},
+ )
+ generate_data_format: str = field(default="img2img", metadata={"help": "Generate data format."})
+ generate_control_image_processor_type: str = field(default="openpose", metadata={"help": "Generate control image processor type."})
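
The dataclass fields above are parsed into typed argument objects by PaddleNLP's PdArgumentParser, the same mechanism the trainer and generation scripts later in this PR rely on. A minimal standalone sketch of that pattern, using an illustrative toy dataclass rather than the real ModelArguments/GenerateArguments:

# Sketch only: illustrative dataclass, not one of the classes defined in this PR.
from dataclasses import dataclass, field
from typing import Optional

from paddlenlp.trainer import PdArgumentParser


@dataclass
class ToyGenerateArguments:
    sd_model_name_or_path: Optional[str] = field(default=None, metadata={"help": "sd model name or path."})
    num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"})


if __name__ == "__main__":
    # e.g. `python toy_parse.py --num_inference_steps 30`
    (generate_args,) = PdArgumentParser(ToyGenerateArguments).parse_args_into_dataclasses()
    print(generate_args.sd_model_name_or_path, generate_args.num_inference_steps)
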
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py
index a6151bf307d1c..b7ff85077b613 100644
--- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py
+++ b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py
@@ -20,8 +20,11 @@
import paddle.amp.auto_cast as autocast
from paddle.io import DataLoader
from paddlenlp.trainer import Trainer
-from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK,
- VisualDLCallback, rewrite_logs)
+from paddlenlp.trainer.integrations import (
+ INTEGRATION_TO_CALLBACK,
+ VisualDLCallback,
+ rewrite_logs,
+)
from paddlenlp.utils.log import logger
from ppdiffusers.training_utils import unwrap_model
@@ -40,19 +43,17 @@ def autocast_smart_context_manager(self, args):
"c_softmax_with_cross_entropy",
],
level=args.fp16_opt_level,
- dtype=amp_dtype, )
+ dtype=amp_dtype,
+ )
else:
- ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
return ctx_manager
def on_step_end(self, args, state, control, model=None, **kwargs):
if hasattr(model, "on_train_batch_end"):
model.on_train_batch_end()
- if (args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0:
control.should_log = True
def on_log(self, args, state, control, logs=None, **kwargs):
@@ -63,20 +64,22 @@ def on_log(self, args, state, control, logs=None, **kwargs):
inputs = kwargs.get("inputs", None)
model = kwargs.get("model", None)
image_logs = {}
- if (inputs is not None and model is not None and
- args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if (
+ inputs is not None
+ and model is not None
+ and args.image_logging_steps > 0
+ and state.global_step % args.image_logging_steps == 0
+ ):
with self.autocast_smart_context_manager(args):
- image_logs["reconstruction"] = model.decode_image(
- pixel_values=inputs["pixel_values"])
- image_logs["control"] = model.decode_control_image(
- adapter_cond=inputs["adapter_cond"])
+ image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"])
+ image_logs["control"] = model.decode_control_image(adapter_cond=inputs["adapter_cond"])
image_logs["ddim-samples-9.0"] = model.log_image(
input_ids=inputs["input_ids"],
adapter_cond=inputs["adapter_cond"],
guidance_scale=9.0,
height=args.resolution,
- width=args.resolution, )
+ width=args.resolution,
+ )
if self.vdl_writer is None:
self._init_summary_writer(args)
@@ -91,11 +94,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of VisualDL's writer.add_scalar() "
- "is incorrect so we dropped this attribute.")
+ "is incorrect so we dropped this attribute."
+ )
# log images
for k, v in image_logs.items():
- self.vdl_writer.add_image(
- k, v, state.global_step, dataformats="NHWC")
+ self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC")
self.vdl_writer.flush()
@@ -104,12 +107,9 @@ def on_log(self, args, state, control, logs=None, **kwargs):
def collate_fn(examples):
- pixel_values = paddle.stack(
- [paddle.to_tensor(example["pixel_values"]) for example in examples])
- input_ids = paddle.stack(
- [paddle.to_tensor(example["input_ids"]) for example in examples])
- adapter_cond = paddle.stack(
- [paddle.to_tensor(example["adapter_cond"]) for example in examples])
+ pixel_values = paddle.stack([paddle.to_tensor(example["pixel_values"]) for example in examples])
+ input_ids = paddle.stack([paddle.to_tensor(example["input_ids"]) for example in examples])
+ adapter_cond = paddle.stack([paddle.to_tensor(example["adapter_cond"]) for example in examples])
batch = {
"input_ids": input_ids,
@@ -133,18 +133,16 @@ def get_train_dataloader(self):
batch_size=self.args.train_batch_size,
num_workers=self.args.dataloader_num_workers,
worker_init_fn=worker_init_fn,
- collate_fn=collate_fn, )
+ collate_fn=collate_fn,
+ )
else:
return super().get_train_dataloader()
- def _save(self,
- output_dir=None,
- state_dict=None,
- merge_tensor_parallel=False):
+ def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False):
super()._save(
output_dir=output_dir,
state_dict=state_dict,
- merge_tensor_parallel=merge_tensor_parallel, )
+ merge_tensor_parallel=merge_tensor_parallel,
+ )
output_dir = output_dir if output_dir is not None else self.args.output_dir
- unwrap_model(self.model).adapter.save_pretrained(
- os.path.join(output_dir, "adapter"))
+ unwrap_model(self.model).adapter.save_pretrained(os.path.join(output_dir, "adapter"))
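
For orientation, the collate_fn above simply stacks the three per-example arrays (input_ids, pixel_values, adapter_cond) into batch tensors for paddle.io.DataLoader. A hedged, self-contained sketch with a toy dataset; shapes and batch size are illustrative, not the training configuration:

# Sketch only: toy dataset standing in for TextImagePair/Fill50kDataset.
import numpy as np
import paddle
from paddle.io import DataLoader, Dataset


class ToyAdapterDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return {
            "pixel_values": np.zeros([3, 64, 64], dtype="float32"),
            "input_ids": np.zeros([77], dtype="int64"),
            "adapter_cond": np.zeros([3, 64, 64], dtype="float32"),
        }


def collate_fn(examples):
    # Stack per-example arrays into [batch, ...] tensors, mirroring adapter_trainer.py.
    pixel_values = paddle.stack([paddle.to_tensor(e["pixel_values"]) for e in examples])
    input_ids = paddle.stack([paddle.to_tensor(e["input_ids"]) for e in examples])
    adapter_cond = paddle.stack([paddle.to_tensor(e["adapter_cond"]) for e in examples])
    return {"input_ids": input_ids, "pixel_values": pixel_values, "adapter_cond": adapter_cond}


loader = DataLoader(ToyAdapterDataset(), batch_size=4, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch["pixel_values"].shape)  # [4, 3, 64, 64]
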
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py
index 5dd1dec076803..e179df14c8f40 100644
--- a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py
+++ b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py
@@ -45,8 +45,7 @@ def process_data(line, filename, data_format):
control_image_b64str = None
caption = ""
- caption += text_json.get("caption_en",
- text_json.get("blip_caption_en", ""))
+ caption += text_json.get("caption_en", text_json.get("blip_caption_en", ""))
if caption != "":
image_base64 = image_b64str
else:
@@ -65,11 +64,9 @@ def parse_line(line, filename, data_format="default"):
res = process_data(line, filename, data_format)
if res is not None:
image_base64, caption, _id, control_image_base64 = res
- image = Image.open(io.BytesIO(base64.b64decode(
- image_base64))).convert("RGB")
+ image = Image.open(io.BytesIO(base64.b64decode(image_base64))).convert("RGB")
if control_image_base64 is not None:
- image_extract = io.BytesIO(
- base64.b64decode(control_image_base64))
+ image_extract = io.BytesIO(base64.b64decode(control_image_base64))
control_image = Image.open(image_extract).convert("RGB")
control_image = control_image.resize(image.size)
@@ -83,7 +80,8 @@ def parse_line(line, filename, data_format="default"):
(image.size[0] - image.size[1]) // 2,
0,
(image.size[0] + image.size[1]) // 2,
- image.size[1], )
+ image.size[1],
+ )
image = image.crop(crop_size)
if control_image is not None:
control_image = control_image.crop(crop_size)
@@ -95,7 +93,8 @@ def parse_line(line, filename, data_format="default"):
image=image,
caption=caption,
_id=_id,
- control_image=control_image, )
+ control_image=control_image,
+ )
else:
return None
except Exception as e:
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py
index 91969cb548b8c..74b3617fb060b 100644
--- a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py
+++ b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py
@@ -23,11 +23,12 @@
class Fill50kDataset(Dataset):
def __init__(
- self,
- tokenizer,
- file_path="./fill50k",
- do_image_processing=True,
- do_text_processing=True, ):
+ self,
+ tokenizer,
+ file_path="./fill50k",
+ do_image_processing=True,
+ do_text_processing=True,
+ ):
self.tokenizer = tokenizer
self.image_list = []
self.label_list = []
@@ -47,7 +48,8 @@ def __init__(
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="np", ).input_ids[0]
+ return_tensors="np",
+ ).input_ids[0]
self.do_image_processing = do_image_processing
self.do_text_processing = do_text_processing
@@ -67,13 +69,11 @@ def __getitem__(self, idx):
if self.do_image_processing:
# Normalize source images to [0, 1].
source = source.astype(np.float32) / 255.0
- source = paddle.to_tensor(
- source.transpose([2, 0, 1]), dtype=paddle.float32)
+ source = paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32)
# Normalize target images to [-1, 1].
target = (target.astype(np.float32) / 127.5) - 1.0
- target = paddle.to_tensor(
- target.transpose([2, 0, 1]), dtype=paddle.float32)
+ target = paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32)
if self.text_processing and self.do_text_processing:
input_ids = self.text_processing(prompt)
@@ -84,4 +84,5 @@ def __getitem__(self, idx):
return dict(
input_ids=input_ids,
pixel_values=target,
- adapter_cond=source, )
+ adapter_cond=source,
+ )
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/model.py b/ppdiffusers/examples/t2i-adapter/adapter/model.py
index 2e31f0262f56b..1c9d6f678955e 100644
--- a/ppdiffusers/examples/t2i-adapter/adapter/model.py
+++ b/ppdiffusers/examples/t2i-adapter/adapter/model.py
@@ -24,9 +24,16 @@
from paddlenlp.transformers import AutoTokenizer, CLIPTextModel
from paddlenlp.utils.log import logger
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- LDMBertModel, T2IAdapter, UNet2DConditionModel,
- is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ LDMBertModel,
+ T2IAdapter,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
+
# from ppdiffusers.initializer import reset_initialized_parameter
from ppdiffusers.models.ema import LitEma
from ppdiffusers.training_utils import freeze_params
@@ -52,18 +59,20 @@ def __init__(self, model_args):
# init tokenizer
tokenizer_name_or_path = (
model_args.tokenizer_name
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "tokenizer"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")
+ )
self.tokenizer = AutoTokenizer.from_pretrained(
- tokenizer_name_or_path,
- model_max_length=model_args.model_max_length)
+ tokenizer_name_or_path, model_max_length=model_args.model_max_length
+ )
vae_name = "vqvae" if model_args.is_ldmbert else "vae"
# init vae
vae_name_or_path = (
model_args.vae_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, vae_name))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, vae_name)
+ )
self.vae = AutoencoderKL.from_pretrained(vae_name_or_path)
freeze_params(self.vae.parameters())
@@ -72,27 +81,27 @@ def __init__(self, model_args):
if model_args.is_ldmbert:
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "bert"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "bert")
+ )
# init text_encoder
- self.text_encoder = LDMBertModel.from_pretrained(
- text_encoder_name_or_path)
+ self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path)
else:
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path,
- "text_encoder"))
- self.text_encoder = CLIPTextModel.from_pretrained(
- text_encoder_name_or_path)
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder")
+ )
+ self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path)
freeze_params(self.text_encoder.parameters())
logger.info("Freeze text_encoder parameters!")
unet_name_or_path = (
model_args.unet_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "unet"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "unet")
+ )
self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path)
@@ -100,44 +109,43 @@ def __init__(self, model_args):
logger.info("Freeze unet parameters!")
if model_args.pretrained_adapter_name_or_path:
- self.adapter = T2IAdapter.from_pretrained(
- model_args.pretrained_adapter_name_or_path)
+ self.adapter = T2IAdapter.from_pretrained(model_args.pretrained_adapter_name_or_path)
else:
- self.adapter = T2IAdapter(
- **read_json(model_args.adapter_config_file))
+ self.adapter = T2IAdapter(**read_json(model_args.adapter_config_file))
self.noise_scheduler = DDPMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+ )
self.eval_scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
self.eval_scheduler.set_timesteps(model_args.num_inference_steps)
self.use_ema = model_args.use_ema
if self.use_ema:
self.model_ema = LitEma(self.adapter)
self.adapter_conditioning_scale = 1.0
- if (model_args.enable_xformers_memory_efficient_attention and
- is_ppxformers_available()):
+ if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
self.unet.enable_xformers_memory_efficient_attention()
self.adapter.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
self.use_preconfig_latents = False
if model_args.latents_path:
self.use_preconfig_latents = True
- self.register_buffer("preconfig_latents",
- paddle.load(model_args.latents_path))
+ self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path))
self.random_alignment = model_args.random_alignment
self.timestep_sample_schedule = model_args.timestep_sample_schedule
@@ -162,36 +170,29 @@ def on_train_batch_end(self):
def get_time_with_schedule(self, timestep_sample_schedule, bs):
if timestep_sample_schedule == "linear":
- t = paddle.randint(
- low=0,
- high=self.noise_scheduler.num_train_timesteps,
- shape=(bs, )).astype(dtype="int64")
+ t = paddle.randint(low=0, high=self.noise_scheduler.num_train_timesteps, shape=(bs,)).astype(dtype="int64")
elif timestep_sample_schedule == "cosine":
- t = paddle.rand(shape=(bs, ))
- t = paddle.cos(x=np.pi / 2.0 *
- t) * self.noise_scheduler.num_train_timesteps
+ t = paddle.rand(shape=(bs,))
+ t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps
t = t.astype(dtype="int64")
elif timestep_sample_schedule == "cubic":
- t = paddle.rand(shape=(bs, ))
+ t = paddle.rand(shape=(bs,))
t = (1 - t**3) * self.noise_scheduler.num_train_timesteps
t = t.astype(dtype="int64")
else:
raise NotImplementedError
- t = paddle.clip(
- x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1)
+ t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1)
return t
- def get_time_with_schedule_and_numpy_generator(
- self, timestep_sample_schedule, bs):
+ def get_time_with_schedule_and_numpy_generator(self, timestep_sample_schedule, bs):
if timestep_sample_schedule == "linear":
t = paddle.to_tensor(
- generator.randint(
- 0, self.noise_scheduler.num_train_timesteps, size=(bs, )),
- dtype="int64", )
+ generator.randint(0, self.noise_scheduler.num_train_timesteps, size=(bs,)),
+ dtype="int64",
+ )
elif timestep_sample_schedule == "cosine":
t = paddle.to_tensor(generator.rand(bs))
- t = paddle.cos(x=np.pi / 2.0 *
- t) * self.noise_scheduler.num_train_timesteps
+ t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps
t = t.astype(dtype="int64")
elif timestep_sample_schedule == "cubic":
t = paddle.to_tensor(generator.rand(bs))
@@ -199,18 +200,12 @@ def get_time_with_schedule_and_numpy_generator(
t = t.astype(dtype="int64")
else:
raise NotImplementedError
- t = paddle.clip(
- x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1)
+ t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1)
return t
- def forward(self,
- input_ids=None,
- pixel_values=None,
- adapter_cond=None,
- **kwargs):
+ def forward(self, input_ids=None, pixel_values=None, adapter_cond=None, **kwargs):
with paddle.no_grad():
- adapter_cond = self.control_image_processor.process_model_forward(
- adapter_cond)
+ adapter_cond = self.control_image_processor.process_model_forward(adapter_cond)
self.train()
with paddle.amp.auto_cast(enable=False):
with paddle.no_grad():
@@ -220,15 +215,13 @@ def forward(self,
latents = latents * 0.18215
if self.random_alignment:
timesteps = self.get_time_with_schedule_and_numpy_generator(
- self.timestep_sample_schedule, latents.shape[0])
- noise = paddle.to_tensor(
- generator.randn(*latents.shape), dtype="float32")
+ self.timestep_sample_schedule, latents.shape[0]
+ )
+ noise = paddle.to_tensor(generator.randn(*latents.shape), dtype="float32")
else:
- timesteps = self.get_time_with_schedule(
- self.timestep_sample_schedule, latents.shape[0])
+ timesteps = self.get_time_with_schedule(self.timestep_sample_schedule, latents.shape[0])
noise = paddle.randn(latents.shape)
- noisy_latents = self.noise_scheduler.add_noise(latents, noise,
- timesteps)
+ noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
encoder_hidden_states = self.text_encoder(input_ids)[0]
adapter_state = self.adapter(adapter_cond)
@@ -240,7 +233,8 @@ def forward(self,
noisy_latents,
timestep=timesteps,
encoder_hidden_states=encoder_hidden_states,
- down_block_additional_residuals=adapter_state, ).sample
+ down_block_additional_residuals=adapter_state,
+ ).sample
loss = F.mse_loss(noise_pred, noise, reduction="mean")
return loss
@@ -257,29 +251,25 @@ def decode_image(self, pixel_values=None, **kwargs):
@paddle.no_grad()
def decode_control_image(self, adapter_cond=None, **kwargs):
- adapter_cond = self.control_image_processor.process_model_forward(
- adapter_cond) # (0, 1)
- return (255 * (adapter_cond.transpose(
- [0, 2, 3, 1])).cast("float32").numpy().round())
+ adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) # (0, 1)
+ return 255 * (adapter_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round()
@paddle.no_grad()
def log_image(
- self,
- input_ids=None,
- adapter_cond=None,
- height=512,
- width=512,
- eta=0.0,
- guidance_scale=9,
- **kwargs, ):
- adapter_cond = self.control_image_processor.process_model_forward(
- adapter_cond)
+ self,
+ input_ids=None,
+ adapter_cond=None,
+ height=512,
+ width=512,
+ eta=0.0,
+ guidance_scale=9,
+ **kwargs,
+ ):
+ adapter_cond = self.control_image_processor.process_model_forward(adapter_cond)
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log 8 image
if input_ids.shape[0] > 4:
input_ids = input_ids[:4]
@@ -293,33 +283,28 @@ def log_image(
padding="max_length",
truncation=True,
max_length=max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings], axis=0)
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0)
if self.use_preconfig_latents:
latents = self.preconfig_latents
else:
- latents = paddle.randn(
- (input_ids.shape[0], self.unet.in_channels, height // 8,
- width // 8))
+ latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8))
# ddim donot use this
latents = latents * self.eval_scheduler.init_noise_sigma
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for t in self.eval_scheduler.timesteps:
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
# ddim donot use this
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
# Adapter predict the noise residual
adapter_state = self.adapter(adapter_cond)
@@ -334,19 +319,16 @@ def log_image(
latent_model_input,
t,
encoder_hidden_states=text_embeddings,
- down_block_additional_residuals=[
- state.clone() for state in adapter_state
- ], ).sample
+ down_block_additional_residuals=[state.clone() for state in adapter_state],
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.eval_scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
latents = 1 / 0.18215 * latents
image = self.vae.decode(latents).sample
@@ -358,7 +340,6 @@ def set_recompute(self, value=False):
def fn(layer):
if hasattr(layer, "gradient_checkpointing"):
layer.gradient_checkpointing = value
- print("Set", layer.__class__, "recompute",
- layer.gradient_checkpointing)
+ print("Set", layer.__class__, "recompute", layer.gradient_checkpointing)
self.adapter.apply(fn)
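
The get_time_with_schedule logic reformatted above draws training timesteps with one of three schedules (linear, cosine, cubic). A small standalone sketch of those branches; num_train_timesteps and the batch size are illustrative stand-ins for the model's DDPMScheduler configuration:

# Sketch only: reproduces the three timestep-sampling branches outside the model class.
import numpy as np
import paddle

num_train_timesteps = 1000  # illustrative; the model reads this from its noise scheduler


def sample_timesteps(schedule: str, bs: int) -> paddle.Tensor:
    if schedule == "linear":
        t = paddle.randint(low=0, high=num_train_timesteps, shape=(bs,)).astype("int64")
    elif schedule == "cosine":
        t = paddle.rand(shape=(bs,))
        t = (paddle.cos(np.pi / 2.0 * t) * num_train_timesteps).astype("int64")
    elif schedule == "cubic":
        t = paddle.rand(shape=(bs,))
        t = ((1 - t**3) * num_train_timesteps).astype("int64")
    else:
        raise NotImplementedError(schedule)
    return paddle.clip(t, min=0, max=num_train_timesteps - 1)


for name in ("linear", "cosine", "cubic"):
    print(name, sample_timesteps(name, bs=4).numpy())
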
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py
index a523be48b4663..a3d1481c39807 100644
--- a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py
+++ b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py
@@ -43,25 +43,28 @@ def _get_param(self, img, output_size):
class TextImagePair(IterableDataset):
def __init__(
- self,
- file_list,
- size,
- num_records,
- image_processing=None,
- buffer_size=1000,
- shuffle_every_n_samples=5,
- interpolation="lanczos",
- tokenizer=None,
- control_image_processor=None,
- data_format="default",
- do_image_processing=True, ):
+ self,
+ file_list,
+ size,
+ num_records,
+ image_processing=None,
+ buffer_size=1000,
+ shuffle_every_n_samples=5,
+ interpolation="lanczos",
+ tokenizer=None,
+ control_image_processor=None,
+ data_format="default",
+ do_image_processing=True,
+ ):
self.size = size
self.resize_transform = transforms.Resize(int(size), interpolation)
if image_processing is None:
- self.image_processing = transforms.Compose([
- transforms.ToTensor(), # (0 ~ 1)
- transforms.Normalize(0.5, 0.5), # (-1 ~ 1)
- ])
+ self.image_processing = transforms.Compose(
+ [
+ transforms.ToTensor(), # (0 ~ 1)
+ transforms.Normalize(0.5, 0.5), # (-1 ~ 1)
+ ]
+ )
else:
self.image_processing = image_processing
if tokenizer is not None:
@@ -70,7 +73,8 @@ def __init__(
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="np", ).input_ids[0]
+ return_tensors="np",
+ ).input_ids[0]
else:
self.text_processing = None
@@ -99,19 +103,14 @@ def __init__(
file_weights = file_weights / file_weight_sum
print(f"sample weights of files: {file_weights}")
self.file_weights_cumsum = np.cumsum(file_weights)
- self.file_weights_cumsum = np.concatenate(
- [[0.0], self.file_weights_cumsum])
+ self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum])
else:
print("sample each file list with same probabiliy")
self.file_weights_cumsum = None
self.num_records = num_records
- self.file_ids = [
- np.arange(len(filelist)) for filelist in self.file_list
- ]
- print(
- f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}"
- )
+ self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list]
+ print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}")
self.buffer_size = buffer_size
self.shuffle_every_n_samples = shuffle_every_n_samples
self.data_format = data_format
@@ -122,9 +121,7 @@ def sample_loader(self, file_ids, filenames):
random.shuffle(file_ids)
for i in file_ids:
filename = filenames[i].strip("\n")
- with gzip.open(filename,
- "rb") if filename.endswith(".gz") else open(
- filename, "rb") as f:
+ with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f:
# retry = 0
while True:
line = f.readline()
@@ -150,31 +147,26 @@ def sample_loader(self, file_ids, filenames):
control_image = data["control_image"]
if control_image is not None:
- control_image = self.resize_transform(
- control_image)
+ control_image = self.resize_transform(control_image)
else:
control_image = image
out = {
- "pixel_values":
- self.image_processing(image).numpy()
- if self.do_image_processing else image,
- "input_ids":
- self.text_processing(data["caption"])
- if self.text_processing else data["caption"],
- "adapter_cond":
- self.control_image_processor.process_data_load(
- control_image).numpy() if
- self.control_image_processor else control_image,
+ "pixel_values": self.image_processing(image).numpy()
+ if self.do_image_processing
+ else image,
+ "input_ids": self.text_processing(data["caption"])
+ if self.text_processing
+ else data["caption"],
+ "adapter_cond": self.control_image_processor.process_data_load(control_image).numpy()
+ if self.control_image_processor
+ else control_image,
}
yield out
def random_load_from_multi_dataset(self):
- print(
- f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}"
- )
+ print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}")
sample_loader_per_dataset = [
- iter(self.sample_loader(self.file_ids[i], self.file_list[i]))
- for i in range(len(self.file_ids))
+ iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids))
]
while True:
@@ -183,8 +175,7 @@ def random_load_from_multi_dataset(self):
else:
rand_num = random.random()
for i in range(len(self.file_list)):
- if (self.file_weights_cumsum[i] <= rand_num <
- self.file_weights_cumsum[i + 1]):
+ if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]:
break
sample_loader = sample_loader_per_dataset[i]
yield next(sample_loader)
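
The random_load_from_multi_dataset block above keeps TextImagePair's cumulative-weight lookup that decides which file list the next sample is drawn from. A short sketch of just that selection step; the weights are made up here, while the real, normalized weights come from the file-list setup shown earlier in this diff:

# Sketch only: illustrative weights for the cumulative-weight file-list selection.
import random

import numpy as np

file_weights = np.array([0.7, 0.2, 0.1])
file_weights_cumsum = np.concatenate([[0.0], np.cumsum(file_weights)])

counts = [0, 0, 0]
for _ in range(10_000):
    rand_num = random.random()
    for i in range(len(file_weights)):
        if file_weights_cumsum[i] <= rand_num < file_weights_cumsum[i + 1]:
            break
    counts[i] += 1
print(counts)  # roughly proportional to the weights, e.g. ~[7000, 2000, 1000]
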
diff --git a/ppdiffusers/examples/t2i-adapter/generate.py b/ppdiffusers/examples/t2i-adapter/generate.py
index b4afa6609c6eb..1197dc715e704 100644
--- a/ppdiffusers/examples/t2i-adapter/generate.py
+++ b/ppdiffusers/examples/t2i-adapter/generate.py
@@ -17,22 +17,28 @@
import numpy as np
import paddle
-from adapter import (DataArguments, Fill50kDataset, GenerateArguments,
- TextImagePair)
+from adapter import DataArguments, Fill50kDataset, GenerateArguments, TextImagePair
from annotator.canny import CannyDetector
from annotator.util import HWC3
from paddlenlp.trainer import PdArgumentParser
from PIL import Image
from tqdm import tqdm
-from ppdiffusers import (ControlNetModel, DDIMScheduler,
- EulerAncestralDiscreteScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionAdapterPipeline,
- StableDiffusionControlNetPipeline, T2IAdapter)
+from ppdiffusers import (
+ ControlNetModel,
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionAdapterPipeline,
+ StableDiffusionControlNetPipeline,
+ T2IAdapter,
+)
DEFAULT_NEGATIVE_PROMPT = (
"longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, "
- "fewer digits, cropped, worst quality, low quality")
+ "fewer digits, cropped, worst quality, low quality"
+)
class CannyProcessor:
@@ -79,31 +85,34 @@ def set_seed(seed: int):
def generate_images(
- use_controlnet=False,
- adapter_model_name_or_path=None,
- sd_model_name_or_path=None,
- batch_size=16,
- test_dataset=None,
- save_path="output",
- guidance_scales=[3, 4, 5, 6, 7, 8],
- num_inference_steps=50,
- scheduler_type="ddim",
- device="gpu",
- max_generation_limits=1000,
- use_text_cond=True,
- use_default_neg_text_cond=True,
- generate_control_image_processor_type=None,
- eta=0.0, ):
+ use_controlnet=False,
+ adapter_model_name_or_path=None,
+ sd_model_name_or_path=None,
+ batch_size=16,
+ test_dataset=None,
+ save_path="output",
+ guidance_scales=[3, 4, 5, 6, 7, 8],
+ num_inference_steps=50,
+ scheduler_type="ddim",
+ device="gpu",
+ max_generation_limits=1000,
+ use_text_cond=True,
+ use_default_neg_text_cond=True,
+ generate_control_image_processor_type=None,
+ eta=0.0,
+):
# set pipe
paddle.set_device(device)
if use_controlnet:
controlnet = ControlNetModel.from_pretrained(adapter_model_name_or_path)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- sd_model_name_or_path, controlnet=controlnet, safety_checker=None)
+ sd_model_name_or_path, controlnet=controlnet, safety_checker=None
+ )
else:
adapter = T2IAdapter.from_pretrained(adapter_model_name_or_path)
pipe = StableDiffusionAdapterPipeline.from_pretrained(
- sd_model_name_or_path, adapter=adapter, safety_checker=None)
+ sd_model_name_or_path, adapter=adapter, safety_checker=None
+ )
pipe.set_progress_bar_config(disable=True)
# set scheduler
@@ -117,17 +126,14 @@ def generate_images(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif scheduler_type == "euler-ancestral":
scheduler = EulerAncestralDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+ )
elif scheduler_type == "ddim":
scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -136,7 +142,8 @@ def generate_images(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
pipe.scheduler = scheduler
@@ -158,24 +165,21 @@ def generate_images(
write_file = open(os.path.join(save_path, "caption.txt"), "w")
i = 0
for data in tqdm(test_dataset):
- if (generate_control_image_processor_type ==
- "canny"): # Canny mode needs to manually process the control image
- data["adapter_cond"] = canny_processor.process_data_load(data[
- "pixel_values"])
+ if (
+ generate_control_image_processor_type == "canny"
+ ): # Canny mode needs to manually process the control image
+ data["adapter_cond"] = canny_processor.process_data_load(data["pixel_values"])
images = pipe(
data["input_ids"] if use_text_cond else "",
- negative_prompt=DEFAULT_NEGATIVE_PROMPT
- if use_default_neg_text_cond else "",
+ negative_prompt=DEFAULT_NEGATIVE_PROMPT if use_default_neg_text_cond else "",
image=data["adapter_cond"],
guidance_scale=float(cfg),
eta=eta,
- num_inference_steps=num_inference_steps, )[0]
- data["adapter_cond"].save(
- os.path.join(cond_save_path, "{:05d}_000.png".format(i)))
- data["pixel_values"].save(
- os.path.join(origin_save_path, "{:05d}_000.png".format(i)))
- write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"]
- .strip() + "\n")
+ num_inference_steps=num_inference_steps,
+ )[0]
+ data["adapter_cond"].save(os.path.join(cond_save_path, "{:05d}_000.png".format(i)))
+ data["pixel_values"].save(os.path.join(origin_save_path, "{:05d}_000.png".format(i)))
+ write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"].strip() + "\n")
for image in images:
path = os.path.join(new_save_path, "{:05d}_000.png".format(i))
image.save(path)
@@ -198,7 +202,8 @@ def generate_images(
tokenizer=None,
file_path=generate_args.file,
do_image_processing=False,
- do_text_processing=False, )
+ do_text_processing=False,
+ )
else:
test_dataset = TextImagePair(
@@ -210,7 +215,8 @@ def generate_images(
interpolation="lanczos",
data_format=generate_args.generate_data_format,
control_image_processor=None,
- do_image_processing=False, )
+ do_image_processing=False,
+ )
generate_images(
use_controlnet=generate_args.use_controlnet,
@@ -226,5 +232,5 @@ def generate_images(
max_generation_limits=generate_args.max_generation_limits,
use_text_cond=generate_args.use_text_cond,
use_default_neg_text_cond=generate_args.use_default_neg_text_cond,
- generate_control_image_processor_type=generate_args.
- generate_control_image_processor_type, )
+ generate_control_image_processor_type=generate_args.generate_control_image_processor_type,
+ )
diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py
index 758e595e0ae59..01f4839ec21ff 100644
--- a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py
+++ b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py
@@ -39,8 +39,7 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"):
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -51,11 +50,11 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"):
"--output_path",
type=str,
default="paddle_models/sd-v1-4-adapter-color",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
- th_controlnet = DiffusersAdapterNetModel.from_pretrained(
- args.pretrained_model_name_or_path)
+ th_controlnet = DiffusersAdapterNetModel.from_pretrained(args.pretrained_model_name_or_path)
controlnet_state_dict = convert_to_ppdiffusers(th_controlnet)
pp_controlnet = PPDiffusersAdapterNetModel.from_config(th_controlnet.config)
pp_controlnet.set_dict(controlnet_state_dict)
diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py
index 824cb9d41f945..165fb8d562914 100644
--- a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py
+++ b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py
@@ -42,10 +42,7 @@ def convert_to_paddle(vae_or_unet, dtype="float32"):
@patch_to(paddle.nn.Layer)
-def load_state_dict(self: paddle.nn.Layer,
- state_dict: dict,
- use_structured_name=True,
- strict=True):
+def load_state_dict(self: paddle.nn.Layer, state_dict: dict, use_structured_name=True, strict=True):
orig = self.state_dict()
orig_keys = set([k for k in orig.keys()])
loaded_keys = set([k for k in state_dict.keys()])
@@ -76,29 +73,32 @@ def apply(name):
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--orig_t2i_adapter_project_path",
type=str,
default="pytorch/T2I-Adapter",
- help="Path to a torch model parameters file", )
+ help="Path to a torch model parameters file",
+ )
parser.add_argument(
"--orig_t2i_adapter_pretrained_ckpt_path",
type=str,
default="ckpt/t2iadapter_openpose_sd14v1.pth",
- help="Path to a torch model parameters file", )
+ help="Path to a torch model parameters file",
+ )
parser.add_argument(
"--ppdiffusers_t2i_adapter_model_config_path",
type=str,
default="ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json",
- help="Path to a torch model parameters file", )
+ help="Path to a torch model parameters file",
+ )
parser.add_argument(
"--ppdiffusers_t2i_adapter_model_output_path",
type=str,
default="paddle_models/sd-v1-4-adapter-openpose_initialized",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
import os
@@ -113,19 +113,21 @@ def apply(name):
nums_rb=2,
ksize=1,
sk=True,
- use_conv=False, )
+ use_conv=False,
+ )
from ppdiffusers import T2IAdapter as paddle_network
- Paddle_Model = paddle_network(
- **read_json(args.ppdiffusers_t2i_adapter_model_config_path))
+ Paddle_Model = paddle_network(**read_json(args.ppdiffusers_t2i_adapter_model_config_path))
torch_model = Torch_Model
if args.orig_t2i_adapter_pretrained_ckpt_path:
torch_model.load_state_dict(
torch.load(
args.orig_t2i_adapter_pretrained_ckpt_path,
- map_location=torch.device("cpu"), ),
- strict=True, )
+ map_location=torch.device("cpu"),
+ ),
+ strict=True,
+ )
# When orig_t2i_adapter_pretrained_ckpt_path is not specified, the randomly initialized torch weights are stored in orig_t2i_adapter_pretrained_ckpt_path
else:
torch.save(
@@ -133,7 +135,9 @@ def apply(name):
os.path.join(
args.orig_t2i_adapter_project_path,
"ckpt",
- "torch_t2i_model_initialized.pth", ), )
+ "torch_t2i_model_initialized.pth",
+ ),
+ )
torch_model_dict = convert_adapter(torch_model.state_dict())
numpy_state_dict = convert_to_paddle(torch_model_dict)
paddle_model = Paddle_Model
diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py
index dd6cc4ced4689..45f7f2262e5fd 100644
--- a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py
+++ b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py
@@ -76,13 +76,15 @@ def convert_adapter_light(old_state_dict):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--output_path",
default=None,
type=str,
required=True,
- help="Path to the store the result checkpoint.", )
+ help="Path to the store the result checkpoint.",
+ )
parser.add_argument(
"--is_adapter_light",
default=False,
diff --git a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py
index 432265f92f6db..172b6727c299f 100644
--- a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py
+++ b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py
@@ -30,60 +30,59 @@
"--dataset_base_name",
type=str,
default="artv4_openpose_test13",
- help="The dataset basename.", )
+ help="The dataset basename.",
+)
parser.add_argument(
"--ids_list_path",
type=str,
default="artv4_openpose_test13_ids.txt",
- help="The ids list path.", )
+ help="The ids list path.",
+)
parser.add_argument(
"--ids_list_path",
type=str,
default="artv4_openpose_test13_ids.txt",
- help="The ids list path.", )
+ help="The ids list path.",
+)
parser.add_argument(
"--source_prompt_list_one_path",
type=str,
default="prompts_artv4_openpose_test1_en_prompts.txt",
- help="The first source prompt list path.", )
+ help="The first source prompt list path.",
+)
parser.add_argument(
"--source_prompt_list_two_path",
type=str,
default="prompts_artv4_openpose_test2_en_prompts.txt",
- help="The second source prompt list path.", )
+ help="The second source prompt list path.",
+)
parser.add_argument(
"--source_prompt_list_three_path",
type=str,
default="prompts_artv4_openpose_test3_en_prompts.txt",
- help="The third source prompt list path.", )
+ help="The third source prompt list path.",
+)
parser.add_argument(
"--dataset_prompt_json_name",
type=str,
default="prompt.json",
- help="The dataset prompt json name.", )
+ help="The dataset prompt json name.",
+)
args = parser.parse_args()
-def get_images_form_urls(ids_list,
- dir_path,
- dataset_base_name,
- type=None,
- is_resize=False):
+def get_images_form_urls(ids_list, dir_path, dataset_base_name, type=None, is_resize=False):
for i, id in enumerate(tqdm(ids_list)):
if dataset_base_name == "artv4_openpose_test13":
if type == "原图":
- img_url = (dataset_base_name_one_type_one_url_base +
- f"{id}/{id}_final00_control.png")
+ img_url = dataset_base_name_one_type_one_url_base + f"{id}/{id}_final00_control.png"
elif type == "Openpose控制图":
- img_url = (dataset_base_name_one_type_two_url_base +
- f"{id}/{id}_final00_control_openpose.png")
+ img_url = dataset_base_name_one_type_two_url_base + f"{id}/{id}_final00_control_openpose.png"
if dataset_base_name == "artv4_openpose_test2":
if type == "原图":
- img_url = (dataset_base_name_two_type_one_url_base +
- f"{id}/{id}_final00_control.png")
+ img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control.png"
elif type == "Openpose控制图":
- img_url = (dataset_base_name_two_type_one_url_base +
- f"{id}/{id}_final00_control_openpose.png")
+ img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control_openpose.png"
in_image = load_image(img_url)
if is_resize:
in_image = in_image.resize((512, 512))
@@ -93,9 +92,7 @@ def get_images_form_urls(ids_list,
def get_prompt_json_file(ids_list, prompt_lists, dataset_base_name):
- with open(
- os.path.join(dataset_base_name, args.dataset_prompt_json_name),
- "w") as wf:
+ with open(os.path.join(dataset_base_name, args.dataset_prompt_json_name), "w") as wf:
for i, id in enumerate(ids_list):
which_prompt_list = int(id.split("_")[1][-1]) - 1
which_prompt = int(id.split("_")[-1])
@@ -112,41 +109,16 @@ def get_prompt_json_file(ids_list, prompt_lists, dataset_base_name):
if __name__ == "__main__":
dataset_base_name = args.dataset_base_name
- ids_list = [
- line.strip()
- for line in open(
- args.ids_list_path, "r", encoding="utf8").readlines()
- ]
+ ids_list = [line.strip() for line in open(args.ids_list_path, "r", encoding="utf8").readlines()]
source_prompt_lists = [
- [
- line.strip()
- for line in open(
- args.source_prompt_list_one_path, "r", encoding="utf8")
- .readlines()
- ],
- [
- line.strip()
- for line in open(
- args.source_prompt_list_two_path, "r", encoding="utf8")
- .readlines()
- ],
- [
- line.strip()
- for line in open(
- args.source_prompt_list_three_path, "r", encoding="utf8")
- .readlines()
- ],
+ [line.strip() for line in open(args.source_prompt_list_one_path, "r", encoding="utf8").readlines()],
+ [line.strip() for line in open(args.source_prompt_list_two_path, "r", encoding="utf8").readlines()],
+ [line.strip() for line in open(args.source_prompt_list_three_path, "r", encoding="utf8").readlines()],
]
source_dir = os.path.join(dataset_base_name, "source")
target_dir = os.path.join(dataset_base_name, "target")
- get_images_form_urls(
- ids_list,
- source_dir,
- dataset_base_name,
- type="Openpose控制图",
- is_resize=False)
- get_images_form_urls(
- ids_list, target_dir, dataset_base_name, type="原图", is_resize=False)
+ get_images_form_urls(ids_list, source_dir, dataset_base_name, type="Openpose控制图", is_resize=False)
+ get_images_form_urls(ids_list, target_dir, dataset_base_name, type="原图", is_resize=False)
get_prompt_json_file(ids_list, source_prompt_lists, dataset_base_name)
diff --git a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py
index 79180b0f624fe..7f5bb1a23ecb4 100644
--- a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py
+++ b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py
@@ -15,10 +15,14 @@
import os
import paddle
-from adapter import (AdapterLDM, AdapterLDMTrainer, DataArguments,
- ModelArguments, TextImagePair)
-from paddlenlp.trainer import (PdArgumentParser, TrainingArguments,
- get_last_checkpoint)
+from adapter import (
+ AdapterLDM,
+ AdapterLDMTrainer,
+ DataArguments,
+ ModelArguments,
+ TextImagePair,
+)
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint
from paddlenlp.utils.log import logger
@@ -28,15 +32,14 @@ def unfreeze_params(params):
def main():
- parser = PdArgumentParser(
- (ModelArguments, DataArguments, TrainingArguments))
+ parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# report to custom_visualdl
training_args.report_to = ["custom_visualdl"]
training_args.resolution = data_args.resolution
training_args.image_logging_steps = model_args.image_logging_steps = (
- math.ceil(model_args.image_logging_steps / training_args.logging_steps)
- * training_args.logging_steps)
+ math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps
+ )
training_args.print_config(model_args, "Model")
training_args.print_config(data_args, "Data")
@@ -44,16 +47,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -69,12 +70,14 @@ def main():
interpolation="lanczos",
tokenizer=model.tokenizer,
control_image_processor=model.control_image_processor,
- data_format=data_args.data_format, )
+ data_format=data_args.data_format,
+ )
trainer = AdapterLDMTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
# must set recompute after trainer init
trainer.model.set_recompute(training_args.recompute)
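
One behavioral detail worth noting in the reformatted trainer script: image_logging_steps is rounded up to a multiple of logging_steps before training starts. A quick worked example with illustrative values:

# Worked example of the rounding in train_t2i_adapter_trainer.py (values are illustrative).
import math

logging_steps = 50
for image_logging_steps in (1000, 120):
    rounded = math.ceil(image_logging_steps / logging_steps) * logging_steps
    print(image_logging_steps, "->", rounded)  # 1000 -> 1000, 120 -> 150
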
diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image.py b/ppdiffusers/examples/text_to_image/train_text_to_image.py
index d9e9e7295e0d5..95328abbff75f 100644
--- a/ppdiffusers/examples/text_to_image/train_text_to_image.py
+++ b/ppdiffusers/examples/text_to_image/train_text_to_image.py
@@ -27,8 +27,9 @@
import paddle.nn.functional as F
from datasets import DatasetDict, load_dataset
from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from paddle.distributed.fleet.utils.hybrid_parallel_util import \
- fused_allreduce_gradients
+from paddle.distributed.fleet.utils.hybrid_parallel_util import (
+ fused_allreduce_gradients,
+)
from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler
from paddle.optimizer import AdamW
from paddle.vision import BaseTransform, transforms
@@ -38,19 +39,27 @@
from paddlenlp.utils.log import logger
from tqdm.auto import tqdm
-from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline,
- UNet2DConditionModel, is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDPMScheduler,
+ DiffusionPipeline,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.optimization import get_scheduler
-from ppdiffusers.training_utils import (EMAModel, freeze_params,
- main_process_first, unwrap_model)
+from ppdiffusers.training_utils import (
+ EMAModel,
+ freeze_params,
+ main_process_first,
+ unwrap_model,
+)
from ppdiffusers.utils import PPDIFFUSERS_CACHE, check_min_version
check_min_version("0.16.1")
def url_or_path_join(*path_list):
- return (os.path.join(*path_list)
- if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list))
+ return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)
class Lambda(BaseTransform):
@@ -62,11 +71,11 @@ def _apply_image(self, img):
return self.fn(img)
-def import_model_class_from_model_name_or_path(
- pretrained_model_name_or_path: str):
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
try:
text_encoder_config = PretrainedConfig.from_pretrained(
- url_or_path_join(pretrained_model_name_or_path, "text_encoder"))
+ url_or_path_join(pretrained_model_name_or_path, "text_encoder")
+ )
model_class = text_encoder_config.architectures[0]
except Exception:
model_class = "LDMBertModel"
@@ -75,8 +84,9 @@ def import_model_class_from_model_name_or_path(
return CLIPTextModel
elif model_class == "RobertaSeriesModelWithTransformation":
- from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \
- RobertaSeriesModelWithTransformation
+ from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+ RobertaSeriesModelWithTransformation,
+ )
return RobertaSeriesModelWithTransformation
elif model_class == "BertModel":
@@ -84,8 +94,9 @@ def import_model_class_from_model_name_or_path(
return BertModel
elif model_class == "LDMBertModel":
- from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
+ from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import (
+ LDMBertModel,
+ )
return LDMBertModel
else:
@@ -101,8 +112,7 @@ def fn(layer):
# unet
if hasattr(layer, "gradient_checkpointing"):
layer.gradient_checkpointing = value
- print("Set", layer.__class__, "recompute",
- layer.gradient_checkpointing)
+ print("Set", layer.__class__, "recompute", layer.gradient_checkpointing)
model.apply(fn)
@@ -122,8 +132,7 @@ def get_report_to(args):
def parse_args(input_args=None):
- parser = argparse.ArgumentParser(
- description="Simple example of a training a text to image model script.")
+ parser = argparse.ArgumentParser(description="Simple example of a training a text to image model script.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -140,7 +149,8 @@ def parse_args(input_args=None):
parser.add_argument(
"--train_text_encoder",
action="store_true",
- help="Whether to train the text encoder.", )
+ help="Whether to train the text encoder.",
+ )
parser.add_argument(
"--dataset_name",
type=str,
@@ -148,7 +158,8 @@ def parse_args(input_args=None):
help=(
"The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
" dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
- " or to a folder containing files that 🤗 Datasets can understand."),
+ " or to a folder containing files that 🤗 Datasets can understand."
+ ),
)
parser.add_argument(
"--dataset_config_name",
@@ -164,12 +175,14 @@ def parse_args(input_args=None):
"A folder containing the training data. Folder contents must follow the structure described in"
" https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
" must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
- ), )
+ ),
+ )
parser.add_argument(
"--image_column",
type=str,
default="image",
- help="The column of the dataset containing an image.", )
+ help="The column of the dataset containing an image.",
+ )
parser.add_argument(
"--caption_column",
type=str,
@@ -182,7 +195,9 @@ def parse_args(input_args=None):
default=None,
help=(
"For debugging purposes or quicker training, truncate the number of training examples to this "
- "value if set."), )
+ "value if set."
+ ),
+ )
parser.add_argument(
"--output_dir",
type=str,
@@ -195,32 +210,34 @@ def parse_args(input_args=None):
default=None,
help="The directory where the downloaded models and datasets will be stored.",
)
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training.")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--height",
type=int,
default=None,
help=(
"The height for input images, all the images in the train/validation dataset will be resized to this"
- " height"), )
+ " height"
+ ),
+ )
parser.add_argument(
"--width",
type=int,
default=None,
help=(
"The width for input images, all the images in the train/validation dataset will be resized to this"
- " width"), )
+ " width"
+ ),
+ )
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--center_crop",
default=False,
@@ -228,16 +245,19 @@ def parse_args(input_args=None):
help=(
"Whether to center crop the input images to the resolution. If not set, the images will be randomly"
" cropped. The images will be resized to the resolution first before cropping."
- ), )
+ ),
+ )
parser.add_argument(
"--random_flip",
action="store_true",
- help="whether to randomly flip images horizontally", )
+ help="whether to randomly flip images horizontally",
+ )
parser.add_argument(
"--train_batch_size",
type=int,
default=16,
- help="Batch size (per device) for the training dataloader.", )
+ help="Batch size (per device) for the training dataloader.",
+ )
parser.add_argument("--num_train_epochs", type=int, default=100)
parser.add_argument(
"--max_train_steps",
@@ -274,18 +294,22 @@ def parse_args(input_args=None):
default="constant",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'), )
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
parser.add_argument(
"--lr_warmup_steps",
type=int,
default=500,
- help="Number of steps for the warmup in the lr scheduler.", )
+ help="Number of steps for the warmup in the lr scheduler.",
+ )
parser.add_argument(
"--snr_gamma",
type=float,
default=None,
help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
- "More details here: https://arxiv.org/abs/2303.09556.", )
+ "More details here: https://arxiv.org/abs/2303.09556.",
+ )
parser.add_argument(
"--lr_num_cycles",
type=int,
@@ -296,51 +320,49 @@ def parse_args(input_args=None):
"--lr_power",
type=float,
default=1.0,
- help="Power factor of the polynomial scheduler.", )
- parser.add_argument(
- "--use_ema", action="store_true", help="Whether to use EMA model.")
- parser.add_argument(
- "--debug",
- action="store_true",
- help="Whether to debug this training script.")
+ help="Power factor of the polynomial scheduler.",
+ )
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+ parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.")
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help=(
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ), )
+ ),
+ )
parser.add_argument(
"--adam_beta1",
type=float,
default=0.9,
- help="The beta1 parameter for the Adam optimizer.", )
+ help="The beta1 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_beta2",
type=float,
default=0.999,
- help="The beta2 parameter for the Adam optimizer.", )
- parser.add_argument(
- "--adam_weight_decay",
- type=float,
- default=1e-2,
- help="Weight decay to use.")
+ help="The beta2 parameter for the Adam optimizer.",
+ )
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
parser.add_argument(
"--adam_epsilon",
type=float,
default=1e-08,
- help="Epsilon value for the Adam optimizer", )
- parser.add_argument(
- "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ help="Epsilon value for the Adam optimizer",
+ )
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--push_to_hub",
action="store_true",
- help="Whether or not to push the model to the Hub.", )
+ help="Whether or not to push the model to the Hub.",
+ )
parser.add_argument(
"--hub_token",
type=str,
default=None,
- help="The token to use to push to the Model Hub.", )
+ help="The token to use to push to the Model Hub.",
+ )
parser.add_argument(
"--hub_model_id",
type=str,
@@ -353,27 +375,28 @@ def parse_args(input_args=None):
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to"
- "*output_dir/logs"), )
+ "*output_dir/logs"
+ ),
+ )
parser.add_argument(
"--report_to",
type=str,
default="visualdl",
choices=["tensorboard", "visualdl"],
- help="Log writer type.", )
+ help="Log writer type.",
+ )
parser.add_argument(
"--checkpointing_steps",
type=int,
default=500,
- help=("Save a checkpoint of the training state every X updates."), )
+ help=("Save a checkpoint of the training state every X updates."),
+ )
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether or not to use xformers.", )
- parser.add_argument(
- "--noise_offset",
- type=float,
- default=0,
- help="The scale of noise offset.")
+ help="Whether or not to use xformers.",
+ )
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
if input_args is not None:
args = parser.parse_args(input_args)
else:
@@ -389,9 +412,7 @@ def parse_args(input_args=None):
return args
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -401,7 +422,9 @@ def get_full_repo_name(model_id: str,
return f"{organization}/{model_id}"
-DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"), }
+DATASET_NAME_MAPPING = {
+ "lambdalabs/pokemon-blip-captions": ("image", "text"),
+}
def main():
@@ -422,16 +445,13 @@ def main():
os.makedirs(args.output_dir, exist_ok=True)
if args.push_to_hub:
if args.hub_model_id is None:
- repo_name = get_full_repo_name(
- Path(args.output_dir).name, token=args.hub_token)
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
create_repo(repo_name, exist_ok=True, token=args.hub_token)
- repo = Repository(
- args.output_dir, clone_from=repo_name, token=args.hub_token)
+ repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
- with open(os.path.join(args.output_dir, ".gitignore"),
- "w+") as gitignore:
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
@@ -441,30 +461,26 @@ def main():
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
elif args.pretrained_model_name_or_path:
- tokenizer = AutoTokenizer.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
+ tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
# import correct text encoder class
- text_encoder_cls = import_model_class_from_model_name_or_path(
- args.pretrained_model_name_or_path)
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
# Load scheduler and models
- noise_scheduler = DDPMScheduler.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="scheduler")
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
text_encoder = text_encoder_cls.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "text_encoder"))
- text_config = (text_encoder.config if isinstance(text_encoder.config, dict)
- else text_encoder.config.to_dict())
- if (text_config.get("use_attention_mask", None) is not None and
- text_config["use_attention_mask"]):
+ url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")
+ )
+ text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict()
+ if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]:
use_attention_mask = True
else:
use_attention_mask = False
- vae = AutoencoderKL.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="vae")
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path,
- subfolder="unet", )
+ subfolder="unet",
+ )
freeze_params(vae.parameters())
if not args.train_text_encoder:
@@ -472,7 +488,8 @@ def main():
if args.use_ema:
ema_unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path,
- subfolder="unet", )
+ subfolder="unet",
+ )
ema_unet = EMAModel(ema_unet.parameters())
if args.gradient_checkpointing:
@@ -480,14 +497,14 @@ def main():
if args.train_text_encoder:
set_recompute(text_encoder, True)
- if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(
- ):
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
def compute_snr(timesteps):
"""
@@ -495,7 +512,7 @@ def compute_snr(timesteps):
"""
alphas_cumprod = noise_scheduler.alphas_cumprod
sqrt_alphas_cumprod = alphas_cumprod**0.5
- sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
# Expand the tensors.
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
@@ -504,15 +521,13 @@ def compute_snr(timesteps):
sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[
- timesteps].cast("float32")
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32")
while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[...,
- None]
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
# Compute SNR.
- snr = (alpha / sigma)**2
+ snr = (alpha / sigma) ** 2
return snr
# Get the datasets: you can either provide your own training and evaluation files (see below)
@@ -523,7 +538,8 @@ def compute_snr(timesteps):
if args.debug:
file_path = get_path_from_url_with_filelock(
"https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz",
- PPDIFFUSERS_CACHE, )
+ PPDIFFUSERS_CACHE,
+ )
dataset = DatasetDict.load_from_disk(file_path)
args.dataset_name = "lambdalabs/pokemon-blip-captions"
else:
@@ -532,7 +548,8 @@ def compute_snr(timesteps):
dataset = load_dataset(
args.dataset_name,
args.dataset_config_name,
- cache_dir=args.cache_dir, )
+ cache_dir=args.cache_dir,
+ )
else:
data_files = {}
if args.train_data_dir is not None:
@@ -540,7 +557,8 @@ def compute_snr(timesteps):
dataset = load_dataset(
"imagefolder",
data_files=data_files,
- cache_dir=args.cache_dir, )
+ cache_dir=args.cache_dir,
+ )
# See more about loading custom images at
# https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
@@ -551,8 +569,7 @@ def compute_snr(timesteps):
# 6. Get the column names for input/target.
dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
if args.image_column is None:
- image_column = (dataset_columns[0]
- if dataset_columns is not None else column_names[0])
+ image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
else:
image_column = args.image_column
if image_column not in column_names:
@@ -560,8 +577,7 @@ def compute_snr(timesteps):
f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
)
if args.caption_column is None:
- caption_column = (dataset_columns[1]
- if dataset_columns is not None else column_names[1])
+ caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
else:
caption_column = args.caption_column
if caption_column not in column_names:
@@ -578,8 +594,7 @@ def tokenize_captions(examples, is_train=True):
captions.append(caption)
elif isinstance(caption, (list, np.ndarray)):
# take a random caption if there are multiple
- captions.append(
- random.choice(caption) if is_train else caption[0])
+ captions.append(random.choice(caption) if is_train else caption[0])
else:
raise ValueError(
f"Caption column `{caption_column}` should contain either strings or lists of strings."
@@ -589,20 +604,22 @@ def tokenize_captions(examples, is_train=True):
max_length=tokenizer.model_max_length,
padding="do_not_pad",
truncation=True,
- return_attention_mask=False, )
+ return_attention_mask=False,
+ )
return inputs.input_ids
# Preprocessing the datasets.
- train_transforms = transforms.Compose([
- transforms.Resize(
- (args.height, args.width), interpolation="bilinear"),
- transforms.CenterCrop((args.height, args.width)) if args.center_crop
- else transforms.RandomCrop((args.height, args.width)),
- transforms.RandomHorizontalFlip()
- if args.random_flip else Lambda(lambda x: x),
- transforms.ToTensor(),
- transforms.Normalize([0.5], [0.5]),
- ])
+ train_transforms = transforms.Compose(
+ [
+ transforms.Resize((args.height, args.width), interpolation="bilinear"),
+ transforms.CenterCrop((args.height, args.width))
+ if args.center_crop
+ else transforms.RandomCrop((args.height, args.width)),
+ transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5], [0.5]),
+ ]
+ )
def preprocess_train(examples):
images = [image.convert("RGB") for image in examples[image_column]]
@@ -612,47 +629,42 @@ def preprocess_train(examples):
with main_process_first():
if args.max_train_samples is not None:
- dataset["train"] = (dataset["train"].shuffle(seed=args.seed)
- .select(range(args.max_train_samples)))
+ dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
# Set the training transforms
train_dataset = dataset["train"].with_transform(preprocess_train)
def collate_fn(examples):
- pixel_values = paddle.stack(
- [example["pixel_values"] for example in examples]).cast("float32")
+ pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32")
input_ids = [example["input_ids"] for example in examples]
input_ids = tokenizer.pad(
- {
- "input_ids": input_ids
- },
+ {"input_ids": input_ids},
padding="max_length",
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
return {
"input_ids": input_ids,
"pixel_values": pixel_values,
}
- train_sampler = (DistributedBatchSampler(
- train_dataset, batch_size=args.train_batch_size, shuffle=True)
- if num_processes > 1 else BatchSampler(
- train_dataset,
- batch_size=args.train_batch_size,
- shuffle=True))
+ train_sampler = (
+ DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ if num_processes > 1
+ else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ )
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
# Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps)
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps /
- num_update_steps_per_epoch)
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
if num_processes > 1:
unet = paddle.DataParallel(unet)
@@ -660,23 +672,22 @@ def collate_fn(examples):
text_encoder = paddle.DataParallel(text_encoder)
params_to_optimize = (
- list(unet.parameters()) + list(text_encoder.parameters())
- if args.train_text_encoder else unet.parameters())
+ list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+ )
if args.scale_lr:
- args.learning_rate = (args.learning_rate *
- args.gradient_accumulation_steps *
- args.train_batch_size * num_processes)
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes
+ )
lr_scheduler = get_scheduler(
args.lr_scheduler,
learning_rate=args.learning_rate,
- num_warmup_steps=args.lr_warmup_steps *
- args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps *
- args.gradient_accumulation_steps,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
num_cycles=args.lr_num_cycles,
- power=args.lr_power, )
+ power=args.lr_power,
+ )
# Initialize the optimizer
optimizer = AdamW(
learning_rate=lr_scheduler,
@@ -685,8 +696,8 @@ def collate_fn(examples):
beta2=args.adam_beta2,
weight_decay=args.adam_weight_decay,
epsilon=args.adam_epsilon,
- grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
- if args.max_grad_norm > 0 else None, )
+ grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None,
+ )
if is_main_process:
logger.info("----------- Configuration Arguments -----------")
@@ -696,25 +707,19 @@ def collate_fn(examples):
writer = get_report_to(args)
# Train!
- total_batch_size = (args.train_batch_size * num_processes *
- args.gradient_accumulation_steps)
+ total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not is_main_process)
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process)
progress_bar.set_description("Train Steps")
global_step = 0
@@ -737,20 +742,19 @@ def collate_fn(examples):
if args.noise_offset:
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
noise += args.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1),
- dtype=latents.dtype)
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype
+ )
batch_size = latents.shape[0]
# Sample a random timestep for each image
- timesteps = paddle.randint(
- 0, noise_scheduler.config.num_train_timesteps,
- (batch_size, )).cast("int64")
+ timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64")
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
- if num_processes > 1 and (args.gradient_checkpointing or (
- (step + 1) % args.gradient_accumulation_steps != 0)):
+ if num_processes > 1 and (
+ args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0)
+ ):
# grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0:
                     # gradient_checkpointing, no_sync everywhere
                     # gradient_checkpointing + grad_acc, no_sync everywhere
@@ -758,68 +762,61 @@ def collate_fn(examples):
if args.train_text_encoder:
text_encoder_ctx_manager = text_encoder.no_sync()
else:
- text_encoder_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7)
- else contextlib.suppress())
+ text_encoder_ctx_manager = (
+ contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ )
else:
- unet_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
- text_encoder_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ text_encoder_ctx_manager = (
+ contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ )
with text_encoder_ctx_manager:
# Get the text embedding for conditioning
if use_attention_mask:
- attention_mask = (batch["input_ids"] !=
- tokenizer.pad_token_id).cast("int64")
+ attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
else:
attention_mask = None
- encoder_hidden_states = text_encoder(
- batch["input_ids"], attention_mask=attention_mask)[0]
+ encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0]
with unet_ctx_manager:
# Predict the noise residual / sample
- model_pred = unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise,
- timesteps)
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {noise_scheduler.config.prediction_type}"
- )
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
if args.snr_gamma is None:
loss = F.mse_loss(
model_pred.cast("float32"),
target.cast("float32"),
- reduction="mean", )
+ reduction="mean",
+ )
else:
# Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
# Since we predict the noise instead of x_0, the original formulation is slightly changed.
# This is discussed in Section 4.2 of the same paper.
snr = compute_snr(timesteps)
- mse_loss_weights = (paddle.stack(
- [
- snr,
- args.snr_gamma * paddle.ones_like(timesteps)
- ],
- axis=1, ).min(1)[0] / snr)
+                        mse_loss_weights = (
+                            paddle.stack([snr, args.snr_gamma * paddle.ones_like(timesteps)], axis=1).min(1) / snr
+                        )
# We first calculate the original loss. Then we mean over the non-batch dimensions and
# rebalance the sample-wise losses with their respective loss weights.
# Finally, we take the mean of the rebalanced loss.
loss = F.mse_loss(
model_pred.cast("float32"),
target.cast("float32"),
- reduction="none", )
- loss = (loss.mean(axis=list(range(1, len(loss.shape))))
- * mse_loss_weights)
+ reduction="none",
+ )
+ loss = loss.mean(axis=list(range(1, len(loss.shape)))) * mse_loss_weights
loss = loss.mean()
if args.gradient_accumulation_steps > 1:
@@ -851,13 +848,10 @@ def collate_fn(examples):
writer.add_scalar(f"train/{name}", val, global_step)
if global_step % args.checkpointing_steps == 0:
- save_path = os.path.join(args.output_dir,
- f"checkpoint-{global_step}")
- unwrap_model(unet).save_pretrained(
- os.path.join(save_path, "unet"))
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+ unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet"))
if args.train_text_encoder:
- unwrap_model(text_encoder).save_pretrained(
- os.path.join(save_path, "text_encoder"))
+ unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder"))
if global_step >= args.max_train_steps:
break
@@ -871,14 +865,12 @@ def collate_fn(examples):
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
unet=unet,
- text_encoder=unwrap_model(text_encoder), )
+ text_encoder=unwrap_model(text_encoder),
+ )
pipeline.save_pretrained(args.output_dir)
if args.push_to_hub:
- repo.push_to_hub(
- commit_message="End of training",
- blocking=False,
- auto_lfs_prune=True)
+ repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
if __name__ == "__main__":
diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py
index b07bc09c1d1ae..611aebd6a5dc0 100644
--- a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py
+++ b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py
@@ -29,8 +29,9 @@
import paddle.nn.functional as F
from datasets import DatasetDict, load_dataset
from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from paddle.distributed.fleet.utils.hybrid_parallel_util import \
- fused_allreduce_gradients
+from paddle.distributed.fleet.utils.hybrid_parallel_util import (
+ fused_allreduce_gradients,
+)
from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler
from paddle.optimizer import AdamW
from paddle.vision import BaseTransform, transforms
@@ -40,31 +41,37 @@
from paddlenlp.utils.log import logger
from tqdm.auto import tqdm
-from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline,
- DPMSolverMultistepScheduler, UNet2DConditionModel,
- is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDPMScheduler,
+ DiffusionPipeline,
+ DPMSolverMultistepScheduler,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin
from ppdiffusers.models.attention_processor import (
- AttnProcessor, AttnProcessor2_5, LoRAAttnProcessor, LoRAAttnProcessor2_5)
+ AttnProcessor,
+ AttnProcessor2_5,
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_5,
+)
from ppdiffusers.optimization import get_scheduler
-from ppdiffusers.training_utils import (freeze_params, main_process_first,
- unwrap_model)
-from ppdiffusers.utils import (PPDIFFUSERS_CACHE, TEXT_ENCODER_ATTN_MODULE,
- check_min_version)
+from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model
+from ppdiffusers.utils import (
+ PPDIFFUSERS_CACHE,
+ TEXT_ENCODER_ATTN_MODULE,
+ check_min_version,
+)
check_min_version("0.16.1")
def url_or_path_join(*path_list):
- return (os.path.join(*path_list)
- if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list))
+ return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)
-def save_model_card(repo_id: str,
- images=None,
- base_model=str,
- dataset_name=str,
- repo_folder=None):
+def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
img_str = ""
for i, image in enumerate(images):
image.save(os.path.join(repo_folder, f"image_{i}.png"))
@@ -94,11 +101,11 @@ def save_model_card(repo_id: str,
f.write(yaml + model_card)
-def import_model_class_from_model_name_or_path(
- pretrained_model_name_or_path: str):
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
try:
text_encoder_config = PretrainedConfig.from_pretrained(
- url_or_path_join(pretrained_model_name_or_path, "text_encoder"))
+ url_or_path_join(pretrained_model_name_or_path, "text_encoder")
+ )
model_class = text_encoder_config.architectures[0]
except Exception:
model_class = "LDMBertModel"
@@ -107,8 +114,9 @@ def import_model_class_from_model_name_or_path(
return CLIPTextModel
elif model_class == "RobertaSeriesModelWithTransformation":
- from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \
- RobertaSeriesModelWithTransformation
+ from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+ RobertaSeriesModelWithTransformation,
+ )
return RobertaSeriesModelWithTransformation
elif model_class == "BertModel":
@@ -116,8 +124,9 @@ def import_model_class_from_model_name_or_path(
return BertModel
elif model_class == "LDMBertModel":
- from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
+ from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import (
+ LDMBertModel,
+ )
return LDMBertModel
else:
@@ -148,8 +157,7 @@ def get_report_to(args):
def parse_args(input_args=None):
- parser = argparse.ArgumentParser(
- description="Simple example of a training text to image lora script.")
+ parser = argparse.ArgumentParser(description="Simple example of a training text to image lora script.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -170,7 +178,8 @@ def parse_args(input_args=None):
help=(
"The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
" dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
- " or to a folder containing files that 🤗 Datasets can understand."),
+ " or to a folder containing files that 🤗 Datasets can understand."
+ ),
)
parser.add_argument(
"--dataset_config_name",
@@ -186,12 +195,14 @@ def parse_args(input_args=None):
"A folder containing the training data. Folder contents must follow the structure described in"
" https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
" must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
- ), )
+ ),
+ )
parser.add_argument(
"--image_column",
type=str,
default="image",
- help="The column of the dataset containing an image.", )
+ help="The column of the dataset containing an image.",
+ )
parser.add_argument(
"--caption_column",
type=str,
@@ -202,7 +213,8 @@ def parse_args(input_args=None):
"--validation_prompt",
type=str,
default=None,
- help="A prompt that is sampled during training for inference.", )
+ help="A prompt that is sampled during training for inference.",
+ )
parser.add_argument(
"--num_validation_images",
type=int,
@@ -216,14 +228,17 @@ def parse_args(input_args=None):
help=(
"Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
" `args.validation_prompt` multiple times: `args.num_validation_images`."
- ), )
+ ),
+ )
parser.add_argument(
"--max_train_samples",
type=int,
default=None,
help=(
"For debugging purposes or quicker training, truncate the number of training examples to this "
- "value if set."), )
+ "value if set."
+ ),
+ )
parser.add_argument(
"--output_dir",
type=str,
@@ -236,32 +251,34 @@ def parse_args(input_args=None):
default=None,
help="The directory where the downloaded models and datasets will be stored.",
)
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training.")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--height",
type=int,
default=None,
help=(
"The height for input images, all the images in the train/validation dataset will be resized to this"
- " height"), )
+ " height"
+ ),
+ )
parser.add_argument(
"--width",
type=int,
default=None,
help=(
"The width for input images, all the images in the train/validation dataset will be resized to this"
- " width"), )
+ " width"
+ ),
+ )
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--center_crop",
default=False,
@@ -269,21 +286,25 @@ def parse_args(input_args=None):
help=(
"Whether to center crop the input images to the resolution. If not set, the images will be randomly"
" cropped. The images will be resized to the resolution first before cropping."
- ), )
+ ),
+ )
parser.add_argument(
"--lora_rank",
type=int,
default=4,
- help="The rank of lora linear.", )
+ help="The rank of lora linear.",
+ )
parser.add_argument(
"--random_flip",
action="store_true",
- help="whether to randomly flip images horizontally", )
+ help="whether to randomly flip images horizontally",
+ )
parser.add_argument(
"--train_batch_size",
type=int,
default=16,
- help="Batch size (per device) for the training dataloader.", )
+ help="Batch size (per device) for the training dataloader.",
+ )
parser.add_argument(
"--train_text_encoder",
action="store_true",
@@ -300,7 +321,8 @@ def parse_args(input_args=None):
"--checkpointing_steps",
type=int,
default=500,
- help=("Save a checkpoint of the training state every X updates."), )
+ help=("Save a checkpoint of the training state every X updates."),
+ )
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
@@ -330,12 +352,15 @@ def parse_args(input_args=None):
default="constant",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'), )
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
parser.add_argument(
"--lr_warmup_steps",
type=int,
default=500,
- help="Number of steps for the warmup in the lr scheduler.", )
+ help="Number of steps for the warmup in the lr scheduler.",
+ )
parser.add_argument(
"--lr_num_cycles",
type=int,
@@ -346,49 +371,48 @@ def parse_args(input_args=None):
"--lr_power",
type=float,
default=1.0,
- help="Power factor of the polynomial scheduler.", )
- parser.add_argument(
- "--debug",
- action="store_true",
- help="Whether to debug this training script.")
+ help="Power factor of the polynomial scheduler.",
+ )
+ parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.")
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help=(
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ), )
+ ),
+ )
parser.add_argument(
"--adam_beta1",
type=float,
default=0.9,
- help="The beta1 parameter for the Adam optimizer.", )
+ help="The beta1 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_beta2",
type=float,
default=0.999,
- help="The beta2 parameter for the Adam optimizer.", )
- parser.add_argument(
- "--adam_weight_decay",
- type=float,
- default=1e-2,
- help="Weight decay to use.")
+ help="The beta2 parameter for the Adam optimizer.",
+ )
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
parser.add_argument(
"--adam_epsilon",
type=float,
default=1e-08,
- help="Epsilon value for the Adam optimizer", )
- parser.add_argument(
- "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ help="Epsilon value for the Adam optimizer",
+ )
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--push_to_hub",
action="store_true",
- help="Whether or not to push the model to the Hub.", )
+ help="Whether or not to push the model to the Hub.",
+ )
parser.add_argument(
"--hub_token",
type=str,
default=None,
- help="The token to use to push to the Model Hub.", )
+ help="The token to use to push to the Model Hub.",
+ )
parser.add_argument(
"--hub_model_id",
type=str,
@@ -401,22 +425,22 @@ def parse_args(input_args=None):
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to"
- "*output_dir/logs"), )
+ "*output_dir/logs"
+ ),
+ )
parser.add_argument(
"--report_to",
type=str,
default="visualdl",
choices=["tensorboard", "visualdl"],
- help="Log writer type.", )
+ help="Log writer type.",
+ )
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether or not to use xformers.", )
- parser.add_argument(
- "--noise_offset",
- type=float,
- default=0,
- help="The scale of noise offset.")
+ help="Whether or not to use xformers.",
+ )
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
if input_args is not None:
args = parser.parse_args(input_args)
else:
@@ -432,9 +456,7 @@ def parse_args(input_args=None):
return args
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -444,7 +466,9 @@ def get_full_repo_name(model_id: str,
return f"{organization}/{model_id}"
-DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"), }
+DATASET_NAME_MAPPING = {
+ "lambdalabs/pokemon-blip-captions": ("image", "text"),
+}
def main():
@@ -465,16 +489,13 @@ def main():
os.makedirs(args.output_dir, exist_ok=True)
if args.push_to_hub:
if args.hub_model_id is None:
- repo_name = get_full_repo_name(
- Path(args.output_dir).name, token=args.hub_token)
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
create_repo(repo_name, exist_ok=True, token=args.hub_token)
- repo = Repository(
- args.output_dir, clone_from=repo_name, token=args.hub_token)
+ repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
- with open(os.path.join(args.output_dir, ".gitignore"),
- "w+") as gitignore:
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
@@ -484,44 +505,40 @@ def main():
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
elif args.pretrained_model_name_or_path:
- tokenizer = AutoTokenizer.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
+ tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
# import correct text encoder class
- text_encoder_cls = import_model_class_from_model_name_or_path(
- args.pretrained_model_name_or_path)
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
# Load scheduler and models
- noise_scheduler = DDPMScheduler.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="scheduler")
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
text_encoder = text_encoder_cls.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "text_encoder"))
- text_config = (text_encoder.config if isinstance(text_encoder.config, dict)
- else text_encoder.config.to_dict())
- if (text_config.get("use_attention_mask", None) is not None and
- text_config["use_attention_mask"]):
+ url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")
+ )
+ text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict()
+ if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]:
use_attention_mask = True
else:
use_attention_mask = False
- vae = AutoencoderKL.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="vae")
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path,
- subfolder="unet", )
+ subfolder="unet",
+ )
# We only train the additional adapter LoRA layers
freeze_params(vae.parameters())
freeze_params(text_encoder.parameters())
freeze_params(unet.parameters())
- if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(
- ):
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warning(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
# now we will add new LoRA weights to the attention layers
# It's important to realize here how many attention weights will be added and of which sizes
# The sizes of the attention layers consist only of two different variables:
@@ -538,14 +555,12 @@ def main():
# Set correct lora layers
unet_lora_attn_procs = {}
for name, attn_processor in unet.attn_processors.items():
- cross_attention_dim = (None if name.endswith("attn1.processor") else
- unet.config.cross_attention_dim)
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(unet.config.block_out_channels))[
- block_id]
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
@@ -555,14 +570,13 @@ def main():
elif isinstance(attn_processor, AttnProcessor2_5):
lora_attn_processor_class = LoRAAttnProcessor2_5
else:
- raise ValueError(
- f"Unknown attention processor type: {attn_processor.__class__.__name__}"
- )
+ raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}")
unet_lora_attn_procs[name] = lora_attn_processor_class(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
- rank=args.lora_rank, )
+ rank=args.lora_rank,
+ )
unet.set_attn_processor(unet_lora_attn_procs)
unet_lora_layers = AttnProcsLayers(unet.attn_processors)
@@ -578,10 +592,12 @@ def main():
text_lora_attn_procs[name] = LoRAAttnProcessor(
hidden_size=module.out_proj.weight.shape[1],
cross_attention_dim=None,
- rank=args.lora_rank, )
+ rank=args.lora_rank,
+ )
text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
temp_pipeline = DiffusionPipeline.from_pretrained(
- args.pretrained_model_name_or_path, text_encoder=text_encoder)
+ args.pretrained_model_name_or_path, text_encoder=text_encoder
+ )
temp_pipeline._modify_text_encoder(text_lora_attn_procs)
text_encoder = temp_pipeline.text_encoder
del temp_pipeline
@@ -594,7 +610,8 @@ def main():
if args.debug:
file_path = get_path_from_url_with_filelock(
"https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz",
- PPDIFFUSERS_CACHE, )
+ PPDIFFUSERS_CACHE,
+ )
dataset = DatasetDict.load_from_disk(file_path)
args.dataset_name = "lambdalabs/pokemon-blip-captions"
else:
@@ -603,7 +620,8 @@ def main():
dataset = load_dataset(
args.dataset_name,
args.dataset_config_name,
- cache_dir=args.cache_dir, )
+ cache_dir=args.cache_dir,
+ )
else:
data_files = {}
if args.train_data_dir is not None:
@@ -611,7 +629,8 @@ def main():
dataset = load_dataset(
"imagefolder",
data_files=data_files,
- cache_dir=args.cache_dir, )
+ cache_dir=args.cache_dir,
+ )
# See more about loading custom images at
# https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
@@ -622,8 +641,7 @@ def main():
# 6. Get the column names for input/target.
dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
if args.image_column is None:
- image_column = (dataset_columns[0]
- if dataset_columns is not None else column_names[0])
+ image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
else:
image_column = args.image_column
if image_column not in column_names:
@@ -631,8 +649,7 @@ def main():
f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
)
if args.caption_column is None:
- caption_column = (dataset_columns[1]
- if dataset_columns is not None else column_names[1])
+ caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
else:
caption_column = args.caption_column
if caption_column not in column_names:
@@ -649,8 +666,7 @@ def tokenize_captions(examples, is_train=True):
captions.append(caption)
elif isinstance(caption, (list, np.ndarray)):
# take a random caption if there are multiple
- captions.append(
- random.choice(caption) if is_train else caption[0])
+ captions.append(random.choice(caption) if is_train else caption[0])
else:
raise ValueError(
f"Caption column `{caption_column}` should contain either strings or lists of strings."
@@ -660,20 +676,22 @@ def tokenize_captions(examples, is_train=True):
max_length=tokenizer.model_max_length,
padding="do_not_pad",
truncation=True,
- return_attention_mask=False, )
+ return_attention_mask=False,
+ )
return inputs.input_ids
# Preprocessing the datasets.
- train_transforms = transforms.Compose([
- transforms.Resize(
- (args.height, args.width), interpolation="bilinear"),
- transforms.CenterCrop((args.height, args.width)) if args.center_crop
- else transforms.RandomCrop((args.height, args.width)),
- transforms.RandomHorizontalFlip()
- if args.random_flip else Lambda(lambda x: x),
- transforms.ToTensor(),
- transforms.Normalize([0.5], [0.5]),
- ])
+ train_transforms = transforms.Compose(
+ [
+ transforms.Resize((args.height, args.width), interpolation="bilinear"),
+ transforms.CenterCrop((args.height, args.width))
+ if args.center_crop
+ else transforms.RandomCrop((args.height, args.width)),
+ transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5], [0.5]),
+ ]
+ )
def preprocess_train(examples):
images = [image.convert("RGB") for image in examples[image_column]]
@@ -683,67 +701,62 @@ def preprocess_train(examples):
with main_process_first():
if args.max_train_samples is not None:
- dataset["train"] = (dataset["train"].shuffle(seed=args.seed)
- .select(range(args.max_train_samples)))
+ dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
# Set the training transforms
train_dataset = dataset["train"].with_transform(preprocess_train)
def collate_fn(examples):
- pixel_values = paddle.stack(
- [example["pixel_values"] for example in examples]).cast("float32")
+ pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32")
input_ids = [example["input_ids"] for example in examples]
input_ids = tokenizer.pad(
- {
- "input_ids": input_ids
- },
+ {"input_ids": input_ids},
padding="max_length",
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
return {
"input_ids": input_ids,
"pixel_values": pixel_values,
}
- train_sampler = (DistributedBatchSampler(
- train_dataset, batch_size=args.train_batch_size, shuffle=True)
- if num_processes > 1 else BatchSampler(
- train_dataset,
- batch_size=args.train_batch_size,
- shuffle=True))
+ train_sampler = (
+ DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ if num_processes > 1
+ else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ )
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
# Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps)
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps /
- num_update_steps_per_epoch)
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
if args.scale_lr:
- args.learning_rate = (args.learning_rate *
- args.gradient_accumulation_steps *
- args.train_batch_size * num_processes)
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes
+ )
lr_scheduler = get_scheduler(
args.lr_scheduler,
learning_rate=args.learning_rate,
- num_warmup_steps=args.lr_warmup_steps *
- args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps *
- args.gradient_accumulation_steps,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
num_cycles=args.lr_num_cycles,
- power=args.lr_power, )
+ power=args.lr_power,
+ )
- params_to_optimize = (list(unet_lora_layers.parameters()) +
- list(text_encoder_lora_layers.parameters())
- if args.train_text_encoder else
- unet_lora_layers.parameters())
+ params_to_optimize = (
+ list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters())
+ if args.train_text_encoder
+ else unet_lora_layers.parameters()
+ )
# Optimizer creation
optimizer = AdamW(
learning_rate=lr_scheduler,
@@ -752,8 +765,8 @@ def collate_fn(examples):
beta2=args.adam_beta2,
weight_decay=args.adam_weight_decay,
epsilon=args.adam_epsilon,
- grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
- if args.max_grad_norm > 0 else None, )
+ grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None,
+ )
if num_processes > 1:
unet = paddle.DataParallel(unet)
@@ -768,25 +781,19 @@ def collate_fn(examples):
writer = get_report_to(args)
# Train!
- total_batch_size = (args.train_batch_size * num_processes *
- args.gradient_accumulation_steps)
+ total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not is_main_process)
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process)
progress_bar.set_description("Train Steps")
global_step = 0
vae.eval()
@@ -807,52 +814,43 @@ def collate_fn(examples):
if args.noise_offset:
# https://www.crosslabs.org/blog/diffusion-with-offset-noise
noise += args.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1),
- dtype=latents.dtype)
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype
+ )
batch_size = latents.shape[0]
# Sample a random timestep for each image
- timesteps = paddle.randint(
- 0, noise_scheduler.config.num_train_timesteps,
- (batch_size, )).cast("int64")
+ timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64")
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
- if num_processes > 1 and (args.gradient_checkpointing or (
- (step + 1) % args.gradient_accumulation_steps != 0)):
+ if num_processes > 1 and (
+ args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0)
+ ):
# grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0:
                     # gradient_checkpointing, no_sync everywhere
                     # gradient_checkpointing + grad_acc, no_sync everywhere
unet_ctx_manager = unet.no_sync()
else:
- unet_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
if use_attention_mask:
- attention_mask = (
- batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
+ attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
else:
attention_mask = None
- encoder_hidden_states = text_encoder(
- batch["input_ids"], attention_mask=attention_mask)[0]
+ encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0]
with unet_ctx_manager:
# Predict the noise residual / sample
- model_pred = unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise,
- timesteps)
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {noise_scheduler.config.prediction_type}"
- )
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
loss = F.mse_loss(model_pred, target, reduction="mean")
@@ -883,52 +881,51 @@ def collate_fn(examples):
writer.add_scalar(f"train/{name}", val, global_step)
if global_step % args.checkpointing_steps == 0:
- save_path = os.path.join(args.output_dir,
- f"checkpoint-{global_step}")
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
# We combine the text encoder and UNet LoRA parameters with a simple
# custom logic. So, use `LoraLoaderMixin.save_lora_weights()`.
LoraLoaderMixin.save_lora_weights(
save_directory=save_path,
unet_lora_layers=unet_lora_layers,
- text_encoder_lora_layers=text_encoder_lora_layers, )
+ text_encoder_lora_layers=text_encoder_lora_layers,
+ )
logger.info(f"Saved lora weights to {save_path}")
if global_step >= args.max_train_steps:
break
if is_main_process:
- if (args.validation_prompt is not None and
- epoch % args.validation_epochs == 0):
+ if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
logger.info(
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
- f" {args.validation_prompt}.")
+ f" {args.validation_prompt}."
+ )
# create pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
unet=unwrap_model(unet),
text_encoder=unwrap_model(text_encoder),
safety_checker=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
pipeline.set_progress_bar_config(disable=True)
# run inference
- generator = (paddle.Generator().manual_seed(args.seed)
- if args.seed else None)
+ generator = paddle.Generator().manual_seed(args.seed) if args.seed else None
images = [
pipeline(
args.validation_prompt,
num_inference_steps=30,
- generator=generator, ).images[0]
+ generator=generator,
+ ).images[0]
for _ in range(args.num_validation_images)
]
np_images = np.stack([np.asarray(img) for img in images])
if args.report_to == "tensorboard":
- writer.add_images(
- "validation", np_images, epoch, dataformats="NHWC")
+ writer.add_images("validation", np_images, epoch, dataformats="NHWC")
else:
- writer.add_image(
- "validation", np_images, epoch, dataformats="NHWC")
+ writer.add_image("validation", np_images, epoch, dataformats="NHWC")
del pipeline
gc.collect()
@@ -941,7 +938,8 @@ def collate_fn(examples):
LoraLoaderMixin.save_lora_weights(
save_directory=args.output_dir,
unet_lora_layers=unet_lora_layers,
- text_encoder_lora_layers=text_encoder_lora_layers, )
+ text_encoder_lora_layers=text_encoder_lora_layers,
+ )
if args.push_to_hub:
save_model_card(
@@ -949,31 +947,25 @@ def collate_fn(examples):
images=images,
base_model=args.pretrained_model_name_or_path,
             dataset_name=args.dataset_name,
- repo_folder=args.output_dir, )
- repo.push_to_hub(
- commit_message="End of training",
- blocking=False,
- auto_lfs_prune=True)
+ repo_folder=args.output_dir,
+ )
+ repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
# Final inference
# Load previous pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
safety_checker=None,
- requires_safety_checker=False, )
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
- pipeline.scheduler.config)
+ requires_safety_checker=False,
+ )
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
# load attention processors
pipeline.load_lora_weights(args.output_dir)
# run inference
if args.validation_prompt and args.num_validation_images > 0:
- generator = paddle.Generator().manual_seed(
- args.seed) if args.seed else None
+ generator = paddle.Generator().manual_seed(args.seed) if args.seed else None
images = [
- pipeline(
- args.validation_prompt,
- num_inference_steps=30,
- generator=generator).images[0]
+ pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
for _ in range(args.num_validation_images)
]
np_images = np.stack([np.asarray(img) for img in images])
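
As an aside, a minimal sketch of reusing the LoRA weights saved above for inference; `base_model`, `output_dir`, and the prompt are placeholders rather than values taken from this script:

    import paddle
    from ppdiffusers import DiffusionPipeline, DPMSolverMultistepScheduler

    base_model = "<pretrained_model_name_or_path>"  # placeholder: the base model used for training
    output_dir = "./lora_outputs"                   # placeholder: args.output_dir from the run above
    pipe = DiffusionPipeline.from_pretrained(base_model, safety_checker=None, requires_safety_checker=False)
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)  # same swap as the final-inference path
    pipe.load_lora_weights(output_dir)              # attention processors written by save_lora_weights
    generator = paddle.Generator().manual_seed(42)  # fixed seed for reproducible validation images
    image = pipe("a photo of the trained subject", num_inference_steps=30, generator=generator).images[0]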
diff --git a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py
index 9f7e732a9033e..c8527964620b4 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py
@@ -20,9 +20,13 @@
import pandas as pd
from tqdm.auto import tqdm
-from ppdiffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler,
- LDMTextToImagePipeline, LMSDiscreteScheduler,
- PNDMScheduler)
+from ppdiffusers import (
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LDMTextToImagePipeline,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
def batchify(data, batch_size=16):
@@ -37,18 +41,19 @@ def batchify(data, batch_size=16):
def generate_images(
- model_name_or_path,
- batch_size=16,
- file="coco30k.csv",
- save_path="output",
- seed=42,
- scheduler_type="ddim",
- eta=0.0,
- num_inference_steps=50,
- guidance_scales=[3, 4, 5, 6, 7, 8],
- height=256,
- width=256,
- device="gpu", ):
+ model_name_or_path,
+ batch_size=16,
+ file="coco30k.csv",
+ save_path="output",
+ seed=42,
+ scheduler_type="ddim",
+ eta=0.0,
+ num_inference_steps=50,
+ guidance_scales=[3, 4, 5, 6, 7, 8],
+ height=256,
+ width=256,
+ device="gpu",
+):
paddle.set_device(device)
pipe = LDMTextToImagePipeline.from_pretrained(model_name_or_path)
pipe.set_progress_bar_config(disable=True)
@@ -62,17 +67,14 @@ def generate_images(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif scheduler_type == "euler-ancestral":
scheduler = EulerAncestralDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+ )
elif scheduler_type == "ddim":
scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -81,7 +83,8 @@ def generate_images(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
pipe.scheduler = scheduler
@@ -103,7 +106,8 @@ def generate_images(
eta=eta,
height=height,
width=width,
- num_inference_steps=num_inference_steps, )[0]
+ num_inference_steps=num_inference_steps,
+ )[0]
for image in images:
path = os.path.join(new_save_path, "{:05d}_000.png".format(i))
image.save(path)
@@ -117,17 +121,20 @@ def generate_images(
default=None,
type=str,
required=True,
- help="model_name_or_path.", )
+ help="model_name_or_path.",
+ )
parser.add_argument(
"--file",
default="./coco30k.tsv",
type=str,
- help="eval file.", )
+ help="eval file.",
+ )
parser.add_argument(
"--seed",
default=42,
type=int,
- help="random seed.", )
+ help="random seed.",
+ )
parser.add_argument(
"--scheduler_type",
default="ddim",
@@ -137,22 +144,20 @@ def generate_images(
)
parser.add_argument("--device", default="gpu", type=str, help="device")
parser.add_argument("--batch_size", default=16, type=int, help="batch_size")
- parser.add_argument(
- "--num_inference_steps",
- default=50,
- type=int,
- help="num_inference_steps")
+ parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps")
parser.add_argument(
"--save_path",
default="output/1.5b_ldm/12w.pd",
type=str,
- help="Path to the output file.", )
+ help="Path to the output file.",
+ )
parser.add_argument(
"--guidance_scales",
default=[3, 4, 5, 6, 7, 8],
nargs="+",
type=str,
- help="guidance_scales list.", )
+ help="guidance_scales list.",
+ )
parser.add_argument("--height", default=256, type=int, help="height.")
parser.add_argument("--width", default=256, type=int, help="width.")
args = parser.parse_args()
@@ -171,4 +176,5 @@ def generate_images(
scheduler_type=args.scheduler_type,
height=args.height,
width=args.width,
- device=args.device, )
+ device=args.device,
+ )
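
As a usage note, the reformatted signature above can be driven directly when called from this module (or after importing it); the values below mirror the argparse defaults in this file, and the model path is a placeholder:

    generate_images(
        model_name_or_path="./ldm_pipelines",   # placeholder: a pipeline directory, e.g. one built by generate_pipelines.py
        file="./coco30k.tsv",
        batch_size=16,
        seed=42,
        scheduler_type="ddim",                  # "lms" and "euler-ancestral" are also handled above
        num_inference_steps=50,
        guidance_scales=[3, 4, 5, 6, 7, 8],
        height=256,
        width=256,
        save_path="output",
        device="gpu",
    )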
diff --git a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py
index c89f6fd190bf7..069fde479ce3d 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py
@@ -19,8 +19,13 @@
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LDMBertModel,
- LDMTextToImagePipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LDMBertModel,
+ LDMTextToImagePipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig
@@ -30,27 +35,32 @@ def parse_args():
"--model_file",
type=str,
default="./model_state.pdparams",
- help="path to pretrained model_state.pdparams", )
+ help="path to pretrained model_state.pdparams",
+ )
parser.add_argument(
"--output_path",
type=str,
default="./ldm_pipelines",
- help="the output path of pipeline.", )
+ help="the output path of pipeline.",
+ )
parser.add_argument(
"--vae_name_or_path",
type=str,
default="CompVis/stable-diffusion-v1-4/vae",
- help="pretrained_vae_name_or_path.", )
+ help="pretrained_vae_name_or_path.",
+ )
parser.add_argument(
"--text_encoder_config_file",
type=str,
default="./config/ldmbert.json",
- help="text_encoder_config_file.", )
+ help="text_encoder_config_file.",
+ )
parser.add_argument(
"--unet_config_file",
type=str,
default="./config/unet.json",
- help="unet_config_file.", )
+ help="unet_config_file.",
+ )
parser.add_argument(
"--tokenizer_name_or_path",
type=str,
@@ -61,12 +71,9 @@ def parse_args():
"--model_max_length",
type=int,
default=77,
- help="Pretrained tokenizer model_max_length.", )
- parser.add_argument(
- "--device",
- type=str,
- default=None,
- help="Device to use. Like gpu:0 or cpu")
+ help="Pretrained tokenizer model_max_length.",
+ )
+ parser.add_argument("--device", type=str, default=None, help="Device to use. Like gpu:0 or cpu")
return parser.parse_args()
@@ -119,17 +126,17 @@ def check_keys(model, state_dict):
def build_pipelines(
- model_file,
- output_path,
- vae_name_or_path,
- unet_config_file,
- text_encoder_config_file,
- tokenizer_name_or_path="bert-base-uncased",
- model_max_length=77, ):
+ model_file,
+ output_path,
+ vae_name_or_path,
+ unet_config_file,
+ text_encoder_config_file,
+ tokenizer_name_or_path="bert-base-uncased",
+ model_max_length=77,
+):
vae = AutoencoderKL.from_config(vae_name_or_path)
unet = UNet2DConditionModel(**read_json(unet_config_file))
- tokenizer = AutoTokenizer.from_pretrained(
- tokenizer_name_or_path, model_max_length=model_max_length)
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=model_max_length)
text_encoder_config = read_json(text_encoder_config_file)
vocab_size = text_encoder_config["vocab_size"]
max_position_embeddings = text_encoder_config["max_position_embeddings"]
@@ -143,8 +150,7 @@ def build_pipelines(
logger.info(
f"The tokenizer's model_max_length {tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {tokenizer.model_max_length} as max_position_embeddings!"
)
- text_encoder_config[
- "max_position_embeddings"] = tokenizer.model_max_length
+ text_encoder_config["max_position_embeddings"] = tokenizer.model_max_length
    config = LDMBertConfig(**text_encoder_config)
    text_encoder = LDMBertModel(config)
scheduler = DDIMScheduler(
@@ -154,7 +160,8 @@ def build_pipelines(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
unet_dict, vae_dict, text_encoder_dict = extract_paramaters(model_file)
check_keys(unet, unet_dict)
check_keys(vae, vae_dict)
@@ -167,7 +174,8 @@ def build_pipelines(
tokenizer=tokenizer,
scheduler=scheduler,
vqvae=vae,
- unet=unet, )
+ unet=unet,
+ )
pipe.save_pretrained(output_path)
@@ -182,4 +190,5 @@ def build_pipelines(
unet_config_file=args.unet_config_file,
text_encoder_config_file=args.text_encoder_config_file,
tokenizer_name_or_path=args.tokenizer_name_or_path,
- model_max_length=args.model_max_length, )
+ model_max_length=args.model_max_length,
+ )
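
For completeness, a minimal invocation of the converter above using the argparse defaults shown in this file:

    build_pipelines(
        model_file="./model_state.pdparams",
        output_path="./ldm_pipelines",
        vae_name_or_path="CompVis/stable-diffusion-v1-4/vae",
        unet_config_file="./config/unet.json",
        text_encoder_config_file="./config/ldmbert.json",
        tokenizer_name_or_path="bert-base-uncased",
        model_max_length=77,
    )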
diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py
index f7c2e091bed03..0443a7224578e 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py
@@ -29,51 +29,43 @@ class ModelArguments:
# use pretrained vae kl-8.ckpt (CompVis/stable-diffusion-v1-4/vae)
vae_name_or_path: Optional[str] = field(
default="CompVis/stable-diffusion-v1-4/vae",
- metadata={"help": "pretrained_vae_name_or_path"}, )
+ metadata={"help": "pretrained_vae_name_or_path"},
+ )
text_encoder_config_file: Optional[str] = field(
- default="./config/ldmbert.json",
- metadata={"help": "text_encoder_config_file"})
- unet_config_file: Optional[str] = field(
- default="./config/unet.json", metadata={"help": "unet_config_file"})
+ default="./config/ldmbert.json", metadata={"help": "text_encoder_config_file"}
+ )
+ unet_config_file: Optional[str] = field(default="./config/unet.json", metadata={"help": "unet_config_file"})
tokenizer_name: Optional[str] = field(
default="bert-base-uncased",
- metadata={
- "help":
- "Pretrained tokenizer name or path if not the same as model_name"
- }, )
- model_max_length: Optional[int] = field(
- default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
- num_inference_steps: Optional[int] = field(
- default=200, metadata={"help": "num_inference_steps"})
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
+ )
+ model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"})
+ num_inference_steps: Optional[int] = field(default=200, metadata={"help": "num_inference_steps"})
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"})
pretrained_model_name_or_path: str = field(
default=None,
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
- image_logging_steps: Optional[int] = field(
- default=1000, metadata={"help": "Log image every X steps."})
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
+ image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable_xformers_memory_efficient_attention."})
- to_static: bool = field(
- default=False, metadata={"help": "Whether or not to_static"})
+ default=False, metadata={"help": "enable_xformers_memory_efficient_attention."}
+ )
+ to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"})
prediction_type: Optional[str] = field(
default="epsilon",
metadata={
- "help":
- "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)"
- }, )
+ "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)"
+ },
+ )
benchmark: bool = field(
default=False,
- metadata={"help": "Whether or not run benchmark."}, )
+ metadata={"help": "Whether or not run benchmark."},
+ )
profiler_options: Optional[str] = field(
default=None,
- metadata={"help": "profiler_options."}, )
- noise_offset: Optional[int] = field(
- default=0, metadata={"help": "The scale of noise offset."})
+ metadata={"help": "profiler_options."},
+ )
+ noise_offset: Optional[int] = field(default=0, metadata={"help": "The scale of noise offset."})
@dataclass
@@ -84,113 +76,89 @@ class DataArguments:
file_list: str = field(
default="./data/filelist/train.filelist.list",
- metadata={"help": "The name of the file_list."}, )
+ metadata={"help": "The name of the file_list."},
+ )
resolution: int = field(
default=256,
metadata={
- "help":
- "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
- }, )
+ "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution."
+ },
+ )
num_records: int = field(default=10000000, metadata={"help": "num_records"})
buffer_size: int = field(
default=100,
- metadata={"help": "Buffer size"}, )
+ metadata={"help": "Buffer size"},
+ )
shuffle_every_n_samples: int = field(
default=5,
- metadata={"help": "shuffle_every_n_samples."}, )
+ metadata={"help": "shuffle_every_n_samples."},
+ )
@dataclass
class NoTrainerTrainingArguments:
output_dir: str = field(
default="outputs",
- metadata={
- "help":
- "The output directory where the model predictions and checkpoints will be written."
- }, )
+ metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+ )
per_device_train_batch_size: int = field(
- default=16,
- metadata={"help": "Batch size per GPU core/CPU for training."})
+ default=16, metadata={"help": "Batch size per GPU core/CPU for training."}
+ )
gradient_accumulation_steps: int = field(
default=2,
- metadata={
- "help":
- "Number of updates steps to accumulate before performing a backward/update pass."
- }, )
- learning_rate: float = field(
- default=5e-5,
- metadata={"help": "The initial learning rate for AdamW."})
- weight_decay: float = field(
- default=0.02,
- metadata={"help": "Weight decay for AdamW if we apply some."})
- adam_beta1: float = field(
- default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
- adam_beta2: float = field(
- default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
- adam_epsilon: float = field(
- default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
- max_grad_norm: float = field(
- default=-1.0, metadata={"help": "Max gradient norm."})
- num_train_epochs: int = field(
- default=100,
- metadata={"help": "Total number of training epochs to perform."})
+ metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
+ )
+ learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+ weight_decay: float = field(default=0.02, metadata={"help": "Weight decay for AdamW if we apply some."})
+ adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+ adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+ adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+ max_grad_norm: float = field(default=-1.0, metadata={"help": "Max gradient norm."})
+ num_train_epochs: int = field(default=100, metadata={"help": "Total number of training epochs to perform."})
max_steps: int = field(
default=1000000000,
- metadata={
- "help":
- "If > 0: set total number of training steps to perform. Override num_train_epochs."
- }, )
+ metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
+ )
lr_scheduler_type: str = field(
default="constant",
metadata={
- "help":
- 'The scheduler type to use. support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]'
- }, )
- warmup_steps: int = field(
- default=0, metadata={"help": "Linear warmup over warmup_steps."})
+ "help": 'The scheduler type to use. support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]'
+ },
+ )
+ warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
- logging_dir: Optional[str] = field(
- default="logs", metadata={"help": "VisualDL log dir."})
+ logging_dir: Optional[str] = field(default="logs", metadata={"help": "VisualDL log dir."})
- logging_steps: int = field(
- default=50, metadata={"help": "Log every X updates steps."})
+ logging_steps: int = field(default=50, metadata={"help": "Log every X updates steps."})
- save_steps: int = field(
- default=5000,
- metadata={"help": "Save checkpoint every X updates steps."})
+ save_steps: int = field(default=5000, metadata={"help": "Save checkpoint every X updates steps."})
seed: int = field(
default=23,
- metadata={
- "help": "Random seed that will be set at the beginning of training."
- }, )
+ metadata={"help": "Random seed that will be set at the beginning of training."},
+ )
dataloader_num_workers: int = field(
default=6,
metadata={
- "help":
- "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- }, )
+ "help": "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+ },
+ )
report_to: str = field(
default="visualdl",
- metadata={
- "help":
- "The list of integrations to report the results and logs to."
- }, )
+ metadata={"help": "The list of integrations to report the results and logs to."},
+ )
recompute: bool = field(
default=False,
metadata={
- "help":
- "Recompute the forward pass to calculate gradients. Used for saving memory. "
+ "help": "Recompute the forward pass to calculate gradients. Used for saving memory. "
"Only support for networks with transformer blocks."
- }, )
+ },
+ )
def __str__(self):
self_as_dict = asdict(self)
- self_as_dict = {
- k: f"<{k.upper()}>" if k.endswith("_token") else v
- for k, v in self_as_dict.items()
- }
+ self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()}
attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())]
return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})"
@@ -207,8 +175,7 @@ def print_config(self, args=None, key=""):
key = "Training"
logger.info("{:^40}".format("{} Configuration Arguments".format(key)))
- logger.info("{:30}:{}".format("paddle commit id",
- paddle.version.commit))
+ logger.info("{:30}:{}".format("paddle commit id", paddle.version.commit))
for a in dir(args):
if a[:2] != "__": # don't print double underscore methods
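
A small self-contained sketch of the field/metadata pattern and the token-masking `__str__` logic above; the `Example` dataclass and its `hub_token` field are made up for illustration:

    from dataclasses import asdict, dataclass, field

    @dataclass
    class Example:
        learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
        hub_token: str = field(default="secret", metadata={"help": "Illustrative *_token field."})

    masked = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in asdict(Example()).items()}
    print(masked)  # {'learning_rate': 5e-05, 'hub_token': '<HUB_TOKEN>'}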
diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py
index 9103c0221f18a..6a99ea7a8f8bc 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py
@@ -20,7 +20,11 @@
from paddle.io import DataLoader
from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer
from paddlenlp.trainer.integrations import (
- INTEGRATION_TO_CALLBACK, TrainerCallback, VisualDLCallback, rewrite_logs)
+ INTEGRATION_TO_CALLBACK,
+ TrainerCallback,
+ VisualDLCallback,
+ rewrite_logs,
+)
from paddlenlp.utils import profiler
from paddlenlp.utils.log import logger
@@ -38,19 +42,17 @@ def autocast_smart_context_manager(self, args):
"c_softmax_with_cross_entropy",
],
level=args.fp16_opt_level,
- dtype=amp_dtype, )
+ dtype=amp_dtype,
+ )
else:
- ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
return ctx_manager
def on_step_end(self, args, state, control, model=None, **kwargs):
if hasattr(model, "on_train_batch_end"):
model.on_train_batch_end()
- if (args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0:
control.should_log = True
def on_log(self, args, state, control, logs=None, **kwargs):
@@ -58,22 +60,26 @@ def on_log(self, args, state, control, logs=None, **kwargs):
inputs = kwargs.get("inputs", None)
model = kwargs.get("model", None)
image_logs = {}
- if (inputs is not None and model is not None and
- args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if (
+ inputs is not None
+ and model is not None
+ and args.image_logging_steps > 0
+ and state.global_step % args.image_logging_steps == 0
+ ):
with self.autocast_smart_context_manager(args):
- image_logs["reconstruction"] = model.decode_image(
- pixel_values=inputs["pixel_values"])
+ image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"])
image_logs["ddim-samples-1.0"] = model.log_image(
input_ids=inputs["input_ids"],
guidance_scale=1.0,
height=args.resolution,
- width=args.resolution, )
+ width=args.resolution,
+ )
image_logs["ddim-samples-7.5"] = model.log_image(
input_ids=inputs["input_ids"],
guidance_scale=7.5,
height=args.resolution,
- width=args.resolution, )
+ width=args.resolution,
+ )
if not state.is_world_process_zero:
return
@@ -91,11 +97,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of VisualDL's writer.add_scalar() "
- "is incorrect so we dropped this attribute.")
+ "is incorrect so we dropped this attribute."
+ )
# log images
for k, v in image_logs.items():
- self.vdl_writer.add_image(
- k, v, state.global_step, dataformats="NHWC")
+ self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC")
self.vdl_writer.flush()
@@ -136,8 +142,7 @@ def __init__(self, benchmark=True, profiler_options=None):
self.profiler_options = profiler_options
def on_train_begin(self, args, state, control, **kwargs):
- assert (args.gradient_accumulation_steps == 1 and not args.do_eval and
- not args.do_predict)
+ assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict
if self.benchmark:
self.reader_cost_avg = AverageStatistical()
@@ -162,8 +167,7 @@ def on_step_end(self, args, state, control, **kwargs):
def on_log(self, args, state, control, logs=None, **kwargs):
if self.benchmark:
if logs is not None and "interval_steps_per_second" in logs:
- self.batch_start = self.batch_start + (
- time.time() - self.maybe_log_save_evaluate_start)
+ self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start)
ips = logs["interval_steps_per_second"] * args.train_batch_size
avg_batch_cost = 1 / logs["interval_steps_per_second"]
logger.info(
@@ -175,14 +179,15 @@ def on_log(self, args, state, control, logs=None, **kwargs):
self.reader_cost_avg.get_average(),
avg_batch_cost,
args.train_batch_size,
- ips, ))
+ ips,
+ )
+ )
self.reader_cost_avg.reset()
def on_epoch_end(self, args, state, control, **kwargs):
if self.benchmark:
train_epoch_cost = time.time() - self.epoch_start
- logger.info("train epoch: %d, epoch_cost: %.5f s" %
- (state.epoch, train_epoch_cost))
+ logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost))
# register visualdl_with_image
@@ -196,7 +201,9 @@ def __init__(self, **kwargs):
self.add_callback(
BenchmarkCallback(
benchmark=self.args.benchmark,
- profiler_options=self.args.profiler_options, ))
+ profiler_options=self.args.profiler_options,
+ )
+ )
if self.args.benchmark:
if self.args.disable_tqdm:
self.pop_callback(PrinterCallback)
@@ -215,6 +222,7 @@ def get_train_dataloader(self):
self.train_dataset,
batch_size=self.args.train_batch_size,
num_workers=self.args.dataloader_num_workers,
- worker_init_fn=worker_init_fn, )
+ worker_init_fn=worker_init_fn,
+ )
else:
return super().get_train_dataloader()
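
A quick numeric check of the throughput figures that `BenchmarkCallback.on_log` reports above; the numbers are illustrative:

    interval_steps_per_second = 2.5                       # as logged by the trainer
    train_batch_size = 16
    ips = interval_steps_per_second * train_batch_size    # 40.0 samples per second
    avg_batch_cost = 1 / interval_steps_per_second        # 0.4 s per optimizer step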
diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py
index 2fe8ba07c5621..5b4bb009920c4 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py
@@ -20,9 +20,14 @@
import paddle.nn.functional as F
from paddlenlp.transformers import AutoTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- LDMBertModel, UNet2DConditionModel,
- is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ LDMBertModel,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.models.attention import AttentionBlock
from ppdiffusers.models.ema import LitEma
from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig
@@ -31,15 +36,15 @@
try:
from ppdiffusers.models.attention import SpatialTransformer
except ImportError:
- from ppdiffusers.models.transformer_2d import (Transformer2DModel as
- SpatialTransformer, )
+ from ppdiffusers.models.transformer_2d import (
+ Transformer2DModel as SpatialTransformer,
+ )
import json
from paddlenlp.utils.log import logger
-from ppdiffusers.initializer import (normal_, reset_initialized_parameter,
- zeros_)
+from ppdiffusers.initializer import normal_, reset_initialized_parameter, zeros_
from ppdiffusers.models.resnet import ResnetBlock2D
@@ -55,31 +60,31 @@ def __init__(self, model_args):
# init tokenizer
tokenizer_name_or_path = (
model_args.tokenizer_name
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "tokenizer"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")
+ )
self.tokenizer = AutoTokenizer.from_pretrained(
- tokenizer_name_or_path,
- model_max_length=model_args.model_max_length)
+ tokenizer_name_or_path, model_max_length=model_args.model_max_length
+ )
# init vae
vae_name_or_path = (
model_args.vae_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "vqvae"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "vqvae")
+ )
self.vae = AutoencoderKL.from_pretrained(vae_name_or_path)
freeze_params(self.vae.parameters())
logger.info("Freeze vae parameters!")
if model_args.pretrained_model_name_or_path is None:
assert (
- model_args.text_encoder_config_file is not None and
- model_args.unet_config_file is not None
+ model_args.text_encoder_config_file is not None and model_args.unet_config_file is not None
), "we must supply text_encoder_config_file & unet_config_file"
# init text_encoder
text_encoder_config = read_json(model_args.text_encoder_config_file)
vocab_size = text_encoder_config["vocab_size"]
- max_position_embeddings = text_encoder_config[
- "max_position_embeddings"]
+ max_position_embeddings = text_encoder_config["max_position_embeddings"]
if self.tokenizer.vocab_size != vocab_size:
logger.info(
f"The tokenizer has a vocab size of {self.tokenizer.vocab_size}, while the text encoder has a vocab size of {vocab_size}, we will use {self.tokenizer.vocab_size} as vocab_size!"
@@ -90,24 +95,24 @@ def __init__(self, model_args):
logger.info(
f"The tokenizer's model_max_length {self.tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {self.tokenizer.model_max_length} as max_position_embeddings!"
)
- text_encoder_config[
- "max_position_embeddings"] = self.tokenizer.model_max_length
+ text_encoder_config["max_position_embeddings"] = self.tokenizer.model_max_length
config = LDMBertConfig(**text_encoder_config)
self.text_encoder = LDMBertModel(config)
self.text_encoder_is_pretrained = False
# init unet2d
- self.unet = UNet2DConditionModel(
- **read_json(model_args.unet_config_file))
+ self.unet = UNet2DConditionModel(**read_json(model_args.unet_config_file))
self.unet_is_pretrained = False
else:
# init text_encoder
self.text_encoder = LDMBertModel.from_pretrained(
- model_args.pretrained_model_name_or_path, subfolder="bert")
+ model_args.pretrained_model_name_or_path, subfolder="bert"
+ )
self.text_encoder_is_pretrained = True
# init unet2d
self.unet = UNet2DConditionModel.from_pretrained(
- model_args.pretrained_model_name_or_path, subfolder="unet")
+ model_args.pretrained_model_name_or_path, subfolder="unet"
+ )
self.unet_is_pretrained = True
assert model_args.prediction_type in ["epsilon", "v_prediction"]
@@ -117,9 +122,9 @@ def __init__(self, model_args):
beta_end=0.012,
beta_schedule="scaled_linear",
num_train_timesteps=1000,
- prediction_type=self.prediction_type, )
- self.register_buffer("alphas_cumprod",
- self.noise_scheduler.alphas_cumprod)
+ prediction_type=self.prediction_type,
+ )
+ self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod)
if model_args.image_logging_steps > 0:
self.eval_scheduler = DDIMScheduler(
@@ -130,7 +135,8 @@ def __init__(self, model_args):
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1,
- prediction_type=self.prediction_type, )
+ prediction_type=self.prediction_type,
+ )
self.eval_scheduler.set_timesteps(model_args.num_inference_steps)
self.init_weights()
self.use_ema = model_args.use_ema
@@ -138,14 +144,14 @@ def __init__(self, model_args):
if self.use_ema:
self.model_ema = LitEma(self.unet)
- if (model_args.enable_xformers_memory_efficient_attention and
- is_ppxformers_available()):
+ if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
self.unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
# make sure unet text_encoder in train mode, vae in eval mode
self.unet.train()
@@ -153,35 +159,31 @@ def __init__(self, model_args):
self.vae.eval()
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
- sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
- def get_velocity(self,
- sample: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor) -> paddle.Tensor:
- sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+ def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(sample.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
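
In plain terms, `add_noise` builds the forward-diffusion sample and `get_velocity` the v-prediction target; a toy scalar version follows (the `get_velocity` return line falls outside this hunk, so its formula is quoted from the standard v-prediction definition rather than from the diff):

    import math

    a_bar_t = 0.9                 # alphas_cumprod[t] for some timestep t
    x0, eps = 1.0, 0.5            # toy stand-ins for the latents and the sampled noise
    noisy = math.sqrt(a_bar_t) * x0 + math.sqrt(1 - a_bar_t) * eps   # what add_noise returns
    v = math.sqrt(a_bar_t) * eps - math.sqrt(1 - a_bar_t) * x0       # standard v-prediction target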
@@ -193,10 +195,8 @@ def init_weights(self):
# init text_encoder
if not self.text_encoder_is_pretrained:
reset_initialized_parameter(self.text_encoder)
- normal_(self.text_encoder.embeddings.word_embeddings.weight, 0,
- 0.02)
- normal_(self.text_encoder.embeddings.position_embeddings.weight, 0,
- 0.02)
+ normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02)
+ normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02)
# init unet
if not self.unet_is_pretrained:
reset_initialized_parameter(self.unet)
@@ -243,16 +243,15 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
if self.noise_offset:
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
noise += self.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1),
- dtype=noise.dtype)
- timesteps = paddle.randint(0,
- self.noise_scheduler.num_train_timesteps,
- (latents.shape[0], )).astype("int64")
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype
+ )
+ timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype(
+ "int64"
+ )
noisy_latents = self.add_noise(latents, noise, timesteps)
encoder_hidden_states = self.text_encoder(input_ids)[0]
- noise_pred = self.unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if self.prediction_type == "epsilon":
@@ -262,10 +261,7 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
else:
raise ValueError(f"Unknown prediction type {self.prediction_type}")
- loss = (F.mse_loss(
- noise_pred.cast("float32"),
- target.cast("float32"),
- reduction="none").mean([1, 2, 3]).mean())
+ loss = F.mse_loss(noise_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean()
return loss
@@ -282,19 +278,18 @@ def decode_image(self, pixel_values=None, **kwargs):
@paddle.no_grad()
def log_image(
- self,
- input_ids=None,
- height=256,
- width=256,
- eta=0.0,
- guidance_scale=7.5,
- **kwargs, ):
+ self,
+ input_ids=None,
+ height=256,
+ width=256,
+ eta=0.0,
+ guidance_scale=7.5,
+ **kwargs,
+ ):
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log 8 image
if input_ids.shape[0] > 8:
input_ids = input_ids[:8]
@@ -308,43 +303,34 @@ def log_image(
padding="max_length",
truncation=True,
max_length=max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings], axis=0)
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0)
- latents = paddle.randn((input_ids.shape[0], self.unet.in_channels,
- height // 8, width // 8))
+ latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8))
            # ddim does not use this
latents = latents * self.eval_scheduler.init_noise_sigma
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for t in self.eval_scheduler.timesteps:
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
                # ddim does not use this
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.eval_scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
latents = 1 / 0.18215 * latents
image = self.vae.decode(latents).sample
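
For intuition, the guidance step above blends the unconditional and text-conditioned predictions; a toy scalar example with made-up values:

    guidance_scale = 7.5
    noise_pred_uncond, noise_pred_text = 0.10, 0.30
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)  # 1.60
    # with guidance_scale == 1.0 this reduces to noise_pred_text, i.e. no extra guidance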
@@ -356,12 +342,10 @@ def fn(layer):
# ldmbert
if hasattr(layer, "enable_recompute"):
layer.enable_recompute = value
- print("Set", layer.__class__, "recompute",
- layer.enable_recompute)
+ print("Set", layer.__class__, "recompute", layer.enable_recompute)
# unet
if hasattr(layer, "gradient_checkpointing"):
layer.gradient_checkpointing = value
- print("Set", layer.__class__, "recompute",
- layer.gradient_checkpointing)
+ print("Set", layer.__class__, "recompute", layer.gradient_checkpointing)
self.apply(fn)
diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py
index 82d71e6c5f816..b41f0b799469f 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py
@@ -46,8 +46,7 @@ def parse_src(filename):
elif data_source == "laion_aes":
text_json = json.loads(vec[2])
img_b64 = vec[5]
- caption = text_json.get("caption_en",
- text_json.get("blip_caption_en", ""))
+ caption = text_json.get("caption_en", text_json.get("blip_caption_en", ""))
else:
_, captions, _, _, _, img_b64 = vec[:6]
caption = random.sample(captions.split("|"), 1)[0].replace("\1", "")
@@ -77,23 +76,26 @@ def _get_param(self, img, output_size):
class TextImagePair(IterableDataset):
def __init__(
- self,
- file_list,
- size,
- num_records,
- image_processing=None,
- buffer_size=1000,
- shuffle_every_n_samples=5,
- interpolation="lanczos",
- tokenizer=None, ):
+ self,
+ file_list,
+ size,
+ num_records,
+ image_processing=None,
+ buffer_size=1000,
+ shuffle_every_n_samples=5,
+ interpolation="lanczos",
+ tokenizer=None,
+ ):
self.size = size
if image_processing is None:
- self.image_processing = transforms.Compose([
- transforms.Resize(int(size / 0.9), interpolation),
- RandomCrop(size),
- transforms.ToTensor(),
- transforms.Normalize(0.5, 0.5),
- ])
+ self.image_processing = transforms.Compose(
+ [
+ transforms.Resize(int(size / 0.9), interpolation),
+ RandomCrop(size),
+ transforms.ToTensor(),
+ transforms.Normalize(0.5, 0.5),
+ ]
+ )
else:
self.image_processing = image_processing
self.text_processing = lambda caption: tokenizer(
@@ -101,7 +103,8 @@ def __init__(
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids[0]
+ return_tensors="pd",
+ ).input_ids[0]
self.file_list = []
file_weights = []
with open(file_list, "r") as f:
@@ -122,19 +125,14 @@ def __init__(
file_weights = file_weights / file_weight_sum
print(f"sample weights of files: {file_weights}")
self.file_weights_cumsum = np.cumsum(file_weights)
- self.file_weights_cumsum = np.concatenate(
- [[0.0], self.file_weights_cumsum])
+ self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum])
else:
print("sample each file list with same probabiliy")
self.file_weights_cumsum = None
self.num_records = num_records
- self.file_ids = [
- np.arange(len(filelist)) for filelist in self.file_list
- ]
- print(
- f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}"
- )
+ self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list]
+ print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}")
self.buffer_size = buffer_size
self.shuffle_every_n_samples = shuffle_every_n_samples
@@ -143,9 +141,7 @@ def sample_loader(self, file_ids, filenames):
random.shuffle(file_ids)
for i in file_ids:
filename = filenames[i].strip("\n")
- with gzip.open(filename,
- "rb") if filename.endswith(".gz") else open(
- filename, "rb") as f:
+ with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f:
# retry = 0
while True:
line = f.readline()
@@ -171,19 +167,14 @@ def sample_loader(self, file_ids, filenames):
if w < self.size or h < self.size:
continue
yield {
- "pixel_values":
- self.image_processing(data["image"]),
- "input_ids":
- self.text_processing(data["caption"]),
+ "pixel_values": self.image_processing(data["image"]),
+ "input_ids": self.text_processing(data["caption"]),
}
def random_load_from_multi_dataset(self):
- print(
- f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}"
- )
+ print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}")
sample_loader_per_dataset = [
- iter(self.sample_loader(self.file_ids[i], self.file_list[i]))
- for i in range(len(self.file_ids))
+ iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids))
]
while True:
@@ -192,8 +183,7 @@ def random_load_from_multi_dataset(self):
else:
rand_num = random.random()
for i in range(len(self.file_list)):
- if (self.file_weights_cumsum[i] <= rand_num <
- self.file_weights_cumsum[i + 1]):
+ if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]:
break
sample_loader = sample_loader_per_dataset[i]
# debug
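
A self-contained sketch of the weighted multi-file sampling used above (cumulative weights plus a uniform draw); the weights are illustrative:

    import random
    import numpy as np

    file_weights = np.array([0.7, 0.2, 0.1])                   # normalized per-file-list weights
    cumsum = np.concatenate([[0.0], np.cumsum(file_weights)])  # [0.0, 0.7, 0.9, 1.0]
    rand_num = random.random()
    for i in range(len(file_weights)):
        if cumsum[i] <= rand_num < cumsum[i + 1]:
            break
    # i now selects the file list from which to draw the next sample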
diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py
index c3249e9caca29..d3da3f1f9d187 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py
@@ -26,10 +26,16 @@
)
from paddlenlp.transformers import BertTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- EulerAncestralDiscreteScheduler, LDMBertModel,
- LDMTextToImagePipeline, LMSDiscreteScheduler,
- PNDMScheduler, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LDMBertModel,
+ LDMTextToImagePipeline,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
paddle.set_device("cpu")
@@ -59,8 +65,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("emb_layers.1", "time_emb_proj")
new_item = new_item.replace("skip_connection", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -76,8 +81,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -119,8 +123,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -128,21 +131,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming
to them. It splits attention layers, and takes into account additional replacements
that may arise.
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -150,13 +152,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = old_tensor.split(channels // num_heads, dim=1)
checkpoint[path_map["query"]] = query.reshape(target_shape)
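
Shape bookkeeping for the fused qkv split above, with made-up sizes (num_head_channels = 64):

    rows = 1536                                # old_tensor.shape[0], i.e. fused q/k/v rows
    channels = rows // 3                       # 512
    num_heads = rows // 64 // 3                # 8
    per_head_rows = 3 * channels // num_heads  # 192 rows per head after the reshape
    chunk = channels // num_heads              # split size 64 along dim 1 -> query, key, value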
@@ -167,8 +167,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -178,8 +177,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -206,25 +204,19 @@ def create_unet_diffusers_config(original_config):
"""
unet_params = original_config.model.params.unet_config.params
- block_out_channels = [
- unet_params.model_channels * mult for mult in unet_params.channel_mult
- ]
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
down_block_types = []
resolution = 1
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnDownBlock2D"
- if resolution in unet_params.attention_resolutions else
- "DownBlock2D")
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
down_block_types.append(block_type)
if i != len(block_out_channels) - 1:
resolution *= 2
up_block_types = []
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnUpBlock2D"
- if resolution in unet_params.attention_resolutions else
- "UpBlock2D")
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
up_block_types.append(block_type)
resolution //= 2
@@ -237,7 +229,8 @@ def create_unet_diffusers_config(original_config):
block_out_channels=tuple(block_out_channels),
layers_per_block=unet_params.num_res_blocks,
cross_attention_dim=unet_params.context_dim,
- attention_head_dim=unet_params.num_heads, )
+ attention_head_dim=unet_params.num_heads,
+ )
return config
@@ -261,14 +254,12 @@ def create_vae_diffusers_config(original_config):
up_block_types=tuple(up_block_types),
block_out_channels=tuple(block_out_channels),
latent_channels=vae_params.z_channels,
- layers_per_block=vae_params.num_res_blocks, )
+ layers_per_block=vae_params.num_res_blocks,
+ )
return config
-def convert_ldm_unet_checkpoint(checkpoint,
- config,
- path=None,
- extract_ema=False):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -289,8 +280,7 @@ def convert_ldm_unet_checkpoint(checkpoint,
for key in keys:
if key.startswith("model.diffusion_model"):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
print(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
@@ -303,17 +293,12 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint = {}
- new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
- "time_embed.0.weight"]
- new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
- "time_embed.0.bias"]
- new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
- "time_embed.2.weight"]
- new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
- "time_embed.2.bias"]
-
- new_checkpoint["conv_in.weight"] = unet_state_dict[
- "input_blocks.0.0.weight"]
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
@@ -322,35 +307,23 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
# Retrieves the keys for the input blocks only
- num_input_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "input_blocks" in layer
- })
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
input_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
for layer_id in range(num_input_blocks)
}
# Retrieves the keys for the middle blocks only
- num_middle_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "middle_block" in layer
- })
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
middle_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
for layer_id in range(num_middle_blocks)
}
# Retrieves the keys for the output blocks only
- num_output_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "output_blocks" in layer
- })
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
output_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
for layer_id in range(num_output_blocks)
}
@@ -359,21 +332,17 @@ def convert_ldm_unet_checkpoint(checkpoint,
layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
resnets = [
- key for key in input_blocks[i]
- if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in
- key
- ]
- attentions = [
- key for key in input_blocks[i] if f"input_blocks.{i}.1" in key
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
]
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.weight")
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.bias")
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.weight"
+ )
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.bias"
+ )
paths = renew_resnet_paths(resnets)
meta_path = {
@@ -385,7 +354,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if len(attentions):
paths = renew_attention_paths(attentions)
@@ -398,19 +368,18 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
resnet_0 = middle_blocks[0]
attentions = middle_blocks[1]
resnet_1 = middle_blocks[2]
resnet_0_paths = renew_resnet_paths(resnet_0)
- assign_to_checkpoint(
- resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
resnet_1_paths = renew_resnet_paths(resnet_1)
- assign_to_checkpoint(
- resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
attentions_paths = renew_attention_paths(attentions)
meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
@@ -419,14 +388,13 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
for i in range(num_output_blocks):
block_id = i // (config["layers_per_block"] + 1)
layer_in_block_id = i % (config["layers_per_block"] + 1)
- output_block_layers = [
- shave_segments(name, 2) for name in output_blocks[i]
- ]
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
output_block_list = {}
for layer in output_block_layers:
@@ -437,12 +405,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
output_block_list[layer_id] = [layer_name]
if len(output_block_list) > 1:
- resnets = [
- key for key in output_blocks[i] if f"output_blocks.{i}.0" in key
- ]
- attentions = [
- key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
- ]
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
resnet_0_paths = renew_resnet_paths(resnets)
paths = renew_resnet_paths(resnets)
@@ -456,17 +420,17 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if ["conv.weight", "conv.bias"] in output_block_list.values():
- index = list(output_block_list.values()).index(
- ["conv.weight", "conv.bias"])
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.weight"]
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.bias"]
+ index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.weight"
+ ]
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.bias"
+ ]
# Clear attentions as they have been attributed above.
if len(attentions) == 2:
@@ -476,27 +440,28 @@ def convert_ldm_unet_checkpoint(checkpoint,
paths = renew_attention_paths(attentions)
meta_path = {
"old": f"output_blocks.{i}.1",
- "new":
- f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
}
assign_to_checkpoint(
paths,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
else:
- resnet_0_paths = renew_resnet_paths(
- output_block_layers, n_shave_prefix_segments=1)
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
for path in resnet_0_paths:
old_path = ".".join(["output_blocks", str(i), path["old"]])
- new_path = ".".join([
- "up_blocks",
- str(block_id),
- "resnets",
- str(layer_in_block_id),
- path["new"],
- ])
+ new_path = ".".join(
+ [
+ "up_blocks",
+ str(block_id),
+ "resnets",
+ str(layer_in_block_id),
+ path["new"],
+ ]
+ )
new_checkpoint[new_path] = unet_state_dict[old_path]
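The meta_path dictionaries threaded through these hunks only drive a plain old→new substring rename when layer keys are copied into the converted checkpoint; the body of assign_to_checkpoint itself is outside the hunks shown here. A toy sketch of just that renaming step, with an illustrative key:

    # Toy illustration of the {"old": ..., "new": ...} replacement pattern used above;
    # not the real assign_to_checkpoint helper, whose implementation is not part of this diff.
    def rename_key(key, replacements):
        for rep in replacements:
            key = key.replace(rep["old"], rep["new"])
        return key

    rename_key("middle_block.1.proj_in.weight",
               [{"old": "middle_block.1", "new": "mid_block.attentions.0"}])
    # -> "mid_block.attentions.0.proj_in.weight"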
@@ -514,107 +479,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -622,58 +554,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -681,14 +605,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint,
- dtype="float32"):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -741,7 +664,8 @@ def create_ldm_bert_config(original_config):
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
- pad_token_id=0, )
+ pad_token_id=0,
+ )
return config
@@ -755,61 +679,56 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config):
bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key)
new_checkpoint = {}
- new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[
- "transformer.token_emb.weight"].numpy()
- new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[
- "transformer.pos_emb.emb.weight"].numpy()
+ new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"].numpy()
+ new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"].numpy()
for i in range(config["encoder_layers"]):
double_i = 2 * i
double_i_plus1 = 2 * i + 1
# convert norm
new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.0.weight"].numpy()
+ f"transformer.attn_layers.layers.{double_i}.0.weight"
+ ].numpy()
new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.0.bias"].numpy()
+ f"transformer.attn_layers.layers.{double_i}.0.bias"
+ ].numpy()
new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = (
- bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t()
- .numpy())
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t().numpy()
+ )
new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = (
- bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t()
- .numpy())
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t().numpy()
+ )
new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = (
- bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t()
- .numpy())
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t().numpy()
+ )
new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = (
- bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"]
- .t().numpy())
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"].numpy(
- )
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].t().numpy()
+ )
+ new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"
+ ].numpy()
new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"].numpy()
+ f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"
+ ].numpy()
new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"].numpy()
- new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = (bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"]
- .t().numpy())
+ f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"
+ ].numpy()
+ new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = (
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].t().numpy()
+ )
new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"].numpy(
- )
- new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = (bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"]
- .t().numpy())
- new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = (bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t()
- .numpy())
-
- new_checkpoint["final_layer_norm.weight"] = bert_state_dict[
- "transformer.norm.weight"].numpy()
- new_checkpoint["final_layer_norm.bias"] = bert_state_dict[
- "transformer.norm.bias"].numpy()
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"
+ ].numpy()
+ new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = (
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].t().numpy()
+ )
+ new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = (
+ bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t().numpy()
+ )
+
+ new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"].numpy()
+ new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"].numpy()
return new_checkpoint
@@ -822,7 +741,8 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
# wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/configs/latent-diffusion/txt2img-1p4B-eval.yaml
parser.add_argument(
"--original_config_file",
@@ -844,13 +764,15 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config):
"Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
" or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
" higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
- ), )
+ ),
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
+ help="Path to the output model.",
+ )
args = parser.parse_args()
@@ -871,46 +793,40 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config):
checkpoint,
diffusers_unet_config,
path=args.checkpoint_path,
- extract_ema=args.extract_ema, )
+ extract_ema=args.extract_ema,
+ )
unet = UNet2DConditionModel(**diffusers_unet_config)
- ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- unet, diffusers_unet_checkpoint)
+ ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint)
check_keys(unet, ppdiffusers_unet_checkpoint)
unet.load_dict(ppdiffusers_unet_checkpoint)
# 2. Convert the VAE model.
vae_config = create_vae_diffusers_config(original_config)
- diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint,
- vae_config)
+ diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL(**vae_config)
- ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- vae, diffusers_vae_checkpoint)
+ ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint)
check_keys(vae, ppdiffusers_vae_checkpoint)
vae.load_dict(ppdiffusers_vae_checkpoint)
# 3. Convert the text model.
- text_model_type = original_config.model.params.cond_stage_config.target.split(
- ".")[-1]
+ text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
if text_model_type != "BERTEmbedder":
print("We only support BERTEmbedder as text_encoder!")
# 4. Convert the Bert model.
bert_config = create_ldm_bert_config(original_config)
- ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint,
- bert_config)
+ ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint, bert_config)
bert = LDMBertModel(**bert_config)
check_keys(bert, ppdiffusers_bert_checkpoint)
bert.load_dict(ppdiffusers_bert_checkpoint)
# 5. Convert tokenizer.
tokenizer = BertTokenizer.from_pretrained(
- "bert-base-uncased",
- model_max_length=bert_config["max_position_embeddings"])
+ "bert-base-uncased", model_max_length=bert_config["max_position_embeddings"]
+ )
if tokenizer.vocab_size != bert_config["vocab_size"]:
- print(
- "Vocab size mismatched! Please verify your tokenizer or text encoder!"
- )
+ print("Vocab size mismatched! Please verify your tokenizer or text encoder!")
# 6. Convert scheduler.
num_train_timesteps = original_config.model.params.timesteps
@@ -925,17 +841,14 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config):
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif args.scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif args.scheduler_type == "euler-ancestral":
scheduler = EulerAncestralDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+ )
elif args.scheduler_type == "ddim":
scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -944,16 +857,11 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config):
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
- raise ValueError(
- f"Scheduler of type {args.scheduler_type} doesn't exist!")
-
- pipe = LDMTextToImagePipeline(
- vqvae=vae,
- bert=bert,
- tokenizer=tokenizer,
- unet=unet,
- scheduler=scheduler)
+ raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
+
+ pipe = LDMTextToImagePipeline(vqvae=vae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
pipe.save_pretrained(args.dump_path)
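The need_transpose bookkeeping in convert_diffusers_vae_unet_to_ppdiffusers exists because torch-style checkpoints store nn.Linear weights as (out_features, in_features) while paddle.nn.Linear stores them as (in_features, out_features), so Linear weights must be transposed exactly once on load. A minimal sketch of that single step, with made-up shapes:

    import numpy as np
    import paddle

    # Torch-layout Linear weight: (out_features, in_features).
    torch_style_weight = np.random.randn(768, 320).astype("float32")
    linear = paddle.nn.Linear(320, 768)  # Paddle stores the weight as (in_features, out_features)
    linear.weight.set_value(paddle.to_tensor(torch_style_weight.T))  # transpose once when loading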
diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py
index de9f15339690a..f9e742d3942f6 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py
@@ -63,15 +63,13 @@
# loop over resnets/attentions for downblocks
hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
- unet_conversion_map_layer.append(
- (sd_down_res_prefix, hf_down_res_prefix))
+ unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
if i < 3:
# no attention layers in down_blocks.3
hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
- unet_conversion_map_layer.append(
- (sd_down_atn_prefix, hf_down_atn_prefix))
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
for j in range(3):
# loop over resnets/attentions for upblocks
@@ -83,21 +81,18 @@
# no attention layers in up_blocks.0
hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
- unet_conversion_map_layer.append(
- (sd_up_atn_prefix, hf_up_atn_prefix))
+ unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
if i < 3:
# no downsample in down_blocks.3
hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
- unet_conversion_map_layer.append(
- (sd_downsample_prefix, hf_downsample_prefix))
+ unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
# no upsample in up_blocks.3
hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
- unet_conversion_map_layer.append(
- (sd_upsample_prefix, hf_upsample_prefix))
+ unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
hf_mid_atn_prefix = "mid_block.attentions.0."
sd_mid_atn_prefix = "middle_block.1."
@@ -211,8 +206,7 @@ def convert_vae_state_dict(vae_state_dict):
# pretty much a no-op
-def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet,
- ppdiffusers_vae_unet_checkpoint):
+def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, ppdiffusers_vae_unet_checkpoint):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -228,56 +222,63 @@ def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet,
def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32):
ppdiffusers_mapping_to_orig = {}
+ ppdiffusers_mapping_to_orig["embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight"
ppdiffusers_mapping_to_orig[
- "embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight"
- ppdiffusers_mapping_to_orig[
- "embeddings.position_embeddings.weight"] = "cond_stage_model.transformer.pos_emb.emb.weight"
+ "embeddings.position_embeddings.weight"
+ ] = "cond_stage_model.transformer.pos_emb.emb.weight"
for i in range(num_layers):
double_i = 2 * i
double_i_plus1 = 2 * i + 1
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.norm1.weight"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight"
- ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.norm1.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias"
-
- ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.self_attn.q_proj.weight"] = (
- f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight",
- "transpose", )
- ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.self_attn.k_proj.weight"] = (
- f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight",
- "transpose", )
+ f"encoder.layers.{i}.norm1.weight"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight"
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.self_attn.v_proj.weight"] = (
- f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight",
- "transpose", )
+ f"encoder.layers.{i}.norm1.bias"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias"
+
+ ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.q_proj.weight"] = (
+ f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight",
+ "transpose",
+ )
+ ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.k_proj.weight"] = (
+ f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight",
+ "transpose",
+ )
+ ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.v_proj.weight"] = (
+ f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight",
+ "transpose",
+ )
+ ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.out_proj.weight"] = (
+ f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight",
+ "transpose",
+ )
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.self_attn.out_proj.weight"] = (
- f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight",
- "transpose", )
- ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.self_attn.out_proj.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias"
+ f"encoder.layers.{i}.self_attn.out_proj.bias"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias"
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.norm2.weight"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight"
+ f"encoder.layers.{i}.norm2.weight"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight"
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.norm2.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias"
+ f"encoder.layers.{i}.norm2.bias"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias"
ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear1.weight"] = (
f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight",
- "transpose", )
+ "transpose",
+ )
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.linear1.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"
+ f"encoder.layers.{i}.linear1.bias"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"
ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear2.weight"] = (
f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight",
- "transpose", )
+ "transpose",
+ )
ppdiffusers_mapping_to_orig[
- f"encoder.layers.{i}.linear2.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"
+ f"encoder.layers.{i}.linear2.bias"
+ ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"
- ppdiffusers_mapping_to_orig[
- "final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight"
- ppdiffusers_mapping_to_orig[
- "final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias"
+ ppdiffusers_mapping_to_orig["final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight"
+ ppdiffusers_mapping_to_orig["final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias"
new_state_dict = {}
for k, v in ldmbert_state_dict.items():
@@ -286,18 +287,15 @@ def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32):
if isinstance(new_name, (list, tuple)):
need_transpose = True
new_name = new_name[0]
- new_state_dict[new_name] = (torch.from_numpy(v.t().numpy())
- if need_transpose else
- torch.from_numpy(v.numpy()))
+ new_state_dict[new_name] = torch.from_numpy(v.t().numpy()) if need_transpose else torch.from_numpy(v.numpy())
    # dummy weights, we do not use this!
- new_state_dict[
- "cond_stage_model.transformer.to_logits.weight"] = torch.zeros(
- new_state_dict[
- "cond_stage_model.transformer.token_emb.weight"].shape)
+ new_state_dict["cond_stage_model.transformer.to_logits.weight"] = torch.zeros(
+ new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape
+ )
new_state_dict["cond_stage_model.transformer.to_logits.bias"] = torch.zeros(
- new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[
- 0])
+ new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[0]
+ )
return new_state_dict
@@ -308,43 +306,35 @@ def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32):
default=None,
type=str,
required=True,
- help="Path to the model to convert.", )
+ help="Path to the model to convert.",
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
- parser.add_argument(
- "--half", action="store_true", help="Save weights in half precision.")
+ help="Path to the output model.",
+ )
+ parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
args = parser.parse_args()
pipe = LDMTextToImagePipeline.from_pretrained(args.model_name_or_path)
# Convert the UNet model
- unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(
- pipe.unet, pipe.unet.state_dict())
+ unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.unet, pipe.unet.state_dict())
unet_state_dict = convert_unet_state_dict(unet_state_dict)
- unet_state_dict = {
- "model.diffusion_model." + k: v
- for k, v in unet_state_dict.items()
- }
+ unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()}
# Convert the VAE model
- vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(
- pipe.vqvae, pipe.vqvae.state_dict())
+ vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.vqvae, pipe.vqvae.state_dict())
vae_state_dict = convert_vae_state_dict(vae_state_dict)
- vae_state_dict = {
- "first_stage_model." + k: v
- for k, v in vae_state_dict.items()
- }
+ vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}
# Convert the ldmbert model
- text_enc_dict = convert_ldmbert_state_dict(
- pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"])
+ text_enc_dict = convert_ldmbert_state_dict(pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"])
# Put together new checkpoint
- state_dict = { ** unet_state_dict, ** vae_state_dict, ** text_enc_dict}
+ state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict}
if args.half:
state_dict = {k: v.half() for k, v in state_dict.items()}
state_dict = {"state_dict": state_dict}
diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py
index 3ab76ea0ffc2b..6890fae514ab5 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py
@@ -41,7 +41,8 @@
linewidth=3,
color="r",
marker="o",
- markerfacecolor="blue", )
+ markerfacecolor="blue",
+)
plt.plot(
clip_pt,
fid_pt,
@@ -49,7 +50,8 @@
linewidth=3,
color="b",
marker="o",
- markerfacecolor="red", )
+ markerfacecolor="red",
+)
plt.xlabel("CLIP Score")
plt.ylabel("FID@1k")
plt.title("12W Globel Step Pareto Curves - DDIM")
diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py
index 15352e4cd1d5b..4aa3163536c16 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py
@@ -21,8 +21,14 @@
import paddle
import paddle.nn as nn
-from ldm import (DataArguments, LatentDiffusionModel, ModelArguments,
- NoTrainerTrainingArguments, TextImagePair, worker_init_fn)
+from ldm import (
+ DataArguments,
+ LatentDiffusionModel,
+ ModelArguments,
+ NoTrainerTrainingArguments,
+ TextImagePair,
+ worker_init_fn,
+)
from paddle.io import DataLoader
from paddle.optimizer import AdamW
from paddlenlp.trainer import PdArgumentParser, set_seed
@@ -47,12 +53,11 @@ def get_writer(training_args):
def main():
- parser = PdArgumentParser(
- (ModelArguments, DataArguments, NoTrainerTrainingArguments))
+ parser = PdArgumentParser((ModelArguments, DataArguments, NoTrainerTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
training_args.image_logging_steps = model_args.image_logging_steps = (
- math.ceil(model_args.image_logging_steps / training_args.logging_steps)
- * training_args.logging_steps)
+ math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps
+ )
training_args.resolution = data_args.resolution
training_args.print_config(training_args, "Training")
training_args.print_config(model_args, "Model")
@@ -64,8 +69,7 @@ def main():
if num_processes > 1:
paddle.distributed.init_parallel_env()
- training_args.logging_dir = os.path.join(training_args.output_dir,
- training_args.logging_dir)
+ training_args.logging_dir = os.path.join(training_args.output_dir, training_args.logging_dir)
if training_args.seed is not None:
set_seed(training_args.seed)
@@ -75,16 +79,14 @@ def main():
model = LatentDiffusionModel(model_args)
model.set_recompute(training_args.recompute)
- params_to_train = itertools.chain(model.text_encoder.parameters(),
- model.unet.parameters())
+ params_to_train = itertools.chain(model.text_encoder.parameters(), model.unet.parameters())
lr_scheduler = get_scheduler(
training_args.lr_scheduler_type,
learning_rate=training_args.learning_rate,
- num_warmup_steps=training_args.warmup_steps *
- training_args.gradient_accumulation_steps,
- num_training_steps=training_args.max_steps *
- training_args.gradient_accumulation_steps, )
+ num_warmup_steps=training_args.warmup_steps * training_args.gradient_accumulation_steps,
+ num_training_steps=training_args.max_steps * training_args.gradient_accumulation_steps,
+ )
optimizer = AdamW(
learning_rate=lr_scheduler,
@@ -94,8 +96,9 @@ def main():
weight_decay=training_args.weight_decay,
epsilon=training_args.adam_epsilon,
grad_clip=nn.ClipGradByGlobalNorm(training_args.max_grad_norm)
- if training_args.max_grad_norm is not None and
- training_args.max_grad_norm > 0 else None, )
+ if training_args.max_grad_norm is not None and training_args.max_grad_norm > 0
+ else None,
+ )
train_dataset = TextImagePair(
file_list=data_args.file_list,
size=data_args.resolution,
@@ -103,7 +106,8 @@ def main():
buffer_size=data_args.buffer_size,
shuffle_every_n_samples=data_args.shuffle_every_n_samples,
interpolation="lanczos",
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
if num_processes > 1:
model = paddle.DataParallel(model)
@@ -112,28 +116,23 @@ def main():
train_dataset,
batch_size=training_args.per_device_train_batch_size,
num_workers=training_args.dataloader_num_workers,
- worker_init_fn=worker_init_fn, )
+ worker_init_fn=worker_init_fn,
+ )
if rank == 0:
writer = get_writer(training_args)
# Train!
- total_batch_size = (training_args.per_device_train_batch_size *
- num_processes *
- training_args.gradient_accumulation_steps)
+ total_batch_size = (
+ training_args.per_device_train_batch_size * num_processes * training_args.gradient_accumulation_steps
+ )
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {training_args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
- )
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}"
- )
+ logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}")
global_steps = 0
tic_train = time.time()
@@ -144,15 +143,13 @@ def main():
break
for step, batch in enumerate(train_dataloader):
- if (num_processes > 1 and (
- (step + 1) % training_args.gradient_accumulation_steps != 0)
- ) or training_args.recompute:
+ if (
+ num_processes > 1 and ((step + 1) % training_args.gradient_accumulation_steps != 0)
+ ) or training_args.recompute:
# grad acc, no_sync when (step + 1) % training_args.gradient_accumulation_steps != 0:
ctx_manager = model.no_sync()
else:
- ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
with ctx_manager:
loss = model(**batch)
@@ -170,8 +167,7 @@ def main():
# train log
if global_steps % training_args.logging_steps == 0:
logs = {
- "train/loss":
- loss.item() * training_args.gradient_accumulation_steps,
+ "train/loss": loss.item() * training_args.gradient_accumulation_steps,
"train/lr_abs": lr_scheduler.get_lr(),
"train/global_steps": global_steps,
}
@@ -191,48 +187,51 @@ def main():
logger.info(log_str)
if global_steps % training_args.image_logging_steps == 0:
- reconstruction_img = unwrap_model(model).decode_image(
- pixel_values=batch["pixel_values"])
- ddim_10_img = unwrap_model(model).log_image(
- input_ids=batch["input_ids"], guidance_scale=1.0)
- ddim_75_img = unwrap_model(model).log_image(
- input_ids=batch["input_ids"], guidance_scale=7.5)
+ reconstruction_img = unwrap_model(model).decode_image(pixel_values=batch["pixel_values"])
+ ddim_10_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=1.0)
+ ddim_75_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=7.5)
if rank == 0:
writer.add_image(
"reconstruction",
reconstruction_img,
global_steps,
- dataformats="NHWC", )
+ dataformats="NHWC",
+ )
writer.add_image(
"ddim-samples-1.0",
ddim_10_img,
global_steps,
- dataformats="NHWC", )
+ dataformats="NHWC",
+ )
writer.add_image(
"ddim-samples-7.5",
ddim_75_img,
global_steps,
- dataformats="NHWC", )
+ dataformats="NHWC",
+ )
tic_train = time.time()
if rank == 0 and global_steps % training_args.save_steps == 0:
os.makedirs(
- os.path.join(training_args.output_dir,
- f"global-steps-{global_steps}"),
- exist_ok=True, )
+ os.path.join(training_args.output_dir, f"global-steps-{global_steps}"),
+ exist_ok=True,
+ )
paddle.save(
model.state_dict(),
os.path.join(
training_args.output_dir,
f"global-steps-{global_steps}",
- "model_state.pdparams", ), )
+ "model_state.pdparams",
+ ),
+ )
if global_steps >= training_args.max_steps:
break
if rank == 0:
paddle.save(
model.state_dict(),
- os.path.join(training_args.output_dir, "model_state.pdparams"), )
+ os.path.join(training_args.output_dir, "model_state.pdparams"),
+ )
writer.close()
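The ctx_manager branch in this training loop skips the DataParallel gradient all-reduce on accumulation micro-steps and only synchronizes on boundary steps (the real loop also forces no_sync when recompute is enabled). A condensed sketch of that pattern; the function and argument names are illustrative, not taken from the script:

    import contextlib

    def accumulation_context(model, step, accumulation_steps, num_processes):
        # On non-boundary micro-steps, no_sync() defers the gradient all-reduce;
        # on the boundary step the normal synchronizing backward runs.
        if num_processes > 1 and (step + 1) % accumulation_steps != 0:
            return model.no_sync()
        return contextlib.nullcontext()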
diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py
index d0464a661998f..0125d6fc27e9d 100644
--- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py
+++ b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py
@@ -16,16 +16,19 @@
import os
import paddle
-from ldm import (DataArguments, LatentDiffusionModel, LatentDiffusionTrainer,
- ModelArguments, TextImagePair)
-from paddlenlp.trainer import (PdArgumentParser, TrainingArguments,
- get_last_checkpoint)
+from ldm import (
+ DataArguments,
+ LatentDiffusionModel,
+ LatentDiffusionTrainer,
+ ModelArguments,
+ TextImagePair,
+)
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint
from paddlenlp.utils.log import logger
def main():
- parser = PdArgumentParser(
- (ModelArguments, DataArguments, TrainingArguments))
+ parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# report to custom_visualdl
training_args.report_to = ["custom_visualdl"]
@@ -33,9 +36,10 @@ def main():
training_args.benchmark = model_args.benchmark
training_args.profiler_options = model_args.profiler_options
training_args.image_logging_steps = model_args.image_logging_steps = (
- (math.ceil(model_args.image_logging_steps / training_args.logging_steps)
- * training_args.logging_steps)
- if model_args.image_logging_steps > 0 else -1)
+ (math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps)
+ if model_args.image_logging_steps > 0
+ else -1
+ )
training_args.print_config(model_args, "Model")
training_args.print_config(data_args, "Data")
@@ -44,16 +48,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -67,32 +69,30 @@ def main():
buffer_size=data_args.buffer_size,
shuffle_every_n_samples=data_args.shuffle_every_n_samples,
interpolation="lanczos",
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
if model_args.to_static:
- input_ids = paddle.static.InputSpec(
- name="input_ids",
- shape=[-1, model_args.model_max_length],
- dtype="int64")
+ input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64")
pixel_values = paddle.static.InputSpec(
name="pixel_values",
shape=[-1, 3, data_args.resolution, data_args.resolution],
- dtype="float32", )
+ dtype="float32",
+ )
specs = [input_ids, pixel_values]
paddle.jit.ignore_module([os])
model = paddle.jit.to_static(model, input_spec=specs)
- logger.info("Successfully to apply @to_static with specs: {}".format(
- specs))
+ logger.info("Successfully to apply @to_static with specs: {}".format(specs))
trainer = LatentDiffusionTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
# must set recompute after trainer init
trainer.model.set_recompute(training_args.recompute)
- params_to_train = itertools.chain(trainer.model.text_encoder.parameters(),
- trainer.model.unet.parameters())
+ params_to_train = itertools.chain(trainer.model.text_encoder.parameters(), trainer.model.unet.parameters())
trainer.set_optimizer_grouped_parameters(params_to_train)
checkpoint = None
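In the to_static branch above, the -1 entries in each InputSpec mark axes that stay dynamic (the batch axis here) when the dygraph model is traced into a static graph. A minimal illustration; the sequence length 77 is only an example stand-in for model_max_length:

    import paddle

    specs = [
        paddle.static.InputSpec(name="input_ids", shape=[-1, 77], dtype="int64"),
        paddle.static.InputSpec(name="pixel_values", shape=[-1, 3, 256, 256], dtype="float32"),
    ]
    # static_model = paddle.jit.to_static(model, input_spec=specs)  # `model` is assumed to be defined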
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py
index e94c83d4ee0af..14468dc73417a 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py
@@ -30,16 +30,13 @@ def crop(clip, i, j, h, w):
"""
if len(clip.shape) != 4:
raise ValueError("clip should be a 4D tensor")
- return clip[(...), i:i + h, j:j + w]
+ return clip[(...), i : i + h, j : j + w]
def resize(clip, target_size, interpolation_mode):
if len(target_size) != 2:
- raise ValueError(
- f"target size should be tuple (height, width), instead got {target_size}"
- )
- return paddle.nn.functional.interpolate(
- x=clip, size=target_size, mode=interpolation_mode, align_corners=False)
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
+ return paddle.nn.functional.interpolate(x=clip, size=target_size, mode=interpolation_mode, align_corners=False)
def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
@@ -85,8 +82,7 @@ def to_tensor(clip):
"""
_is_tensor_video_clip(clip)
if not clip.dtype == "uint8":
- raise TypeError("clip tensor should have data type uint8. Got %s" %
- str(clip.dtype))
+ raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
return clip.astype(dtype="float32").transpose(perm=[3, 0, 1, 2]) / 255.0
@@ -105,8 +101,7 @@ def normalize(clip, mean, std, inplace=False):
clip = clip.clone()
mean = paddle.to_tensor(data=mean, place=clip.place).astype(clip.dtype)
std = paddle.to_tensor(data=std, place=clip.place).astype(clip.dtype)
- clip = clip.substract(mean[:, (None), (None), (None)]).divide(std[:, (
- None), (None), (None)])
+    clip = clip.subtract(mean[:, (None), (None), (None)]).divide(std[:, (None), (None), (None)])
return clip
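These helpers assume channel-first video clips: raw frames arrive as (T, H, W, C) uint8 and to_tensor moves them to (C, T, H, W) floats, after which crop slices the last two axes and normalize broadcasts per-channel statistics. A small sketch of those layouts, with arbitrary shapes:

    import paddle

    clip_thwc = paddle.randint(0, 255, shape=[16, 128, 128, 3]).astype("uint8")   # (T, H, W, C)
    clip_cthw = clip_thwc.astype("float32").transpose(perm=[3, 0, 1, 2]) / 255.0  # (C, T, H, W)
    cropped = clip_cthw[..., 32:96, 32:96]                                        # spatial crop
    mean = paddle.to_tensor([0.5, 0.5, 0.5])[:, None, None, None]                 # broadcast over T, H, W
    normalized = (cropped - mean) / 0.5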
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py
index aaaa301718d58..97b39c8cf8f86 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py
@@ -44,15 +44,15 @@ def __repr__(self) -> str:
class RandomResizedCropVideo(paddle.vision.transforms.RandomResizedCrop):
def __init__(
- self,
- size,
- scale=(0.08, 1.0),
- ratio=(3.0 / 4.0, 4.0 / 3.0),
- interpolation_mode="bilinear", ):
+ self,
+ size,
+ scale=(0.08, 1.0),
+ ratio=(3.0 / 4.0, 4.0 / 3.0),
+ interpolation_mode="bilinear",
+ ):
if isinstance(size, tuple):
if len(size) != 2:
- raise ValueError(
- f"size should be tuple (height, width), instead got {size}")
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
self.size = size
else:
self.size = size, size
@@ -69,8 +69,7 @@ def __call__(self, clip):
size is (C, T, H, W)
"""
i, j, h, w = self.get_params(clip, self.scale, self.ratio)
- return F.resized_crop(clip, i, j, h, w, self.size,
- self.interpolation_mode)
+ return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode)
def __repr__(self) -> str:
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}, scale={self.scale}, ratio={self.ratio})"
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py
index c40a946bb1047..e2e940e51fc97 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py
@@ -21,6 +21,7 @@
from PIL import Image, ImageFile
from ._transforms_video import CenterCropVideo, RandomCropVideo
+
""" VideoFrameDataset """
ImageFile.LOAD_TRUNCATED_IMAGES = True
IMG_EXTENSIONS = [
@@ -72,9 +73,7 @@ def is_image_file(filename):
def find_classes(dir):
assert os.path.exists(dir), f"{dir} does not exist"
- classes = [
- d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))
- ]
+ classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
return classes, class_to_idx
@@ -87,10 +86,7 @@ def class_name_to_idx(annotation_dir):
fpath = os.path.join(annotation_dir, "classInd.txt")
with open(fpath, "r") as f:
data = f.readlines()
- class_to_idx = {
- x.strip().split(" ")[1].lower(): int(x.strip().split(" ")[0]) - 1
- for x in data
- }
+ class_to_idx = {x.strip().split(" ")[1].lower(): int(x.strip().split(" ")[0]) - 1 for x in data}
return class_to_idx
@@ -151,8 +147,7 @@ def split_by_captical(s):
return string.rstrip(" ").lower()
-def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1,
- clip_step=None):
+def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, clip_step=None):
"""
Load consecutive clips and consecutive frames from `dir`.
@@ -181,11 +176,9 @@ def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1,
assert os.path.isdir(video_path)
frames = []
for i, fname in enumerate(sorted(os.listdir(video_path))):
- assert is_image_file(
- fname), f"fname={fname},video_path={video_path},dir={dir}"
+ assert is_image_file(fname), f"fname={fname},video_path={video_path},dir={dir}"
img_path = os.path.join(video_path, fname)
- class_name = video_name.split("_")[
- 1].lower() # v_BoxingSpeedBag_g12_c05 -> boxingspeedbag
+ class_name = video_name.split("_")[1].lower() # v_BoxingSpeedBag_g12_c05 -> boxingspeedbag
class_caption = split_by_captical(
video_name.split("_")[1]
) # v_BoxingSpeedBag_g12_c05 -> BoxingSpeedBag -> boxing speed bag
@@ -201,7 +194,7 @@ def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1,
frames = frames[::frame_stride]
start_indices = list(range(len(frames)))[::clip_step]
for i in start_indices:
- clip = frames[i:i + nframes]
+ clip = frames[i : i + nframes]
if len(clip) == nframes:
clips.append(clip)
return clips, videos
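make_dataset_ucf builds clips by first subsampling frames with frame_stride and then sliding a window of nframes every clip_step frames, dropping incomplete windows at the end. A standalone sketch of that windowing; the clip_step fallback below is an assumption, since its real default is set outside the hunks shown here:

    def make_clips(frames, nframes, frame_stride=1, clip_step=None):
        clip_step = clip_step or nframes           # assumed fallback, not shown in this diff
        frames = frames[::frame_stride]
        clips = []
        for start in range(0, len(frames), clip_step):
            clip = frames[start : start + nframes]
            if len(clip) == nframes:               # keep only full-length windows
                clips.append(clip)
        return clips

    make_clips(list(range(10)), nframes=4, clip_step=2)
    # -> [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]]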
@@ -234,18 +227,19 @@ def load_and_transform_frames(frame_list, loader, img_transform=None):
class VideoFrameDataset(paddle.io.Dataset):
def __init__(
- self,
- data_root,
- resolution,
- video_length,
- dataset_name="",
- subset_split="",
- annotation_dir=None,
- spatial_transform="",
- temporal_transform="",
- frame_stride=1,
- clip_step=None,
- tokenizer=None, ):
+ self,
+ data_root,
+ resolution,
+ video_length,
+ dataset_name="",
+ subset_split="",
+ annotation_dir=None,
+ spatial_transform="",
+ temporal_transform="",
+ frame_stride=1,
+ clip_step=None,
+ tokenizer=None,
+ ):
self.loader = default_loader
self.video_length = video_length
self.subset_split = subset_split
@@ -264,8 +258,7 @@ def __init__(
if annotation_dir is None:
annotation_dir = os.path.join(data_root, "ucfTrainTestlist")
class_to_idx = class_name_to_idx(annotation_dir)
- assert (len(class_to_idx) == 101
- ), f"num of classes = {len(class_to_idx)}, not 101"
+ assert len(class_to_idx) == 101, f"num of classes = {len(class_to_idx)}, not 101"
elif dataset_name == "sky":
classes, class_to_idx = find_classes(video_dir)
else:
@@ -279,9 +272,9 @@ def __init__(
video_length,
class_to_idx,
frame_stride=frame_stride,
- clip_step=clip_step, )
- assert (len(self.clips[0]) == video_length
- ), f"Invalid clip length = {len(self.clips[0])}"
+ clip_step=clip_step,
+ )
+ assert len(self.clips[0]) == video_length, f"Invalid clip length = {len(self.clips[0])}"
if self.temporal_transform == "rand_clips":
self.clips = self.videos
if subset_split == "all":
@@ -296,31 +289,33 @@ def __init__(
print("[VideoFrameDataset] video_length", self.video_length)
if len(self.clips) == 0:
raise RuntimeError(
- f"Found 0 clips in {video_dir}. \nSupported image extensions are: "
- + ",".join(IMG_EXTENSIONS))
- self.img_transform = paddle.vision.transforms.Compose([
- paddle.vision.transforms.ToTensor(),
- paddle.vision.transforms.Normalize((0.5, 0.5, 0.5),
- (0.5, 0.5, 0.5)),
- ])
+ f"Found 0 clips in {video_dir}. \nSupported image extensions are: " + ",".join(IMG_EXTENSIONS)
+ )
+ self.img_transform = paddle.vision.transforms.Compose(
+ [
+ paddle.vision.transforms.ToTensor(),
+ paddle.vision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+ ]
+ )
if self.spatial_transform == "center_crop_resize":
print("Spatial transform: center crop and then resize")
- self.video_transform = paddle.vision.transforms.Compose([
- paddle.vision.transforms.Resize(resolution),
- CenterCropVideo(resolution),
- ])
- self.video_transform_step1 = paddle.vision.transforms.Compose([
- paddle.vision.transforms.Resize(resolution),
- ])
- self.video_transform_step2 = paddle.vision.transforms.Compose(
- [CenterCropVideo(resolution)])
+ self.video_transform = paddle.vision.transforms.Compose(
+ [
+ paddle.vision.transforms.Resize(resolution),
+ CenterCropVideo(resolution),
+ ]
+ )
+ self.video_transform_step1 = paddle.vision.transforms.Compose(
+ [
+ paddle.vision.transforms.Resize(resolution),
+ ]
+ )
+ self.video_transform_step2 = paddle.vision.transforms.Compose([CenterCropVideo(resolution)])
elif self.spatial_transform == "resize":
print("Spatial transform: resize with no crop")
- self.video_transform = paddle.vision.transforms.Resize(
- (resolution, resolution))
+ self.video_transform = paddle.vision.transforms.Resize((resolution, resolution))
elif self.spatial_transform == "random_crop":
- self.video_transform = paddle.vision.transforms.Compose(
- [RandomCropVideo(resolution)])
+ self.video_transform = paddle.vision.transforms.Compose([RandomCropVideo(resolution)])
elif self.spatial_transform == "":
self.video_transform = None
else:
@@ -332,7 +327,8 @@ def __init__(
padding="max_length",
truncation=True,
max_length=tokenizer.model_max_length,
- return_tensors="np", ).input_ids[0]
+ return_tensors="np",
+ ).input_ids[0]
else:
self.text_processing = None
@@ -340,14 +336,13 @@ def __getitem__(self, index):
if self.temporal_transform == "rand_clips":
raw_video = self.clips[index]
rand_idx = random.randint(0, len(raw_video) - self.video_length)
- clip = raw_video[rand_idx:rand_idx + self.video_length]
+ clip = raw_video[rand_idx : rand_idx + self.video_length]
else:
clip = self.clips[index]
assert (
len(clip) == self.video_length
), f"current clip_length={len(clip)}, target clip_length={self.video_length}, {clip}"
- frames, labels = load_and_transform_frames(clip, self.loader,
- self.img_transform)
+ frames, labels = load_and_transform_frames(clip, self.loader, self.img_transform)
assert (
len(frames) == self.video_length
@@ -357,8 +352,7 @@ def __getitem__(self, index):
if self.spatial_transform == "center_crop_resize":
temp_frames = rearrange(frames, "c t h w -> (c t) h w")
temp_frames = self.video_transform_step1(temp_frames)
- frames = rearrange(
- temp_frames, "(c t) h w -> c t h w", c=frames.shape[0])
+ frames = rearrange(temp_frames, "(c t) h w -> c t h w", c=frames.shape[0])
frames = self.video_transform_step2(frames)
else:
frames = self.video_transform(frames)
@@ -377,7 +371,9 @@ def __getitem__(self, index):
"input_ids": self.text_processing(example["caption"]),
}
else:
- tensor_out = {"pixel_values": example["image"], }
+ tensor_out = {
+ "pixel_values": example["image"],
+ }
return tensor_out
def __len__(self):
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py
index a4aefa02a1008..e91a6f6018c21 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py
@@ -25,100 +25,82 @@ class ModelArguments:
# for initialization
task_type: str = field(
default="short",
- metadata={
- "help":
- "Type of train task. Should be one of ['short', 'text2video']"
- }, )
+ metadata={"help": "Type of train task. Should be one of ['short', 'text2video']"},
+ )
pretrained_model_name_or_path: str = field(
default=None,
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
tokenizer_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained tokenizer name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained tokenizer name or path if not use pretrained model name or path"},
+ )
vae_type: str = field(
default="3d",
metadata={"help": "Type of vae to use. Should be one of ['2d', '3d']"},
)
vae_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained vae name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained vae name or path if not use pretrained model name or path"},
+ )
text_encoder_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained text encoder name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained text encoder name or path if not use pretrained model name or path"},
+ )
text_encoder_config_file: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Text encoder config file if not use pretrained text encoder"
- }, )
- is_text_encoder_trainable: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ metadata={"help": "Text encoder config file if not use pretrained text encoder"},
+ )
+    is_text_encoder_trainable: bool = field(default=False, metadata={"help": "Whether or not the text encoder is trainable"})
unet_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained unet name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained unet name or path if not use pretrained model name or path"},
+ )
unet_config_file: Optional[str] = field(
- default=None,
- metadata={"help": "Unet config file if not use pretrained unet"})
+ default=None, metadata={"help": "Unet config file if not use pretrained unet"}
+ )
scheduler_beta_start: Optional[float] = field(
- default=0.0015,
- metadata={"help": "Train or eval scheduler beta start"})
- scheduler_beta_end: Optional[float] = field(
- default=0.0155, metadata={"help": "Train or eval scheduler beta end"})
+ default=0.0015, metadata={"help": "Train or eval scheduler beta start"}
+ )
+ scheduler_beta_end: Optional[float] = field(default=0.0155, metadata={"help": "Train or eval scheduler beta end"})
scheduler_num_train_timesteps: Optional[int] = field(
default=1000,
metadata={"help": "Train or eval scheduler number of train timesteps"},
)
eval_scheduler_num_inference_steps: Optional[int] = field(
- default=50,
- metadata={"help": "Eval scheduler number of inference timesteps"})
+ default=50, metadata={"help": "Eval scheduler number of inference timesteps"}
+ )
# for training
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not to use EMA"})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable xformers memory efficient attention"})
+ default=False, metadata={"help": "enable xformers memory efficient attention"}
+ )
scale_factor: Optional[float] = field(
default=0.33422927,
- metadata={"help": "The scale factor in the first stage encoding"}, )
+ metadata={"help": "The scale factor in the first stage encoding"},
+ )
shift_factor: Optional[float] = field(
default=1.4606637,
- metadata={"help": "The shift factor in the first stage encoding"}, )
+ metadata={"help": "The shift factor in the first stage encoding"},
+ )
loss_type: str = field(
default="l1",
- metadata={
- "help":
- "The loss type to use in training. Should be one of ['l2', 'l1']"
- }, )
+ metadata={"help": "The loss type to use in training. Should be one of ['l2', 'l1']"},
+ )
# for alignment
latents_path: str = field(
default=None,
- metadata={"help": "Path to latents, used for alignment"}, )
- use_paddle_conv_init: bool = field(
- default=False,
- metadata={"help": "Whether or not use paddle conv2d init"})
+ metadata={"help": "Path to latents, used for alignment"},
+ )
+ use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init"})
if_numpy_genarator_random_alignment: bool = field(
default=False,
- metadata={"help": "Whether to align random using numpy generator"}, )
+ metadata={"help": "Whether to align random using numpy generator"},
+ )
numpy_genarator_random_seed: Optional[int] = field(
- default=42, metadata={"help": "The random seed for numpy generator"})
- set_seed_for_alignment: bool = field(
- default=False,
- metadata={"help": "Whether to set seed again for alignment"})
+ default=42, metadata={"help": "The random seed for numpy generator"}
+ )
+ set_seed_for_alignment: bool = field(default=False, metadata={"help": "Whether to set seed again for alignment"})
@dataclass
@@ -128,8 +110,7 @@ class TrainerArguments:
"""
# for log
- image_logging_steps: Optional[int] = field(
- default=1000, metadata={"help": "Log image every X steps."})
+ image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."})
@dataclass
@@ -140,28 +121,29 @@ class VideoFrameDatasetArguments:
train_data_root: str = field(
default="/root/data/lvdm/sky",
- metadata={"help": "The root path of train dataset files"}, )
- train_subset_split: str = field(
- default="train", metadata={"help": "The train subset split"})
+ metadata={"help": "The root path of train dataset files"},
+ )
+ train_subset_split: str = field(default="train", metadata={"help": "The train subset split"})
eval_data_root: str = field(
default="/root/data/lvdm/sky",
- metadata={"help": "The root path of validation dataset files"}, )
- eval_subset_split: str = field(
- default="train", metadata={"help": "The validation subset split"})
+ metadata={"help": "The root path of validation dataset files"},
+ )
+ eval_subset_split: str = field(default="train", metadata={"help": "The validation subset split"})
resolution: int = field(
default=256,
- metadata={"help": "The resolution"}, )
+ metadata={"help": "The resolution"},
+ )
video_length: int = field(
default=16,
- metadata={"help": "The video length"}, )
- dataset_name: str = field(
- default="sky", metadata={"help": "The dataset name"})
+ metadata={"help": "The video length"},
+ )
+ dataset_name: str = field(default="sky", metadata={"help": "The dataset name"})
spatial_transform: str = field(
default="center_crop_resize",
- metadata={"help": "The spatial transform type to use"}, )
- temporal_transform: str = field(
- default="rand_clips",
- metadata={"help": "The temporal transform type to use"})
+ metadata={"help": "The spatial transform type to use"},
+ )
+ temporal_transform: str = field(default="rand_clips", metadata={"help": "The temporal transform type to use"})
clip_step: int = field(
default=None,
- metadata={"help": "The clip step"}, )
+ metadata={"help": "The clip step"},
+ )
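The ModelArguments / TrainerArguments / dataset-arguments dataclasses above declare every option as field(default=..., metadata={"help": ...}) so that a dataclass-aware argument parser can expose them as CLI flags. A minimal sketch of that pattern using only the standard library (the real entry points use PaddleNLP's parser; the class and option names below are illustrative, not from this patch):

import argparse
from dataclasses import dataclass, field, fields
from typing import Optional

@dataclass
class TinyModelArguments:
    task_type: str = field(default="short", metadata={"help": "Type of train task"})
    use_ema: bool = field(default=False, metadata={"help": "Whether or not to use EMA"})
    scale_factor: Optional[float] = field(default=0.33422927, metadata={"help": "First-stage scale factor"})

def build_parser(dc_cls) -> argparse.ArgumentParser:
    # One flag per dataclass field; the help text comes straight from metadata.
    parser = argparse.ArgumentParser()
    for f in fields(dc_cls):
        help_text = f.metadata.get("help", "")
        if isinstance(f.default, bool):
            parser.add_argument(f"--{f.name}", action="store_true", help=help_text)
        else:
            arg_type = str if f.default is None else type(f.default)
            parser.add_argument(f"--{f.name}", type=arg_type, default=f.default, help=help_text)
    return parser

args = build_parser(TinyModelArguments).parse_args(["--task_type", "text2video"])
print(args.task_type, args.use_ema, args.scale_factor)  # text2video False 0.33422927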
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py
index feb46a5f5e3ad..39000183c6cce 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py
@@ -25,100 +25,79 @@ class ModelArguments:
# for initialization
task_type: str = field(
default="text2video",
- metadata={
- "help":
- "Type of train task. Should be one of ['short', 'text2video']"
- }, )
+ metadata={"help": "Type of train task. Should be one of ['short', 'text2video']"},
+ )
pretrained_model_name_or_path: str = field(
default=None,
- metadata={
- "help":
- "Path to pretrained model or model, when we want to resume training."
- }, )
+ metadata={"help": "Path to pretrained model or model, when we want to resume training."},
+ )
tokenizer_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained tokenizer name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained tokenizer name or path if not use pretrained model name or path"},
+ )
vae_type: str = field(
default="2d",
metadata={"help": "Type of vae to use. Should be one of ['2d', '3d']"},
)
vae_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained vae name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained vae name or path if not use pretrained model name or path"},
+ )
text_encoder_name_or_path: Optional[str] = field(
default="openai/clip-vit-large-patch14",
- metadata={
- "help":
- "Pretrained text encoder name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained text encoder name or path if not use pretrained model name or path"},
+ )
text_encoder_config_file: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Text encoder config file if not use pretrained text encoder"
- }, )
- is_text_encoder_trainable: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ metadata={"help": "Text encoder config file if not use pretrained text encoder"},
+ )
+ is_text_encoder_trainable: bool = field(default=False, metadata={"help": "Whether or not the text encoder is trainable"})
unet_name_or_path: Optional[str] = field(
default=None,
- metadata={
- "help":
- "Pretrained unet name or path if not use pretrained model name or path"
- }, )
+ metadata={"help": "Pretrained unet name or path if not use pretrained model name or path"},
+ )
unet_config_file: Optional[str] = field(
- default=None,
- metadata={"help": "Unet config file if not use pretrained unet"})
+ default=None, metadata={"help": "Unet config file if not use pretrained unet"}
+ )
scheduler_beta_start: Optional[float] = field(
- default=0.00085,
- metadata={"help": "Train or eval scheduler beta start"})
- scheduler_beta_end: Optional[float] = field(
- default=0.012, metadata={"help": "Train or eval scheduler beta end"})
+ default=0.00085, metadata={"help": "Train or eval scheduler beta start"}
+ )
+ scheduler_beta_end: Optional[float] = field(default=0.012, metadata={"help": "Train or eval scheduler beta end"})
scheduler_num_train_timesteps: Optional[int] = field(
default=1000,
metadata={"help": "Train or eval scheduler number of train timesteps"},
)
eval_scheduler_num_inference_steps: Optional[int] = field(
- default=50,
- metadata={"help": "Eval scheduler number of inference timesteps"})
+ default=50, metadata={"help": "Eval scheduler number of inference timesteps"}
+ )
# for training
- use_ema: bool = field(
- default=False, metadata={"help": "Whether or not use ema"})
+ use_ema: bool = field(default=False, metadata={"help": "Whether or not to use EMA"})
enable_xformers_memory_efficient_attention: bool = field(
- default=False,
- metadata={"help": "enable xformers memory efficient attention"})
+ default=False, metadata={"help": "enable xformers memory efficient attention"}
+ )
scale_factor: Optional[float] = field(
default=0.18215,
- metadata={"help": "The scale factor in the first stage encoding"}, )
- shift_factor: Optional[float] = field(
- default=0,
- metadata={"help": "The shift factor in the first stage encoding"})
+ metadata={"help": "The scale factor in the first stage encoding"},
+ )
+ shift_factor: Optional[float] = field(default=0, metadata={"help": "The shift factor in the first stage encoding"})
loss_type: str = field(
default="l2",
- metadata={
- "help":
- "The loss type to use in training. Should be one of ['l2', 'l1']"
- }, )
+ metadata={"help": "The loss type to use in training. Should be one of ['l2', 'l1']"},
+ )
# for alignment
latents_path: str = field(
default=None,
- metadata={"help": "Path to latents, used for alignment"}, )
- use_paddle_conv_init: bool = field(
- default=False,
- metadata={"help": "Whether or not use paddle conv2d init"})
+ metadata={"help": "Path to latents, used for alignment"},
+ )
+ use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init"})
if_numpy_genarator_random_alignment: bool = field(
default=False,
- metadata={"help": "Whether to align random using numpy generator"}, )
+ metadata={"help": "Whether to align random using numpy generator"},
+ )
numpy_genarator_random_seed: Optional[int] = field(
- default=42, metadata={"help": "The random seed for numpy generator"})
- set_seed_for_alignment: bool = field(
- default=False,
- metadata={"help": "Whether to set seed again for alignment"})
+ default=42, metadata={"help": "The random seed for numpy generator"}
+ )
+ set_seed_for_alignment: bool = field(default=False, metadata={"help": "Whether to set seed again for alignment"})
@dataclass
@@ -128,8 +107,7 @@ class TrainerArguments:
"""
# for log
- image_logging_steps: Optional[int] = field(
- default=1000, metadata={"help": "Log image every X steps."})
+ image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."})
@dataclass
@@ -140,27 +118,34 @@ class WebVidDatasetArguments:
train_data_root: str = field(
default="/root/data/lvdm/webvid/share_datasets",
- metadata={"help": "The root path of train dataset files"}, )
+ metadata={"help": "The root path of train dataset files"},
+ )
train_annotation_path: str = field(
default="/root/data/lvdm/webvid/share_datasets/train_type_data.list",
- metadata={"help": "The root path of train annotation"}, )
- train_subset_split: str = field(
- default="all", metadata={"help": "The train subset split"})
+ metadata={"help": "The root path of train annotation"},
+ )
+ train_subset_split: str = field(default="all", metadata={"help": "The train subset split"})
eval_data_root: str = field(
default="/root/data/lvdm/webvid/share_datasets",
- metadata={"help": "The root path of validation dataset files"}, )
+ metadata={"help": "The root path of validation dataset files"},
+ )
eval_annotation_path: str = field(
default="/root/data/lvdm/webvid/share_datasets/val_type_data.list",
- metadata={"help": "The root path of validation annotation"}, )
- eval_subset_split: str = field(
- default="all", metadata={"help": "The validation subset split"})
+ metadata={"help": "The root path of validation annotation"},
+ )
+ eval_subset_split: str = field(default="all", metadata={"help": "The validation subset split"})
resolution: int = field(
default=256,
- metadata={"help": "The resolution"}, )
+ metadata={"help": "The resolution"},
+ )
video_length: int = field(
default=16,
- metadata={"help": "The video length"}, )
- frame_stride: int = field(default=4, )
+ metadata={"help": "The video length"},
+ )
+ frame_stride: int = field(
+ default=4,
+ )
spatial_transform: str = field(
default="center_crop_resize",
- metadata={"help": "The spatial transform type to use"}, )
+ metadata={"help": "The spatial transform type to use"},
+ )
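Both argument files expose scale_factor / shift_factor, which normalize the VAE latents before the diffusion UNet sees them (0.33422927 / 1.4606637 for the 3D VAE in the "short" task, 0.18215 / 0 for the 2D VAE in text2video). A small numpy sketch of that normalization, consistent with get_first_stage_encoding in lvdm_model.py below; the decode-side inverse here is an assumption for illustration, not code from this patch:

import numpy as np

scale_factor, shift_factor = 0.33422927, 1.4606637  # "short" task defaults

def to_model_space(z):
    # Applied to the VAE encoder output before diffusion training/sampling.
    return scale_factor * (z + shift_factor)

def to_vae_space(z_scaled):
    # Assumed inverse, applied before handing latents back to the VAE decoder.
    return z_scaled / scale_factor - shift_factor

z = np.random.randn(1, 4, 16, 32, 32).astype("float32")
assert np.allclose(to_vae_space(to_model_space(z)), z, atol=1e-5)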
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py
index 9b00773644bbb..a087314494b33 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py
@@ -21,23 +21,34 @@
import numpy as np
import paddle
import paddle.nn as nn
-from einops import rearrange, repeat
+from einops import rearrange
from paddlenlp.transformers import AutoTokenizer, CLIPTextModel
from paddlenlp.utils.log import logger
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- LVDMAutoencoderKL, LVDMUNet3DModel,
- is_ppxformers_available)
-from ppdiffusers.initializer import (normal_, reset_initialized_parameter,
- xavier_uniform_, zeros_)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ LVDMAutoencoderKL,
+ LVDMUNet3DModel,
+ is_ppxformers_available,
+)
+from ppdiffusers.initializer import (
+ normal_,
+ reset_initialized_parameter,
+ xavier_uniform_,
+ zeros_,
+)
from ppdiffusers.models.ema import LitEma
-from ppdiffusers.models.lvdm_attention_temporal import (RelativePosition,
- TemporalCrossAttention)
+from ppdiffusers.models.lvdm_attention_temporal import (
+ RelativePosition,
+ TemporalCrossAttention,
+)
from ppdiffusers.models.lvdm_distributions import DiagonalGaussianDistribution
from ppdiffusers.training_utils import freeze_params
-def set_seed(seed: int=1234, args=None):
+def set_seed(seed: int = 1234, args=None):
if args is None:
random.seed(seed)
np.random.seed(seed)
@@ -45,16 +56,14 @@ def set_seed(seed: int=1234, args=None):
if args is not None:
if args.use_hybrid_parallel:
- from paddle.distributed.fleet.meta_parallel import \
- get_rng_state_tracker
+ from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
random.seed(args.seed + args.dataset_rank)
np.random.seed(args.seed + args.dataset_rank)
paddle.seed(args.seed + args.dataset_rank)
# local_seed/ global_seed is used to control dropout in ModelParallel
- local_seed = (args.seed + 59999 + args.tensor_parallel_rank * 10 +
- args.pipeline_parallel_rank * 1000)
+ local_seed = args.seed + 59999 + args.tensor_parallel_rank * 10 + args.pipeline_parallel_rank * 1000
global_seed = args.seed + 100003 + args.dataset_rank
tracker = get_rng_state_tracker()
@@ -78,12 +87,10 @@ def split_video_to_clips(video, clip_length, drop_left=True):
video_length = video.shape[2]
shape = video.shape
if video_length % clip_length != 0 and drop_left:
- video = video[:, :, :video_length // clip_length * clip_length, :, :]
- print(
- f"[split_video_to_clips] Drop frames from {shape} to {video.shape}")
+ video = video[:, :, : video_length // clip_length * clip_length, :, :]
+ print(f"[split_video_to_clips] Drop frames from {shape} to {video.shape}")
nclips = video_length // clip_length
- clips = rearrange(
- video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips)
+ clips = rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips)
return clips
@@ -104,17 +111,17 @@ def __init__(self, model_args):
if model_args.task_type == "text2video":
tokenizer_name_or_path = (
model_args.tokenizer_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path,
- "tokenizer"))
- self.tokenizer = AutoTokenizer.from_pretrained(
- tokenizer_name_or_path)
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")
+ )
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
# init vae
vae_name_or_path = (
model_args.vae_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "vae"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "vae")
+ )
self.vae_type = model_args.vae_type
self.encoder_type = model_args.vae_type
if model_args.vae_type == "2d":
@@ -122,7 +129,7 @@ def __init__(self, model_args):
elif model_args.vae_type == "3d":
self.vae = LVDMAutoencoderKL.from_pretrained(vae_name_or_path)
else:
- raise ValueError(f"`vae_type` to be `2d` or `3d`.")
+ raise ValueError("`vae_type` must be `2d` or `3d`.")
freeze_params(self.vae.parameters())
logger.info("Freeze vae parameters!")
@@ -130,16 +137,14 @@ def __init__(self, model_args):
if model_args.task_type == "text2video":
text_encoder_name_or_path = (
model_args.text_encoder_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path,
- "text_encoder"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder")
+ )
self.text_encoder_is_pretrained = text_encoder_name_or_path is not None
if self.text_encoder_is_pretrained:
- self.text_encoder = CLIPTextModel.from_pretrained(
- text_encoder_name_or_path)
+ self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path)
else:
- self.text_encoder = CLIPTextModel(
- **read_json(model_args.text_encoder_config_file))
+ self.text_encoder = CLIPTextModel(**read_json(model_args.text_encoder_config_file))
self.init_text_encoder_weights()
if not model_args.is_text_encoder_trainable:
freeze_params(self.text_encoder.parameters())
@@ -148,14 +153,14 @@ def __init__(self, model_args):
# init unet
unet_name_or_path = (
model_args.unet_name_or_path
- if model_args.pretrained_model_name_or_path is None else
- os.path.join(model_args.pretrained_model_name_or_path, "unet"))
+ if model_args.pretrained_model_name_or_path is None
+ else os.path.join(model_args.pretrained_model_name_or_path, "unet")
+ )
self.unet_is_pretrained = model_args.pretrained_model_name_or_path is not None
if self.unet_is_pretrained:
self.unet = LVDMUNet3DModel.from_pretrained(unet_name_or_path)
else:
- self.unet = LVDMUNet3DModel(
- **read_json(model_args.unet_config_file))
+ self.unet = LVDMUNet3DModel(**read_json(model_args.unet_config_file))
self.init_unet_weights()
# init train scheduler
@@ -163,7 +168,8 @@ def __init__(self, model_args):
beta_start=model_args.scheduler_beta_start,
beta_end=model_args.scheduler_beta_end,
beta_schedule="scaled_linear",
- num_train_timesteps=model_args.scheduler_num_train_timesteps, )
+ num_train_timesteps=model_args.scheduler_num_train_timesteps,
+ )
# init eval scheduler
self.eval_scheduler = DDIMScheduler(
@@ -173,23 +179,23 @@ def __init__(self, model_args):
num_train_timesteps=model_args.scheduler_num_train_timesteps,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
- self.eval_scheduler.set_timesteps(
- model_args.eval_scheduler_num_inference_steps)
+ set_alpha_to_one=False,
+ )
+ self.eval_scheduler.set_timesteps(model_args.eval_scheduler_num_inference_steps)
# set training parameters
self.use_ema = model_args.use_ema
if self.use_ema:
self.model_ema = LitEma(self.unet)
- if (model_args.enable_xformers_memory_efficient_attention and
- is_ppxformers_available()):
+ if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
self.unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
self.scale_factor = model_args.scale_factor
self.shift_factor = model_args.shift_factor
self.loss_type = model_args.loss_type
@@ -198,24 +204,19 @@ def __init__(self, model_args):
self.use_preconfig_latents = False
if model_args.latents_path:
self.use_preconfig_latents = True
- self.register_buffer("preconfig_latents",
- paddle.load(model_args.latents_path))
+ self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path))
- self.if_numpy_genarator_random_alignment = (
- model_args.if_numpy_genarator_random_alignment)
+ self.if_numpy_genarator_random_alignment = model_args.if_numpy_genarator_random_alignment
if self.if_numpy_genarator_random_alignment:
- self.generator = np.random.RandomState(
- model_args.numpy_genarator_random_seed)
+ self.generator = np.random.RandomState(model_args.numpy_genarator_random_seed)
self.set_seed_for_alignment = model_args.set_seed_for_alignment
def init_text_encoder_weights(self):
if not self.text_encoder_is_pretrained:
reset_initialized_parameter(self.text_encoder)
- normal_(self.text_encoder.embeddings.word_embeddings.weight, 0,
- 0.02)
- normal_(self.text_encoder.embeddings.position_embeddings.weight, 0,
- 0.02)
+ normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02)
+ normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02)
def init_unet_weights(self):
if not self.unet_is_pretrained:
@@ -256,9 +257,7 @@ def get_first_stage_encoding(self, encoder_posterior, noise=None):
elif isinstance(encoder_posterior, paddle.Tensor):
z = encoder_posterior
else:
- raise NotImplementedError(
- f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented"
- )
+ raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
z = self.scale_factor * (z + self.shift_factor)
return z
@@ -291,12 +290,7 @@ def decode(self, z, **kwargs):
return results
@paddle.no_grad()
- def overlapped_decode(self,
- z,
- max_z_t=None,
- overlap_t=2,
- predict_cids=False,
- force_not_quantize=False):
+ def overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False):
if max_z_t is None:
max_z_t = z.shape[2]
assert max_z_t > overlap_t
@@ -315,69 +309,56 @@ def overlapped_decode(self,
reses = []
for i, z_ in enumerate(zs):
if i == 0:
- res = self.decode(
- z_, predict_cids,
- force_not_quantize).cpu()[:, :, :max_x_t - drop_r_x, :, :]
+ res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, : max_x_t - drop_r_x, :, :]
elif i == len(zs) - 1:
- res = self.decode(
- z_, predict_cids,
- force_not_quantize).cpu()[:, :, drop_l_x:, :, :]
+ res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, drop_l_x:, :, :]
else:
- res = self.decode(z_, predict_cids, force_not_quantize).cpu(
- )[:, :, drop_l_x:max_x_t - drop_r_x, :, :]
+ res = self.decode(z_, predict_cids, force_not_quantize).cpu()[
+ :, :, drop_l_x : max_x_t - drop_r_x, :, :
+ ]
reses.append(res)
results = paddle.concat(x=reses, axis=2)
return results
@paddle.no_grad()
- def decode_first_stage_2DAE_video(self,
- z,
- decode_bs=16,
- return_cpu=True,
- **kwargs):
+ def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs):
b, _, t, _, _ = z.shape
z = rearrange(z, "b c t h w -> (b t) c h w")
if decode_bs is None:
results = self.decode(z, **kwargs)
else:
- z = paddle.split(
- x=z, num_or_sections=z.shape[0] // decode_bs, axis=0)
+ z = paddle.split(x=z, num_or_sections=z.shape[0] // decode_bs, axis=0)
if return_cpu:
- results = paddle.concat(
- x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0)
+ results = paddle.concat(x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0)
else:
- results = paddle.concat(
- x=[self.decode(z_, **kwargs) for z_ in z], axis=0)
- results = rearrange(
- results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous()
+ results = paddle.concat(x=[self.decode(z_, **kwargs) for z_ in z], axis=0)
+ results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous()
return results
@paddle.no_grad()
def decode_latents(
- self,
- z,
- decode_bs=16,
- return_cpu=True,
- bs=None,
- decode_single_video_allframes=False,
- max_z_t=None,
- overlapped_length=0,
- **kwargs, ):
+ self,
+ z,
+ decode_bs=16,
+ return_cpu=True,
+ bs=None,
+ decode_single_video_allframes=False,
+ max_z_t=None,
+ overlapped_length=0,
+ **kwargs,
+ ):
b, _, t, _, _ = z.shape
if self.encoder_type == "2d" and z.dim() == 5:
- return self.decode_first_stage_2DAE_video(
- z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs)
+ return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs)
if decode_single_video_allframes:
z = paddle.split(x=z, num_or_sections=z.shape[0] // 1, axis=0)
cat_dim = 0
elif max_z_t is not None:
if self.encoder_type == "3d":
- z = paddle.split(
- x=z, num_or_sections=z.shape[2] // max_z_t, axis=2)
+ z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2)
cat_dim = 2
if self.encoder_type == "2d":
- z = paddle.split(
- x=z, num_or_sections=z.shape[0] // max_z_t, axis=0)
+ z = paddle.split(x=z, num_or_sections=z.shape[0] // max_z_t, axis=0)
cat_dim = 0
# elif self.split_clips and self.downfactor_t is not None or self.clip_length is not None and self.downfactor_t is not None and z.shape[
# 2
@@ -410,8 +391,7 @@ def get_loss(self, pred, target, mean=True, mask=None):
if mean:
loss = paddle.nn.functional.mse_loss(target, pred)
else:
- loss = paddle.nn.functional.mse_loss(
- target, pred, reduction="none")
+ loss = paddle.nn.functional.mse_loss(target, pred, reduction="none")
else:
raise NotImplementedError(f"unknown loss type '{self.loss_type}'")
if mask is not None:
@@ -438,18 +418,18 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
self.generator.randint(
0,
self.noise_scheduler.num_train_timesteps,
- size=(latents.shape[0], ), ),
- dtype="int64", )
- noise = paddle.to_tensor(
- self.generator.randn(*latents.shape), dtype="float32")
+ size=(latents.shape[0],),
+ ),
+ dtype="int64",
+ )
+ noise = paddle.to_tensor(self.generator.randn(*latents.shape), dtype="float32")
else:
timesteps = paddle.randint(
- 0, self.noise_scheduler.num_train_timesteps,
- (latents.shape[0], )).astype("int64")
+ 0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)
+ ).astype("int64")
noise = paddle.randn_like(latents)
- noisy_latents = self.noise_scheduler.add_noise(latents, noise,
- timesteps)
+ noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
encoder_hidden_states = None
if self.task_type == "text2video":
encoder_hidden_states = self.text_encoder(input_ids)[0]
@@ -458,7 +438,8 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs):
noise_pred = self.unet(
noisy_latents,
timesteps,
- context=encoder_hidden_states, ).sample
+ context=encoder_hidden_states,
+ ).sample
loss = self.get_loss(noise_pred, noise, mean=True)
return loss
@@ -485,20 +466,19 @@ def log_reconstruct_frames(self, pixel_values=None, **kwargs):
@paddle.no_grad()
def log_text2video_sample_frames(
- self,
- input_ids=None,
- height=256,
- width=256,
- eta=1.0,
- guidance_scale=9,
- num_frames=16,
- **kwargs, ):
+ self,
+ input_ids=None,
+ height=256,
+ width=256,
+ eta=1.0,
+ guidance_scale=9,
+ num_frames=16,
+ **kwargs,
+ ):
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log 2 video
if input_ids.shape[0] > 2:
input_ids = input_ids[:2]
@@ -512,10 +492,10 @@ def log_text2video_sample_frames(
padding="max_length",
truncation=True,
max_length=max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings], axis=0)
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0)
if self.use_preconfig_latents:
latents = self.preconfig_latents
else:
@@ -528,36 +508,32 @@ def log_text2video_sample_frames(
]
latents = paddle.randn(shape)
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for t in self.eval_scheduler.timesteps:
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
# ddim does not use this
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
- context=text_embeddings, ).sample
+ context=text_embeddings,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.eval_scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
sampled_videos = self.decode_latents(latents)
@@ -574,19 +550,11 @@ def log_text2video_sample_frames(
return videos_frames
@paddle.no_grad()
- def log_short_sample_frames(self,
- height=256,
- width=256,
- eta=0.0,
- guidance_scale=9,
- num_frames=16,
- **kwargs):
+ def log_short_sample_frames(self, height=256, width=256, eta=0.0, guidance_scale=9, num_frames=16, **kwargs):
self.eval()
with self.ema_scope():
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# only log 2 video
batch_size = 2
@@ -602,8 +570,7 @@ def log_short_sample_frames(self,
]
latents = paddle.randn(shape)
- accepts_eta = "eta" in set(
- inspect.signature(self.eval_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
@@ -613,17 +580,16 @@ def log_short_sample_frames(self,
latent_model_input = latents
# ddim does not use this
- latent_model_input = self.eval_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
- t, ).sample
+ t,
+ ).sample
# compute the previous noisy sample x_t -> x_t-1
- latents = self.eval_scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
sampled_videos = self.decode_latents(latents)
@@ -643,7 +609,6 @@ def set_recompute(self, value=False):
def fn(layer):
if hasattr(layer, "gradient_checkpointing"):
layer.gradient_checkpointing = value
- print("Set", layer.__class__, "recompute",
- layer.gradient_checkpointing)
+ print("Set", layer.__class__, "recompute", layer.gradient_checkpointing)
self.unet.apply(fn)
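log_text2video_sample_frames above duplicates the latents, runs the UNet on the unconditional and text-conditioned halves, and recombines the two predictions with the guidance scale. A dependency-free toy of just that classifier-free-guidance arithmetic; fake_unet is a stand-in, not the real LVDMUNet3DModel:

import numpy as np

def fake_unet(latent_batch, t, context):
    # Arbitrary deterministic stand-in for the 3D UNet noise prediction.
    return latent_batch * 0.1 + context.mean()

def cfg_noise_pred(latents, t, text_emb, uncond_emb, guidance_scale=9.0):
    latent_model_input = np.concatenate([latents, latents], axis=0)
    context = np.concatenate([uncond_emb, text_emb], axis=0)  # same ordering as the pipeline
    noise_pred = fake_unet(latent_model_input, t, context)
    noise_uncond, noise_text = np.split(noise_pred, 2, axis=0)
    return noise_uncond + guidance_scale * (noise_text - noise_uncond)

latents = np.random.randn(2, 4, 16, 32, 32).astype("float32")
pred = cfg_noise_pred(latents, t=999, text_emb=np.ones((2, 77, 768)), uncond_emb=np.zeros((2, 77, 768)))
print(pred.shape)  # (2, 4, 16, 32, 32)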
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py
index 90d32ee1eda0b..9fa09eb560f4c 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py
@@ -19,8 +19,11 @@
import paddle.amp.auto_cast as autocast
from paddle.io import DataLoader
from paddlenlp.trainer import Trainer
-from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK,
- VisualDLCallback, rewrite_logs)
+from paddlenlp.trainer.integrations import (
+ INTEGRATION_TO_CALLBACK,
+ VisualDLCallback,
+ rewrite_logs,
+)
from paddlenlp.trainer.utils.helper import nested_detach
from paddlenlp.utils.log import logger
@@ -39,19 +42,17 @@ def autocast_smart_context_manager(self, args):
"c_softmax_with_cross_entropy",
],
level=args.fp16_opt_level,
- dtype=amp_dtype, )
+ dtype=amp_dtype,
+ )
else:
- ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
return ctx_manager
def on_step_end(self, args, state, control, model=None, **kwargs):
if hasattr(model, "on_train_batch_end"):
model.on_train_batch_end()
- if (args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0:
control.should_log = True
def on_log(self, args, state, control, logs=None, **kwargs):
@@ -62,27 +63,30 @@ def on_log(self, args, state, control, logs=None, **kwargs):
inputs = kwargs.get("inputs", None)
model = kwargs.get("model", None)
image_logs = {}
- if (inputs is not None and model is not None and
- args.image_logging_steps > 0 and
- state.global_step % args.image_logging_steps == 0):
+ if (
+ inputs is not None
+ and model is not None
+ and args.image_logging_steps > 0
+ and state.global_step % args.image_logging_steps == 0
+ ):
with self.autocast_smart_context_manager(args):
- image_logs["reconstruction"] = model.log_reconstruct_frames(
- pixel_values=inputs["pixel_values"])
+ image_logs["reconstruction"] = model.log_reconstruct_frames(pixel_values=inputs["pixel_values"])
if model.task_type == "text2video":
- image_logs[
- "ddim-samples"] = model.log_text2video_sample_frames(
- input_ids=inputs["input_ids"],
- height=256,
- width=256,
- eta=1.0,
- guidance_scale=9,
- num_frames=16, )
+ image_logs["ddim-samples"] = model.log_text2video_sample_frames(
+ input_ids=inputs["input_ids"],
+ height=256,
+ width=256,
+ eta=1.0,
+ guidance_scale=9,
+ num_frames=16,
+ )
elif model.task_type == "short":
image_logs["ddim-samples"] = model.log_short_sample_frames(
height=256,
width=256,
eta=1.0,
- num_frames=16, )
+ num_frames=16,
+ )
if self.vdl_writer is None:
self._init_summary_writer(args)
@@ -97,11 +101,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of VisualDL's writer.add_scalar() "
- "is incorrect so we dropped this attribute.")
+ "is incorrect so we dropped this attribute."
+ )
# log images
for k, v in image_logs.items():
- self.vdl_writer.add_image(
- k, v, state.global_step, dataformats="NHWC")
+ self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC")
self.vdl_writer.flush()
@@ -117,43 +121,41 @@ def compute_loss(self, model, inputs, return_outputs=False):
def get_train_dataloader(self):
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
- if isinstance(self.train_dataset, VideoFrameDataset) or isinstance(
- self.train_dataset, WebVidDataset):
+ if isinstance(self.train_dataset, VideoFrameDataset) or isinstance(self.train_dataset, WebVidDataset):
return DataLoader(
self.train_dataset,
batch_size=self.args.train_batch_size,
num_workers=self.args.dataloader_num_workers,
shuffle=True,
worker_init_fn=None,
- collate_fn=None, )
+ collate_fn=None,
+ )
else:
return super().get_train_dataloader()
def prediction_step(
- self,
- model,
- inputs,
- prediction_loss_only,
- ignore_keys, ):
+ self,
+ model,
+ inputs,
+ prediction_loss_only,
+ ignore_keys,
+ ):
if self.args.pipeline_parallel_degree > 1:
# hack for pipeline mode
inputs = self._prepare_inputs(inputs)
- return self.prediction_pipeline_step(
- model, inputs, prediction_loss_only, ignore_keys)
+ return self.prediction_pipeline_step(model, inputs, prediction_loss_only, ignore_keys)
has_labels = all(inputs.get(k) is not None for k in self.label_names)
inputs = self._prepare_inputs(inputs)
if ignore_keys is None:
if hasattr(self.model, "config"):
- ignore_keys = getattr(self.model.config,
- "keys_to_ignore_at_inference", [])
+ ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
else:
ignore_keys = []
# labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
if has_labels:
- labels = nested_detach(
- tuple(inputs.get(name) for name in self.label_names))
+ labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
if len(labels) == 1:
labels = labels[0]
else:
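on_step_end and on_log above gate image logging on the same condition: image_logging_steps must be positive and the global step must land on a multiple of it. A tiny stand-alone sketch of that cadence check, with illustrative numbers:

def should_log_images(global_step: int, image_logging_steps: int) -> bool:
    return image_logging_steps > 0 and global_step % image_logging_steps == 0

assert should_log_images(2000, 1000)
assert not should_log_images(1500, 1000)
assert not should_log_images(10, 0)  # disabled when image_logging_steps <= 0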
diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py
index 345c3311c88cd..b6636d5924fec 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py
@@ -38,16 +38,17 @@ class WebVidDataset(paddle.io.Dataset):
"""
def __init__(
- self,
- data_root,
- resolution,
- video_length,
- subset_split,
- frame_stride,
- spatial_transform="",
- load_method="decord",
- annotation_path=None,
- tokenizer=None, ):
+ self,
+ data_root,
+ resolution,
+ video_length,
+ subset_split,
+ frame_stride,
+ spatial_transform="",
+ load_method="decord",
+ annotation_path=None,
+ tokenizer=None,
+ ):
self.annotation_path = annotation_path
self.data_root = data_root
self.resolution = resolution
@@ -57,9 +58,7 @@ def __init__(
self.spatial_transform = spatial_transform
self.load_method = load_method
assert self.load_method in ["decord", "readvideo", "videoclips"]
- assert self.subset_split in [
- "train", "test", "all", "results_10M_train"
- ]
+ assert self.subset_split in ["train", "test", "all", "results_10M_train"]
self.exts = ["avi", "mp4", "webm"]
if isinstance(self.resolution, int):
self.resolution = [self.resolution, self.resolution]
@@ -67,22 +66,23 @@ def __init__(
self.max_resolution = max(self.resolution)
if self.spatial_transform == "center_crop_resize":
print("Spatial transform: center crop and then resize")
- self.video_transform = paddle.vision.transforms.Compose([
- paddle.vision.transforms.Resize(resolution),
- CenterCropVideo(resolution),
- ])
- self.video_transform_step1 = paddle.vision.transforms.Compose([
- paddle.vision.transforms.Resize(resolution),
- ])
- self.video_transform_step2 = paddle.vision.transforms.Compose(
- [CenterCropVideo(resolution)])
+ self.video_transform = paddle.vision.transforms.Compose(
+ [
+ paddle.vision.transforms.Resize(resolution),
+ CenterCropVideo(resolution),
+ ]
+ )
+ self.video_transform_step1 = paddle.vision.transforms.Compose(
+ [
+ paddle.vision.transforms.Resize(resolution),
+ ]
+ )
+ self.video_transform_step2 = paddle.vision.transforms.Compose([CenterCropVideo(resolution)])
elif self.spatial_transform == "resize":
print("Spatial transform: resize with no crop")
- self.video_transform = paddle.vision.transforms.Resize(
- (resolution, resolution))
+ self.video_transform = paddle.vision.transforms.Resize((resolution, resolution))
elif self.spatial_transform == "random_crop":
- self.video_transform = paddle.vision.transforms.Compose(
- [RandomCropVideo(resolution)])
+ self.video_transform = paddle.vision.transforms.Compose([RandomCropVideo(resolution)])
elif self.spatial_transform == "":
self.video_transform = None
else:
@@ -96,7 +96,8 @@ def __init__(
truncation=True,
max_length=tokenizer.model_max_length,
return_tensors="pd",
- return_overflowing_tokens=False, ).input_ids[0]
+ return_overflowing_tokens=False,
+ ).input_ids[0]
else:
self.text_processing = None
@@ -111,12 +112,9 @@ def _make_dataset(self):
self.annotations = fp.read().splitlines()
else:
self.annotations = sum(
- [
- glob.glob(
- os.path.join(data_folder, "**", f"*.{ext}"),
- recursive=True) for ext in self.exts
- ],
- [], )
+ [glob.glob(os.path.join(data_folder, "**", f"*.{ext}"), recursive=True) for ext in self.exts],
+ [],
+ )
print(f"Number of videos = {len(self.annotations)}")
def get_annotation(self, index):
@@ -140,7 +138,8 @@ def get_data_decord(self, index):
video_path,
ctx=cpu(0),
width=self.max_resolution,
- height=self.max_resolution, )
+ height=self.max_resolution,
+ )
if len(video_reader) < self.video_length:
index += 1
continue
@@ -155,23 +154,20 @@ def get_data_decord(self, index):
rand_idx = random.randint(0, len(all_frames) - self.video_length)
frame_indices = list(range(rand_idx, rand_idx + self.video_length))
frames = video_reader.get_batch(frame_indices)
- assert (frames.shape[0] == self.video_length
- ), f"{len(frames)}, self.video_length={self.video_length}"
- frames = (paddle.to_tensor(data=frames.asnumpy())
- .astype(dtype="float32").transpose(perm=[0, 3, 1, 2]))
+ assert frames.shape[0] == self.video_length, f"{len(frames)}, self.video_length={self.video_length}"
+ frames = paddle.to_tensor(data=frames.asnumpy()).astype(dtype="float32").transpose(perm=[0, 3, 1, 2])
if self.video_transform is not None:
if self.spatial_transform == "center_crop_resize":
temp_frames = rearrange(frames, "c t h w -> (c t) h w")
temp_frames = self.video_transform_step1(temp_frames)
- frames = rearrange(
- temp_frames, "(c t) h w -> c t h w", c=frames.shape[0])
+ frames = rearrange(temp_frames, "(c t) h w -> c t h w", c=frames.shape[0])
frames = self.video_transform_step2(frames)
else:
frames = self.video_transform(frames)
frames = frames.transpose(perm=[1, 0, 2, 3]).astype(dtype="float32")
- assert (frames.shape[2] == self.resolution[0] and
- frames.shape[3] == self.resolution[1]
- ), f"frames={frames.shape}, self.resolution={self.resolution}"
+ assert (
+ frames.shape[2] == self.resolution[0] and frames.shape[3] == self.resolution[1]
+ ), f"frames={frames.shape}, self.resolution={self.resolution}"
frames = (frames / 255 - 0.5) * 2
data = {"video": frames, "caption": caption}
@@ -181,7 +177,9 @@ def get_data_decord(self, index):
"input_ids": self.text_processing(data["caption"]),
}
else:
- tensor_out = {"pixel_values": data["video"], }
+ tensor_out = {
+ "pixel_values": data["video"],
+ }
return tensor_out
def get_data_readvideo(self, index):
@@ -215,9 +213,9 @@ def main():
subset_split=subset_split,
frame_stride=frame_stride,
spatial_transform=spatial_transform,
- annotation_path=annotation_path, )
- dataloader = paddle.io.data.DataLoader(
- dataset, batch_size=2, shuffle=False, num_workers=0)
+ annotation_path=annotation_path,
+ )
+ dataloader = paddle.io.DataLoader(dataset, batch_size=2, shuffle=False, num_workers=0)
starttime = time.time()
for id, data in enumerate(dataloader):
endtime = time.time()
@@ -227,7 +225,8 @@ def main():
endtime - starttime,
" shape:",
data["video"].shape,
- data["caption"], )
+ data["caption"],
+ )
starttime = endtime
return
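For the center_crop_resize path, get_data_decord folds the time axis into the leading axis so the per-frame 2D resize can run, unfolds it again, and finally rescales pixel values to [-1, 1]. A numpy + einops sketch of that fold/transform/unfold trick; the resize is replaced by a no-op stand-in and the shapes are illustrative:

import numpy as np
from einops import rearrange

def per_frame_transform(frames_2d):
    # Stand-in for paddle.vision.transforms.Resize applied frame by frame.
    return frames_2d

frames = np.random.randint(0, 256, size=(3, 16, 64, 64)).astype("float32")  # (c, t, h, w)
flat = rearrange(frames, "c t h w -> (c t) h w")        # every frame becomes a plain 2D image
flat = per_frame_transform(flat)
frames = rearrange(flat, "(c t) h w -> c t h w", c=frames.shape[0])
frames = (frames / 255 - 0.5) * 2                       # same [-1, 1] normalization as the dataset
print(frames.shape)  # (3, 16, 64, 64)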
diff --git a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py
index f0ef60f1d4cfd..33a27a91410e8 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py
@@ -17,8 +17,7 @@
from ppdiffusers import LVDMUncondPipeline
# 加载模型和scheduler
-pipe = LVDMUncondPipeline.from_pretrained(
- "westfish/lvdm_short_sky_epoch2239_step150079")
+pipe = LVDMUncondPipeline.from_pretrained("westfish/lvdm_short_sky_epoch2239_step150079")
# 执行pipeline进行推理
seed = 1000
@@ -32,4 +31,5 @@
save_dir=".",
save_name="ddim_lvdm_short_sky_epoch2239_step150079",
scale_factor=0.33422927,
- shift_factor=1.4606637, )
+ shift_factor=1.4606637,
+)
diff --git a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py
index 520ee5339fbde..bbd9587186d87 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py
@@ -17,8 +17,7 @@
from ppdiffusers import LVDMTextToVideoPipeline
# 加载模型和scheduler
-pipe = LVDMTextToVideoPipeline.from_pretrained(
- "westfish/lvdm_text2video_orig_webvid_2m")
+pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m")
# 执行pipeline进行推理
seed = 2013
@@ -36,4 +35,5 @@
save_name="ddim_lvdm_text_to_video_ucf",
encoder_type="2d",
scale_factor=0.18215,
- shift_factor=0, )
+ shift_factor=0,
+)
diff --git a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py
index d562f6ff8b359..2db650c780345 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py
@@ -27,13 +27,19 @@
raise ImportError(
"OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`."
)
-from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from ppdiffusers import (
- AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, LMSDiscreteScheduler, LVDMAutoencoderKL,
- LVDMUncondPipeline, LVDMUNet3DModel, PNDMScheduler)
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ LMSDiscreteScheduler,
+ LVDMAutoencoderKL,
+ LVDMUncondPipeline,
+ LVDMUNet3DModel,
+ PNDMScheduler,
+)
paddle.set_device("cpu")
MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30
@@ -116,8 +122,7 @@ def get_data_iostream(file: str, file_name="data.pkl"):
FILENAME = f"archive/{file_name}".encode("latin")
padding_size_plus_fbxx = 4 + 14
data_iostream = []
- offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(
- FILENAME) + padding_size_plus_fbxx
+ offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx
with open(file, "rb") as r:
r.seek(offset)
for bytes_data in io.BytesIO(r.read()):
@@ -130,8 +135,7 @@ def get_data_iostream(file: str, file_name="data.pkl"):
return out, offset + len(out)
-def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad,
- backward_hooks):
+def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks):
if isinstance(storage, TensorMeta):
storage.size = size
return storage
@@ -162,7 +166,8 @@ def create_unet_diffusers_config(original_config):
padding_t=unet_params.padding_t,
temporal_length=unet_params.temporal_length,
use_relative_position=unet_params.use_relative_position,
- use_scale_shift_norm=unet_params.use_scale_shift_norm, )
+ use_scale_shift_norm=unet_params.use_scale_shift_norm,
+ )
return config
@@ -181,7 +186,8 @@ def create_lvdm_vae_diffusers_config(original_config):
padding_type=vae_params.encoder.params.padding_type,
double_z=vae_params.encoder.params.double_z,
z_channels=vae_params.encoder.params.z_channels,
- upsample=vae_params.decoder.params.upsample, )
+ upsample=vae_params.decoder.params.upsample,
+ )
return config
@@ -190,14 +196,12 @@ def create_diffusers_schedular(original_config):
num_train_timesteps=original_config.model.params.timesteps,
beta_start=original_config.model.params.linear_start,
beta_end=original_config.model.params.linear_end,
- beta_schedule="scaled_linear", )
+ beta_schedule="scaled_linear",
+ )
return schedular
-def convert_lvdm_unet_checkpoint(checkpoint,
- config,
- path=None,
- extract_ema=False):
+def convert_lvdm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -218,8 +222,7 @@ def convert_lvdm_unet_checkpoint(checkpoint,
for key in keys:
if key.startswith("model.diffusion_model"):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
print(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
@@ -251,9 +254,7 @@ def convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, config):
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint,
- dtype="float32"):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -275,8 +276,7 @@ def check_keys(model, state_dict):
if k not in state_dict.keys():
missing_keys.append(k)
if list(v.shape) != list(state_dict[k].shape):
- mismatched_keys.append(
- str((k, list(v.shape), list(state_dict[k].shape))))
+ mismatched_keys.append(str((k, list(v.shape), list(state_dict[k].shape))))
if len(missing_keys):
missing_keys_str = ", ".join(missing_keys)
print(f"{cls_name} Found missing_keys {missing_keys_str}!")
@@ -293,13 +293,15 @@ def check_keys(model, state_dict):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--vae_checkpoint_path",
default=None,
type=str,
required=False,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--original_config_file",
default=None,
@@ -325,13 +327,15 @@ def check_keys(model, state_dict):
"Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
" or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
" higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
- ), )
+ ),
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
+ help="Path to the output model.",
+ )
args = parser.parse_args()
# image_size = 512
@@ -340,15 +344,13 @@ def check_keys(model, state_dict):
vae_checkpoint = None
if args.vae_checkpoint_path:
- vae_checkpoint = torch.load(
- args.vae_checkpoint_path, map_location="cpu")
+ vae_checkpoint = torch.load(args.vae_checkpoint_path, map_location="cpu")
vae_checkpoint = vae_checkpoint.get("state_dict", vae_checkpoint)
original_config = OmegaConf.load(args.original_config_file)
if args.num_in_channels is not None:
- original_config["model"]["params"]["unet_config"]["params"][
- "in_channels"] = args.num_in_channels
+ original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels
num_train_timesteps = original_config.model.params.timesteps
beta_start = original_config.model.params.linear_start
@@ -361,7 +363,8 @@ def check_keys(model, state_dict):
num_train_timesteps=num_train_timesteps,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
# make sure scheduler works correctly with DDIM
scheduler.register_to_config(clip_sample=False)
@@ -377,15 +380,13 @@ def check_keys(model, state_dict):
elif args.scheduler_type == "euler":
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
elif args.scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- scheduler.config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
elif args.scheduler_type == "dpm":
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
elif args.scheduler_type == "ddim":
scheduler = scheduler
else:
- raise ValueError(
- f"Scheduler of type {args.scheduler_type} doesn't exist!")
+ raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
# 1. Convert the LVDMUNet3DModel model.
diffusers_unet_config = create_unet_diffusers_config(original_config)
@@ -393,26 +394,25 @@ def check_keys(model, state_dict):
checkpoint,
diffusers_unet_config,
path=args.checkpoint_path,
- extract_ema=args.extract_ema, )
+ extract_ema=args.extract_ema,
+ )
unet = LVDMUNet3DModel.from_config(diffusers_unet_config)
- ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- unet, diffusers_unet_checkpoint)
+ ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint)
check_keys(unet, ppdiffusers_unet_checkpoint)
unet.load_dict(ppdiffusers_unet_checkpoint)
# 2. Convert the LVDMAutoencoderKL model.
vae_config = create_lvdm_vae_diffusers_config(original_config)
- diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(
- checkpoint, vae_checkpoint, vae_config)
+ diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config)
vae = LVDMAutoencoderKL.from_config(vae_config)
- ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- vae, diffusers_vae_checkpoint)
+ ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint)
check_keys(vae, ppdiffusers_vae_checkpoint)
vae.load_dict(ppdiffusers_vae_checkpoint)
pipe = LVDMUncondPipeline(
vae=vae,
unet=unet,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
pipe.save_pretrained(args.dump_path)
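convert_diffusers_vae_unet_to_ppdiffusers collects the weight names of paddle.nn.Linear sublayers into need_transpose because torch stores Linear weights as (out_features, in_features) while Paddle expects (in_features, out_features). A small numpy sketch of that step; the parameter name is made up for illustration:

import numpy as np

torch_style_state = {"proj.weight": np.random.randn(768, 320), "proj.bias": np.random.randn(768)}
need_transpose = {"proj.weight"}  # every *.weight belonging to a Linear layer

paddle_style_state = {k: (v.T if k in need_transpose else v) for k, v in torch_style_state.items()}
print(paddle_style_state["proj.weight"].shape)  # (320, 768)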
diff --git a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py
index 0b09aa164dfe5..0662e05b5bcaa 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py
@@ -27,13 +27,20 @@
"OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`."
)
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from transformers import CLIPTextModel as HFCLIPTextModel
from ppdiffusers import (
- AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, LMSDiscreteScheduler, LVDMAutoencoderKL,
- LVDMTextToVideoPipeline, LVDMUNet3DModel, PNDMScheduler)
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ LMSDiscreteScheduler,
+ LVDMAutoencoderKL,
+ LVDMTextToVideoPipeline,
+ LVDMUNet3DModel,
+ PNDMScheduler,
+)
paddle.set_device("cpu")
MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30
@@ -116,8 +123,7 @@ def get_data_iostream(file: str, file_name="data.pkl"):
FILENAME = f"archive/{file_name}".encode("latin")
padding_size_plus_fbxx = 4 + 14
data_iostream = []
- offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(
- FILENAME) + padding_size_plus_fbxx
+ offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx
with open(file, "rb") as r:
r.seek(offset)
for bytes_data in io.BytesIO(r.read()):
@@ -130,8 +136,7 @@ def get_data_iostream(file: str, file_name="data.pkl"):
return out, offset + len(out)
-def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad,
- backward_hooks):
+def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks):
if isinstance(storage, TensorMeta):
storage.size = size
return storage
@@ -160,8 +165,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -191,8 +195,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -200,12 +203,13 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming
to them. It splits attention layers, and takes into account additional replacements
@@ -213,9 +217,7 @@ def assign_to_checkpoint(
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -223,13 +225,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = np.split(old_tensor, 3, axis=1)
@@ -241,8 +241,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -252,8 +251,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -297,7 +295,8 @@ def create_unet_diffusers_config(original_config):
kernel_size_t=unet_params.kernel_size_t,
padding_t=unet_params.padding_t,
temporal_length=unet_params.temporal_length,
- use_relative_position=unet_params.use_relative_position, )
+ use_relative_position=unet_params.use_relative_position,
+ )
return config
@@ -321,7 +320,8 @@ def create_vae_diffusers_config(original_config, image_size: int):
up_block_types=tuple(up_block_types),
block_out_channels=tuple(block_out_channels),
latent_channels=vae_params.z_channels,
- layers_per_block=vae_params.num_res_blocks, )
+ layers_per_block=vae_params.num_res_blocks,
+ )
return config
@@ -339,7 +339,8 @@ def create_lvdm_vae_diffusers_config(original_config):
padding_type=vae_params.encoder.params.padding_type,
double_z=vae_params.encoder.params.double_z,
z_channels=vae_params.encoder.params.z_channels,
- upsample=vae_params.decoder.params.upsample, )
+ upsample=vae_params.decoder.params.upsample,
+ )
return config
@@ -348,14 +349,12 @@ def create_diffusers_schedular(original_config):
num_train_timesteps=original_config.model.params.timesteps,
beta_start=original_config.model.params.linear_start,
beta_end=original_config.model.params.linear_end,
- beta_schedule="scaled_linear", )
+ beta_schedule="scaled_linear",
+ )
return schedular
-def convert_lvdm_unet_checkpoint(checkpoint,
- config,
- path=None,
- extract_ema=False):
+def convert_lvdm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -376,8 +375,7 @@ def convert_lvdm_unet_checkpoint(checkpoint,
for key in keys:
if key.startswith("model.diffusion_model"):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
print(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
@@ -407,107 +405,74 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config):
# new_checkpoint = vae_state_dict
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -515,58 +480,50 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -574,7 +531,8 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
@@ -595,9 +553,7 @@ def convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, config):
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint,
- dtype="float32"):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -619,8 +575,7 @@ def check_keys(model, state_dict):
if k not in state_dict.keys():
missing_keys.append(k)
elif list(v.shape) != list(state_dict[k].shape):
- mismatched_keys.append(
- str((k, list(v.shape), list(state_dict[k].shape))))
+ mismatched_keys.append(str((k, list(v.shape), list(state_dict[k].shape))))
if len(missing_keys):
missing_keys_str = ", ".join(missing_keys)
print(f"{cls_name} Found missing_keys {missing_keys_str}!")
@@ -633,7 +588,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
clip = {}
for key in checkpoint.keys():
if key.startswith("cond_stage_model.transformer"):
- clip[key[len("cond_stage_model.transformer."):]] = checkpoint[key]
+ clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
new_model_state = {}
transformers2ppnlp = {
@@ -653,9 +608,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.items():
# step1: ignore position_ids
if any(i in name for i in ignore_value):
@@ -668,16 +621,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
new_model_state[name] = value.cpu().numpy().astype(dtype)
new_config = {
- "max_text_length":
- new_model_state["text_model.positional_embedding.weight"].shape[0],
- "vocab_size":
- new_model_state["text_model.token_embedding.weight"].shape[0],
- "text_embed_dim":
- new_model_state["text_model.token_embedding.weight"].shape[1],
+ "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0],
+ "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0],
+ "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1],
"text_heads": 12,
"text_layers": 12,
"text_hidden_act": "quick_gelu",
@@ -696,19 +646,22 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--vae_checkpoint_path",
default=None,
type=str,
required=False,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--vae_type",
default="2d",
type=str,
required=False,
- help="The type of vae, chosen from [`2d `, `3d`].", )
+ help="The type of vae, chosen from [`2d `, `3d`].",
+ )
parser.add_argument(
"--original_config_file",
default=None,
@@ -734,13 +687,15 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
"Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
" or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
" higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
- ), )
+ ),
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
+ help="Path to the output model.",
+ )
args = parser.parse_args()
image_size = 512
@@ -750,15 +705,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
vae_checkpoint = None
if args.vae_checkpoint_path:
- vae_checkpoint = torch.load(
- args.vae_checkpoint_path, map_location="cpu")
+ vae_checkpoint = torch.load(args.vae_checkpoint_path, map_location="cpu")
vae_checkpoint = vae_checkpoint.get("state_dict", vae_checkpoint)
original_config = OmegaConf.load(args.original_config_file)
if args.num_in_channels is not None:
- original_config["model"]["params"]["unet_config"]["params"][
- "in_channels"] = args.num_in_channels
+ original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels
num_train_timesteps = original_config.model.params.timesteps
beta_start = original_config.model.params.linear_start
@@ -771,7 +724,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
num_train_timesteps=num_train_timesteps,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
# make sure scheduler works correctly with DDIM
scheduler.register_to_config(clip_sample=False)
@@ -786,15 +740,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
elif args.scheduler_type == "euler":
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
elif args.scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- scheduler.config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
elif args.scheduler_type == "dpm":
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
elif args.scheduler_type == "ddim":
scheduler = scheduler
else:
- raise ValueError(
- f"Scheduler of type {args.scheduler_type} doesn't exist!")
+ raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
# 1. Convert the LVDMUNet3DModel model.
diffusers_unet_config = create_unet_diffusers_config(original_config)
@@ -802,46 +754,41 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
checkpoint,
diffusers_unet_config,
path=args.checkpoint_path,
- extract_ema=args.extract_ema, )
+ extract_ema=args.extract_ema,
+ )
unet = LVDMUNet3DModel.from_config(diffusers_unet_config)
- ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- unet, diffusers_unet_checkpoint)
+ ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint)
check_keys(unet, ppdiffusers_unet_checkpoint)
unet.load_dict(ppdiffusers_unet_checkpoint)
# 2. Convert the AutoencoderKL model.
if args.vae_type == "2d":
- vae_config = create_vae_diffusers_config(
- original_config, image_size=image_size)
- diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(
- checkpoint, vae_checkpoint, vae_config)
+ vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+ diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config)
vae = AutoencoderKL.from_config(vae_config)
else:
vae_config = create_lvdm_vae_diffusers_config(original_config)
- diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(
- checkpoint, vae_checkpoint, vae_config)
+ diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config)
vae = LVDMAutoencoderKL.from_config(vae_config)
- ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- vae, diffusers_vae_checkpoint)
+ ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint)
check_keys(vae, ppdiffusers_vae_checkpoint)
vae.load_dict(ppdiffusers_vae_checkpoint)
# 3. Convert the text model.
- text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(
- checkpoint, dtype="float32")
+ text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32")
text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_config))
text_encoder.eval()
check_keys(text_encoder, text_model_state_dict)
text_encoder.load_dict(text_model_state_dict)
# 4. load tokenizer.
- pp_tokenizer = CLIPTokenizer.from_pretrained(
- "openai/clip-vit-large-patch14")
+ pp_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
pipe = LVDMTextToVideoPipeline(
vae=vae,
text_encoder=text_encoder,
tokenizer=pp_tokenizer,
unet=unet,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
pipe.save_pretrained(args.dump_path)
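
Side note on the conversion helper reflowed above: the visible part of `convert_diffusers_vae_unet_to_ppdiffusers` collects the model's `paddle.nn.Linear` sublayers, and the usual reason for doing so is that torch-style state dicts store Linear weights as `(out_features, in_features)` while Paddle's `Linear` expects `(in_features, out_features)`. A minimal sketch of that transpose step, under that assumption (the helper name and traversal below are illustrative, not the PR's exact code):

```python
import numpy as np
import paddle


def to_paddle_state(paddle_model, torch_state, dtype="float32"):
    # Names of Linear weights, which need transposing when copying a
    # torch-style state dict into a Paddle layer.
    need_transpose = {
        name + ".weight"
        for name, layer in paddle_model.named_sublayers(include_self=True)
        if isinstance(layer, paddle.nn.Linear)
    }
    new_state = {}
    for name, tensor in torch_state.items():
        array = tensor.cpu().numpy().astype(dtype)
        # torch Linear: (out_features, in_features); paddle Linear: (in_features, out_features)
        new_state[name] = array.T if name in need_transpose else array
    return new_state
```

Everything that is not a Linear weight (conv kernels, norms, biases) is copied unchanged, which is why the converter only tracks Linear sublayers.
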
diff --git a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py
index 967fa9cd80f36..2eba6ece4b713 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py
@@ -16,34 +16,40 @@
import os
import paddle
-from lvdm import (LatentVideoDiffusion, LatentVideoDiffusionTrainer,
- VideoFrameDataset)
-from lvdm.lvdm_args_short import (ModelArguments, TrainerArguments,
- VideoFrameDatasetArguments)
-from paddlenlp.trainer import (PdArgumentParser, TrainingArguments,
- get_last_checkpoint)
+from lvdm import LatentVideoDiffusion, LatentVideoDiffusionTrainer, VideoFrameDataset
+from lvdm.lvdm_args_short import (
+ ModelArguments,
+ TrainerArguments,
+ VideoFrameDatasetArguments,
+)
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint
from paddlenlp.utils.log import logger
def main():
- parser = PdArgumentParser((
- ModelArguments,
- VideoFrameDatasetArguments,
- TrainerArguments,
- TrainingArguments, ))
+ parser = PdArgumentParser(
+ (
+ ModelArguments,
+ VideoFrameDatasetArguments,
+ TrainerArguments,
+ TrainingArguments,
+ )
+ )
(
model_args,
data_args,
trainer_args,
- training_args, ) = parser.parse_args_into_dataclasses()
+ training_args,
+ ) = parser.parse_args_into_dataclasses()
# report to custom_visualdl
training_args.report_to = ["custom_visualdl"]
training_args.resolution = data_args.resolution
training_args.image_logging_steps = trainer_args.image_logging_steps = (
- (math.ceil(trainer_args.image_logging_steps /
- training_args.logging_steps) * training_args.logging_steps)
- if trainer_args.image_logging_steps > 0 else -1)
+ (math.ceil(trainer_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps)
+ if trainer_args.image_logging_steps > 0
+ else -1
+ )
training_args.print_config(model_args, "Model")
training_args.print_config(trainer_args, "Trainer")
@@ -53,16 +59,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -78,7 +82,8 @@ def main():
subset_split=data_args.train_subset_split,
spatial_transform=data_args.spatial_transform,
clip_step=data_args.clip_step,
- temporal_transform=data_args.temporal_transform, )
+ temporal_transform=data_args.temporal_transform,
+ )
eval_dataset = VideoFrameDataset(
data_root=data_args.eval_data_root,
resolution=data_args.resolution,
@@ -87,13 +92,15 @@ def main():
subset_split=data_args.eval_subset_split,
spatial_transform=data_args.spatial_transform,
clip_step=data_args.clip_step,
- temporal_transform=data_args.temporal_transform, )
+ temporal_transform=data_args.temporal_transform,
+ )
trainer = LatentVideoDiffusionTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
- eval_dataset=eval_dataset, )
+ eval_dataset=eval_dataset,
+ )
# must set recompute after trainer init
trainer.model.set_recompute(training_args.recompute)
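
One small piece of logic shared by both LVDM training entry points that is easy to misread after the reflow: `image_logging_steps` is rounded up to the nearest multiple of `logging_steps`, or disabled with `-1` when it is not positive, so image logging always lands on an existing logging boundary. A tiny worked example with made-up values:

```python
import math

logging_steps = 50          # illustrative value for training_args.logging_steps
image_logging_steps = 120   # illustrative value for trainer_args.image_logging_steps

aligned = (
    math.ceil(image_logging_steps / logging_steps) * logging_steps
    if image_logging_steps > 0
    else -1
)
print(aligned)  # 150: the next logging step at or above the requested value
```
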
diff --git a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py
index 4959f59c1b1a6..f7a04f62abb77 100644
--- a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py
+++ b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py
@@ -16,31 +16,33 @@
import os
import paddle
-from lvdm import (LatentVideoDiffusion, LatentVideoDiffusionTrainer,
- WebVidDataset)
-from lvdm.lvdm_args_text2video import (ModelArguments, TrainerArguments,
- WebVidDatasetArguments)
-from paddlenlp.trainer import (PdArgumentParser, TrainingArguments,
- get_last_checkpoint)
+from lvdm import LatentVideoDiffusion, LatentVideoDiffusionTrainer, WebVidDataset
+from lvdm.lvdm_args_text2video import (
+ ModelArguments,
+ TrainerArguments,
+ WebVidDatasetArguments,
+)
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint
from paddlenlp.utils.log import logger
def main():
- parser = PdArgumentParser((ModelArguments, WebVidDatasetArguments,
- TrainerArguments, TrainingArguments))
+ parser = PdArgumentParser((ModelArguments, WebVidDatasetArguments, TrainerArguments, TrainingArguments))
(
model_args,
data_args,
trainer_args,
- training_args, ) = parser.parse_args_into_dataclasses()
+ training_args,
+ ) = parser.parse_args_into_dataclasses()
# report to custom_visualdl
training_args.report_to = ["custom_visualdl"]
training_args.resolution = data_args.resolution
training_args.image_logging_steps = trainer_args.image_logging_steps = (
- (math.ceil(trainer_args.image_logging_steps /
- training_args.logging_steps) * training_args.logging_steps)
- if trainer_args.image_logging_steps > 0 else -1)
+ (math.ceil(trainer_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps)
+ if trainer_args.image_logging_steps > 0
+ else -1
+ )
training_args.print_config(model_args, "Model")
training_args.print_config(trainer_args, "Trainer")
@@ -50,16 +52,14 @@ def main():
# Detecting last checkpoint.
last_checkpoint = None
- if (os.path.isdir(training_args.output_dir) and training_args.do_train and
- not training_args.overwrite_output_dir):
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(
- os.listdir(training_args.output_dir)) > 0:
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome.")
- elif (last_checkpoint is not None and
- training_args.resume_from_checkpoint is None):
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
@@ -74,7 +74,8 @@ def main():
video_length=data_args.video_length,
frame_stride=data_args.frame_stride,
spatial_transform=data_args.spatial_transform,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
eval_dataset = WebVidDataset(
data_root=data_args.eval_data_root,
annotation_path=data_args.eval_annotation_path,
@@ -83,14 +84,16 @@ def main():
video_length=data_args.video_length,
frame_stride=data_args.frame_stride,
spatial_transform=data_args.spatial_transform,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
trainer = LatentVideoDiffusionTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset if training_args.do_eval else None,
- tokenizer=model.tokenizer, )
+ tokenizer=model.tokenizer,
+ )
# must set recompute after trainer init
trainer.model.set_recompute(training_args.recompute)
diff --git a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py
index 32134c2808903..26c629dff52ae 100644
--- a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py
+++ b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py
@@ -29,10 +29,10 @@
import paddle.nn as nn
import paddle.nn.functional as F
from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from paddle.distributed.fleet.utils.hybrid_parallel_util import \
- fused_allreduce_gradients
-from paddle.io import (BatchSampler, DataLoader, Dataset,
- DistributedBatchSampler)
+from paddle.distributed.fleet.utils.hybrid_parallel_util import (
+ fused_allreduce_gradients,
+)
+from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler
from paddle.optimizer import AdamW
from paddle.vision.transforms import RandomHorizontalFlip
from paddlenlp.trainer import set_seed
@@ -41,27 +41,30 @@
from PIL import Image
from tqdm.auto import tqdm
-from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline,
- DPMSolverMultistepScheduler, UNet2DConditionModel,
- is_ppxformers_available)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDPMScheduler,
+ DiffusionPipeline,
+ DPMSolverMultistepScheduler,
+ UNet2DConditionModel,
+ is_ppxformers_available,
+)
from ppdiffusers.optimization import get_scheduler
-from ppdiffusers.training_utils import (freeze_params, unfreeze_params,
- unwrap_model)
+from ppdiffusers.training_utils import freeze_params, unfreeze_params, unwrap_model
from ppdiffusers.utils import PIL_INTERPOLATION, check_min_version
check_min_version("0.16.1")
def url_or_path_join(*path_list):
- return (os.path.join(*path_list)
- if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list))
+ return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)
-def import_model_class_from_model_name_or_path(
- pretrained_model_name_or_path: str):
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
try:
text_encoder_config = PretrainedConfig.from_pretrained(
- url_or_path_join(pretrained_model_name_or_path, "text_encoder"))
+ url_or_path_join(pretrained_model_name_or_path, "text_encoder")
+ )
model_class = text_encoder_config.architectures[0]
except Exception:
model_class = "LDMBertModel"
@@ -70,8 +73,9 @@ def import_model_class_from_model_name_or_path(
return CLIPTextModel
elif model_class == "RobertaSeriesModelWithTransformation":
- from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \
- RobertaSeriesModelWithTransformation
+ from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+ RobertaSeriesModelWithTransformation,
+ )
return RobertaSeriesModelWithTransformation
elif model_class == "BertModel":
@@ -79,8 +83,9 @@ def import_model_class_from_model_name_or_path(
return BertModel
elif model_class == "LDMBertModel":
- from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
+ from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import (
+ LDMBertModel,
+ )
return LDMBertModel
else:
@@ -118,25 +123,28 @@ def get_report_to(args):
def save_progress(text_encoder, placeholder_token_ids, args, save_path):
logger.info("Saving embeddings")
learned_embeds = (
- unwrap_model(text_encoder).get_input_embeddings()
- .weight[min(placeholder_token_ids):max(placeholder_token_ids) + 1])
+ unwrap_model(text_encoder)
+ .get_input_embeddings()
+ .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1]
+ )
learned_embeds_dict = {args.placeholder_token: learned_embeds.detach()}
paddle.save(learned_embeds_dict, save_path)
def parse_args():
- parser = argparse.ArgumentParser(
- description="Simple example of a training script.")
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
parser.add_argument(
"--save_steps",
type=int,
default=500,
- help="Save learned_embeds.pdparams every X updates steps.", )
+ help="Save learned_embeds.pdparams every X updates steps.",
+ )
parser.add_argument(
"--only_save_embeds",
action="store_true",
default=True,
- help="Save only the embeddings for the new concept.", )
+ help="Save only the embeddings for the new concept.",
+ )
parser.add_argument(
"--num_vectors",
type=int,
@@ -161,70 +169,79 @@ def parse_args():
type=str,
default=None,
required=True,
- help="A folder containing the training data.", )
+ help="A folder containing the training data.",
+ )
parser.add_argument(
"--placeholder_token",
type=str,
default=None,
required=True,
- help="A token to use as a placeholder for the concept.", )
+ help="A token to use as a placeholder for the concept.",
+ )
parser.add_argument(
"--initializer_token",
type=str,
default=None,
required=True,
- help="A token to use as initializer word.", )
+ help="A token to use as initializer word.",
+ )
parser.add_argument(
"--learnable_property",
type=str,
default="object",
- help="Choose between 'object' and 'style'", )
+ help="Choose between 'object' and 'style'",
+ )
parser.add_argument(
"--repeats",
type=int,
default=100,
- help="How many times to repeat the training data.", )
+ help="How many times to repeat the training data.",
+ )
parser.add_argument(
"--output_dir",
type=str,
default="text-inversion-model",
help="The output directory where the model predictions and checkpoints will be written.",
)
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training.")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--height",
type=int,
default=None,
help=(
"The height for input images, all the images in the train/validation dataset will be resized to this"
- " height"), )
+ " height"
+ ),
+ )
parser.add_argument(
"--width",
type=int,
default=None,
help=(
"The width for input images, all the images in the train/validation dataset will be resized to this"
- " width"), )
+ " width"
+ ),
+ )
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--center_crop",
action="store_true",
- help="Whether to center crop images before resizing to resolution.", )
+ help="Whether to center crop images before resizing to resolution.",
+ )
parser.add_argument(
"--train_batch_size",
type=int,
default=16,
- help="Batch size (per device) for the training dataloader.", )
+ help="Batch size (per device) for the training dataloader.",
+ )
parser.add_argument("--num_train_epochs", type=int, default=100)
parser.add_argument(
"--max_train_steps",
@@ -261,19 +278,23 @@ def parse_args():
default="constant",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'), )
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help=(
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ), )
+ ),
+ )
parser.add_argument(
"--lr_warmup_steps",
type=int,
default=500,
- help="Number of steps for the warmup in the lr scheduler.", )
+ help="Number of steps for the warmup in the lr scheduler.",
+ )
parser.add_argument(
"--lr_num_cycles",
type=int,
@@ -284,38 +305,39 @@ def parse_args():
"--lr_power",
type=float,
default=1.0,
- help="Power factor of the polynomial scheduler.", )
+ help="Power factor of the polynomial scheduler.",
+ )
parser.add_argument(
"--adam_beta1",
type=float,
default=0.9,
- help="The beta1 parameter for the Adam optimizer.", )
+ help="The beta1 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_beta2",
type=float,
default=0.999,
- help="The beta2 parameter for the Adam optimizer.", )
- parser.add_argument(
- "--adam_weight_decay",
- type=float,
- default=1e-2,
- help="Weight decay to use.")
+ help="The beta2 parameter for the Adam optimizer.",
+ )
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
parser.add_argument(
"--adam_epsilon",
type=float,
default=1e-08,
- help="Epsilon value for the Adam optimizer", )
- parser.add_argument(
- "--max_grad_norm", default=-1, type=float, help="Max gradient norm.")
+ help="Epsilon value for the Adam optimizer",
+ )
+ parser.add_argument("--max_grad_norm", default=-1, type=float, help="Max gradient norm.")
parser.add_argument(
"--push_to_hub",
action="store_true",
- help="Whether or not to push the model to the Hub.", )
+ help="Whether or not to push the model to the Hub.",
+ )
parser.add_argument(
"--hub_token",
type=str,
default=None,
- help="The token to use to push to the Model Hub.", )
+ help="The token to use to push to the Model Hub.",
+ )
parser.add_argument(
"--hub_model_id",
type=str,
@@ -328,19 +350,24 @@ def parse_args():
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to"
- "*output_dir/logs"), )
+ "*output_dir/logs"
+ ),
+ )
parser.add_argument(
"--report_to",
type=str,
default="visualdl",
help=(
'The integration to report the results and logs to. Supported platforms are `"visualdl"`'
- ' (default), `"tensorboard"`.'), )
+ ' (default), `"tensorboard"`.'
+ ),
+ )
parser.add_argument(
"--language",
default="en",
choices=["en", "zh", "zh_en"],
- help="Model language.", )
+ help="Model language.",
+ )
parser.add_argument(
"--validation_prompt",
type=str,
@@ -360,16 +387,15 @@ def parse_args():
help=(
"Run validation every X epochs. Validation consists of running the prompt"
" `args.validation_prompt` multiple times: `args.num_validation_images`"
- " and logging the images."), )
+ " and logging the images."
+ ),
+ )
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether or not to use xformers.", )
- parser.add_argument(
- "--noise_offset",
- type=float,
- default=0,
- help="The scale of noise offset.")
+ help="Whether or not to use xformers.",
+ )
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
args = parser.parse_args()
@@ -379,9 +405,7 @@ def parse_args():
if args.language == "en":
if "chinese-en" in args.pretrained_model_name_or_path.lower():
args.language = "zh_en"
- logger.info(
- "Detect Chinese-English Model, we will set language to 'zh_en'. "
- )
+ logger.info("Detect Chinese-English Model, we will set language to 'zh_en'. ")
elif "chinese" in args.pretrained_model_name_or_path.lower():
args.language = "zh"
logger.info("Detect Chinese Model, we will set language to 'zh'. ")
@@ -486,19 +510,20 @@ def parse_args():
class TextualInversionDataset(Dataset):
def __init__(
- self,
- data_root,
- tokenizer,
- learnable_property="object", # [object, style]
- height=512,
- width=512,
- repeats=100,
- interpolation="bicubic",
- flip_p=0.5,
- set="train",
- placeholder_token="*",
- center_crop=False,
- language="en", ):
+ self,
+ data_root,
+ tokenizer,
+ learnable_property="object", # [object, style]
+ height=512,
+ width=512,
+ repeats=100,
+ interpolation="bicubic",
+ flip_p=0.5,
+ set="train",
+ placeholder_token="*",
+ center_crop=False,
+ language="en",
+ ):
self.data_root = data_root
self.tokenizer = tokenizer
self.learnable_property = learnable_property
@@ -514,8 +539,7 @@ def __init__(
ext = ["png", "jpg", "jpeg", "bmp", "PNG", "JPG", "JPEG", "BMP"]
self.image_paths = []
for e in ext:
- self.image_paths.extend(
- glob.glob(os.path.join(data_root, "*." + e)))
+ self.image_paths.extend(glob.glob(os.path.join(data_root, "*." + e)))
self.num_images = len(self.image_paths)
self._length = self.num_images
@@ -562,7 +586,8 @@ def __getitem__(self, i):
padding="max_length",
truncation=True,
max_length=self.tokenizer.model_max_length,
- return_attention_mask=False, ).input_ids
+ return_attention_mask=False,
+ ).input_ids
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
@@ -571,13 +596,12 @@ def __getitem__(self, i):
crop = min(img.shape[0], img.shape[1])
h, w, = (
img.shape[0],
- img.shape[1], )
- img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(w + crop
- ) // 2]
+ img.shape[1],
+ )
+ img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
image = Image.fromarray(img)
- image = image.resize(
- (self.width, self.height), resample=self.interpolation)
+ image = image.resize((self.width, self.height), resample=self.interpolation)
image = self.flip_transform(image)
image = np.array(image).astype(np.uint8)
@@ -587,9 +611,7 @@ def __getitem__(self, i):
return example
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -618,16 +640,13 @@ def main():
os.makedirs(args.output_dir, exist_ok=True)
if args.push_to_hub:
if args.hub_model_id is None:
- repo_name = get_full_repo_name(
- Path(args.output_dir).name, token=args.hub_token)
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
create_repo(repo_name, exist_ok=True, token=args.hub_token)
- repo = Repository(
- args.output_dir, clone_from=repo_name, token=args.hub_token)
+ repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
- with open(os.path.join(args.output_dir, ".gitignore"),
- "w+") as gitignore:
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
@@ -638,18 +657,14 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
elif args.pretrained_model_name_or_path:
# support windows "\"
- tokenizer = AutoTokenizer.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
+ tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer"))
# Load scheduler and models
- noise_scheduler = DDPMScheduler.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="scheduler")
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
# Add the placeholder token in tokenizer
placeholder_tokens = [args.placeholder_token]
if args.num_vectors < 1:
- raise ValueError(
- f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}"
- )
+ raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}")
# add dummy tokens for multi-vector
additional_tokens = []
@@ -661,33 +676,28 @@ def main():
if num_added_tokens != args.num_vectors:
raise ValueError(
f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
- " `placeholder_token` that is not already in the tokenizer.")
+ " `placeholder_token` that is not already in the tokenizer."
+ )
# Convert the initializer_token, placeholder_token to ids
- initializer_token_ids = tokenizer.encode(
- args.initializer_token, add_special_tokens=False)["input_ids"]
+ initializer_token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)["input_ids"]
if len(initializer_token_ids) < 1:
- raise ValueError(
- "The initializer token must be a greater equal than one.")
+ raise ValueError("The initializer token must be a greater equal than one.")
placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens)
- text_encoder_cls = import_model_class_from_model_name_or_path(
- args.pretrained_model_name_or_path)
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
text_encoder = text_encoder_cls.from_pretrained(
- url_or_path_join(args.pretrained_model_name_or_path, "text_encoder"))
- text_config = (text_encoder.config if isinstance(text_encoder.config, dict)
- else text_encoder.config.to_dict())
- if (text_config.get("use_attention_mask", None) is not None and
- text_config["use_attention_mask"]):
+ url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")
+ )
+ text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict()
+ if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]:
use_attention_mask = True
else:
use_attention_mask = False
- vae = AutoencoderKL.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="vae")
- unet = UNet2DConditionModel.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="unet")
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+ unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
# Resize the token embeddings as we are adding new special tokens to the tokenizer
text_encoder.resize_token_embeddings(len(tokenizer))
@@ -698,8 +708,8 @@ def main():
# we will compute mean
for token_id in placeholder_token_ids:
token_embeds.weight[token_id] = paddle.stack(
- [token_embeds.weight[each]
- for each in initializer_token_ids]).mean(0)
+ [token_embeds.weight[each] for each in initializer_token_ids]
+ ).mean(0)
# Freeze vae and unet
freeze_params(vae.parameters())
@@ -712,14 +722,14 @@ def main():
# unet.enable_gradient_checkpointing()
set_recompute(text_encoder, True)
- if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(
- ):
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
unet.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
train_dataset = TextualInversionDataset(
data_root=args.train_data_dir,
@@ -732,71 +742,66 @@ def main():
center_crop=args.center_crop,
set="train",
language=args.language,
- interpolation="bilinear", )
+ interpolation="bilinear",
+ )
def collate_fn(examples):
input_ids = [example["input_ids"] for example in examples]
- pixel_values = paddle.to_tensor(
- [example["pixel_values"] for example in examples], dtype="float32")
+ pixel_values = paddle.to_tensor([example["pixel_values"] for example in examples], dtype="float32")
input_ids = tokenizer.pad(
- {
- "input_ids": input_ids
- },
+ {"input_ids": input_ids},
padding="max_length",
max_length=tokenizer.model_max_length,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
return {
"input_ids": input_ids,
"pixel_values": pixel_values,
}
- train_sampler = (DistributedBatchSampler(
- train_dataset, batch_size=args.train_batch_size, shuffle=True)
- if num_processes > 1 else BatchSampler(
- train_dataset,
- batch_size=args.train_batch_size,
- shuffle=True))
+ train_sampler = (
+ DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ if num_processes > 1
+ else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+ )
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
# Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps)
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps /
- num_update_steps_per_epoch)
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
if args.scale_lr:
- args.learning_rate = (args.learning_rate *
- args.gradient_accumulation_steps *
- args.train_batch_size * num_processes)
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes
+ )
# Initialize the lr_scheduler
lr_scheduler = get_scheduler(
args.lr_scheduler,
learning_rate=args.learning_rate,
- num_warmup_steps=args.lr_warmup_steps *
- args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps *
- args.gradient_accumulation_steps,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
num_cycles=args.lr_num_cycles,
- power=args.lr_power, )
+ power=args.lr_power,
+ )
# Initialize the optimizer
optimizer = AdamW(
learning_rate=lr_scheduler,
- parameters=text_encoder.get_input_embeddings().parameters(
- ), # only optimize the embeddings
+ parameters=text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings
beta1=args.adam_beta1,
beta2=args.adam_beta2,
weight_decay=args.adam_weight_decay,
epsilon=args.adam_epsilon,
- grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
- if args.max_grad_norm > 0 else None, )
+ grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None,
+ )
if num_processes > 1:
text_encoder = paddle.DataParallel(text_encoder)
@@ -809,35 +814,27 @@ def collate_fn(examples):
writer = get_report_to(args)
# Train!
- total_batch_size = (args.train_batch_size * num_processes *
- args.gradient_accumulation_steps)
+ total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not is_main_process)
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process)
progress_bar.set_description("Train Steps")
global_step = 0
# keep original embeddings as reference
- orig_embeds_params = (
- unwrap_model(text_encoder).get_input_embeddings().weight.clone())
+ orig_embeds_params = unwrap_model(text_encoder).get_input_embeddings().weight.clone()
- index_no_updates = paddle.ones((len(tokenizer), ), dtype=paddle.bool)
- index_no_updates[min(placeholder_token_ids):max(placeholder_token_ids) +
- 1] = False
+ index_no_updates = paddle.ones((len(tokenizer),), dtype=paddle.bool)
+ index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False
index_no_updates = index_no_updates.cast("int64").sum()
# Keep vae and unet in eval model as we don't train these
vae.eval()
@@ -855,20 +852,19 @@ def collate_fn(examples):
if args.noise_offset:
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
noise += args.noise_offset * paddle.randn(
- (latents.shape[0], latents.shape[1], 1, 1),
- dtype=latents.dtype)
+ (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype
+ )
batch_size = latents.shape[0]
# Sample a random timestep for each image
- timesteps = paddle.randint(
- 0, noise_scheduler.config.num_train_timesteps,
- (batch_size, )).cast("int64")
+ timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64")
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
- if num_processes > 1 and (args.gradient_checkpointing or (
- (step + 1) % args.gradient_accumulation_steps != 0)):
+ if num_processes > 1 and (
+ args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0)
+ ):
# grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0:
# gradient_checkpointing, no_sync every where
# gradient_checkpointing + grad_acc, no_sync every where
@@ -876,35 +872,29 @@ def collate_fn(examples):
text_encoder_ctx_manager = text_encoder.no_sync()
else:
# unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
- text_encoder_ctx_manager = (contextlib.nullcontext()
- if sys.version_info >= (3, 7) else
- contextlib.suppress())
+ text_encoder_ctx_manager = (
+ contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+ )
with text_encoder_ctx_manager:
# Get the text embedding for conditioning
if use_attention_mask:
- attention_mask = (batch["input_ids"] !=
- tokenizer.pad_token_id).cast("int64")
+ attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64")
else:
attention_mask = None
- encoder_hidden_states = text_encoder(
- batch["input_ids"], attention_mask=attention_mask)[0]
+ encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0]
# with unet_ctx_manager:
# Predict the noise or sample
- model_pred = unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise,
- timesteps)
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
- raise ValueError(
- f"Unknown prediction type {noise_scheduler.config.prediction_type}"
- )
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
loss = F.mse_loss(model_pred, target, reduction="mean")
if args.gradient_accumulation_steps > 1:
@@ -914,18 +904,17 @@ def collate_fn(examples):
if (step + 1) % args.gradient_accumulation_steps == 0:
if num_processes > 1 and args.gradient_checkpointing:
fused_allreduce_gradients(
- unwrap_model(text_encoder).get_input_embeddings()
- .parameters(),
- None, )
+ unwrap_model(text_encoder).get_input_embeddings().parameters(),
+ None,
+ )
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
# Let's make sure we don't update any embedding weights besides the newly added token
with paddle.no_grad():
- unwrap_model(text_encoder).get_input_embeddings(
- ).weight[:
- index_no_updates] = orig_embeds_params[:
- index_no_updates]
+ unwrap_model(text_encoder).get_input_embeddings().weight[:index_no_updates] = orig_embeds_params[
+ :index_no_updates
+ ]
progress_bar.update(1)
global_step += 1
@@ -945,19 +934,19 @@ def collate_fn(examples):
if global_step % args.save_steps == 0:
save_path = os.path.join(
args.output_dir,
- f"learned_embeds-steps-{global_step}.pdparams", )
- save_progress(text_encoder, placeholder_token_ids, args,
- save_path)
+ f"learned_embeds-steps-{global_step}.pdparams",
+ )
+ save_progress(text_encoder, placeholder_token_ids, args, save_path)
if global_step >= args.max_train_steps:
break
if is_main_process:
- if (args.validation_prompt is not None and
- epoch % args.validation_epochs == 0):
+ if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
logger.info(
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
- f" {args.validation_prompt}.")
+ f" {args.validation_prompt}."
+ )
# create pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
@@ -965,29 +954,27 @@ def collate_fn(examples):
tokenizer=tokenizer,
paddle_dtype=paddle_dtype,
safety_checker=None,
- requires_safety_checker=False, )
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
- pipeline.scheduler.config)
+ requires_safety_checker=False,
+ )
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
pipeline.set_progress_bar_config(disable=True)
# run inference
- generator = (paddle.Generator().manual_seed(args.seed)
- if args.seed else None)
+ generator = paddle.Generator().manual_seed(args.seed) if args.seed else None
images = [
pipeline(
args.validation_prompt,
num_inference_steps=25,
- generator=generator, ).images[0]
+ generator=generator,
+ ).images[0]
for _ in range(args.num_validation_images)
]
np_images = np.stack([np.asarray(img) for img in images])
if args.report_to == "tensorboard":
- writer.add_images(
- "test", np_images, epoch, dataformats="NHWC")
+ writer.add_images("test", np_images, epoch, dataformats="NHWC")
else:
- writer.add_image(
- "test", np_images, epoch, dataformats="NHWC")
+ writer.add_image("test", np_images, epoch, dataformats="NHWC")
del pipeline
gc.collect()
@@ -998,9 +985,7 @@ def collate_fn(examples):
if is_main_process:
writer.close()
if args.push_to_hub and args.only_save_embeds:
- logger.warn(
- "Enabling full model saving because --push_to_hub=True was specified."
- )
+ logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
save_full_model = True
else:
save_full_model = not args.only_save_embeds
@@ -1008,17 +993,15 @@ def collate_fn(examples):
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
text_encoder=unwrap_model(text_encoder),
- tokenizer=tokenizer, )
+ tokenizer=tokenizer,
+ )
pipeline.save_pretrained(args.output_dir)
# Save the newly trained embeddings
save_path = os.path.join(args.output_dir, "learned_embeds.pdparams")
save_progress(text_encoder, placeholder_token_ids, args, save_path)
if args.push_to_hub:
- repo.push_to_hub(
- commit_message="End of training",
- blocking=False,
- auto_lfs_prune=True)
+ repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
if __name__ == "__main__":
diff --git a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py
index a157a1f5c1f04..80af56cbf7391 100644
--- a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py
+++ b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py
@@ -73,8 +73,7 @@ def get_report_to(args):
def parse_args():
- parser = argparse.ArgumentParser(
- description="Simple example of a training script.")
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
parser.add_argument(
"--dataset_name",
type=str,
@@ -83,7 +82,8 @@ def parse_args():
"The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
" dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
" or to a folder containing files that HF Datasets can understand."
- ), )
+ ),
+ )
parser.add_argument(
"--dataset_config_name",
type=str,
@@ -104,7 +104,8 @@ def parse_args():
"A folder containing the training data. Folder contents must follow the structure described in"
" https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
" must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
- ), )
+ ),
+ )
parser.add_argument(
"--output_dir",
type=str,
@@ -124,7 +125,9 @@ def parse_args():
default=64,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
- " resolution"), )
+ " resolution"
+ ),
+ )
parser.add_argument(
"--center_crop",
default=False,
@@ -132,40 +135,48 @@ def parse_args():
help=(
"Whether to center crop the input images to the resolution. If not set, the images will be randomly"
" cropped. The images will be resized to the resolution first before cropping."
- ), )
+ ),
+ )
parser.add_argument(
"--random_flip",
default=False,
action="store_true",
- help="whether to randomly flip images horizontally", )
+ help="whether to randomly flip images horizontally",
+ )
parser.add_argument(
"--train_batch_size",
type=int,
default=16,
- help="Batch size (per device) for the training dataloader.", )
+ help="Batch size (per device) for the training dataloader.",
+ )
parser.add_argument(
"--eval_batch_size",
type=int,
default=16,
- help="The number of images to generate for evaluation.", )
+ help="The number of images to generate for evaluation.",
+ )
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help=(
"The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main"
- " process."), )
+ " process."
+ ),
+ )
parser.add_argument("--num_epochs", type=int, default=100)
parser.add_argument(
"--save_images_epochs",
type=int,
default=10,
- help="How often to save images during training.", )
+ help="How often to save images during training.",
+ )
parser.add_argument(
"--save_model_epochs",
type=int,
default=10,
- help="How often to save the model during training.", )
+ help="How often to save the model during training.",
+ )
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
@@ -184,34 +195,40 @@ def parse_args():
default="cosine",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'), )
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
parser.add_argument(
"--lr_warmup_steps",
type=int,
default=500,
- help="Number of steps for the warmup in the lr scheduler.", )
+ help="Number of steps for the warmup in the lr scheduler.",
+ )
parser.add_argument(
"--adam_beta1",
type=float,
default=0.95,
- help="The beta1 parameter for the Adam optimizer.", )
+ help="The beta1 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_beta2",
type=float,
default=0.999,
- help="The beta2 parameter for the Adam optimizer.", )
+ help="The beta2 parameter for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_weight_decay",
type=float,
default=1e-6,
- help="Weight decay magnitude for the Adam optimizer.", )
+ help="Weight decay magnitude for the Adam optimizer.",
+ )
parser.add_argument(
"--adam_epsilon",
type=float,
default=1e-08,
- help="Epsilon value for the Adam optimizer.", )
- parser.add_argument(
- "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ help="Epsilon value for the Adam optimizer.",
+ )
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--use_ema",
action="store_true",
@@ -221,26 +238,31 @@ def parse_args():
"--ema_inv_gamma",
type=float,
default=1.0,
- help="The inverse gamma value for the EMA decay.", )
+ help="The inverse gamma value for the EMA decay.",
+ )
parser.add_argument(
"--ema_power",
type=float,
default=3 / 4,
- help="The power value for the EMA decay.", )
+ help="The power value for the EMA decay.",
+ )
parser.add_argument(
"--ema_max_decay",
type=float,
default=0.9999,
- help="The maximum decay magnitude for EMA.", )
+ help="The maximum decay magnitude for EMA.",
+ )
parser.add_argument(
"--push_to_hub",
action="store_true",
- help="Whether or not to push the model to the Hub.", )
+ help="Whether or not to push the model to the Hub.",
+ )
parser.add_argument(
"--hub_token",
type=str,
default=None,
- help="The token to use to push to the Model Hub.", )
+ help="The token to use to push to the Model Hub.",
+ )
parser.add_argument(
"--hub_model_id",
type=str,
@@ -250,7 +272,8 @@ def parse_args():
parser.add_argument(
"--hub_private_repo",
action="store_true",
- help="Whether or not to create a private repository.", )
+ help="Whether or not to create a private repository.",
+ )
parser.add_argument(
"--logger",
type=str,
@@ -259,14 +282,17 @@ def parse_args():
help=(
"Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)"
" for experiment tracking and logging of model metrics and model checkpoints"
- ), )
+ ),
+ )
parser.add_argument(
"--logging_dir",
type=str,
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
- " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."), )
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+ ),
+ )
parser.add_argument(
"--prediction_type",
type=str,
@@ -283,7 +309,9 @@ def parse_args():
default=500,
help=(
"Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
- " training using `--resume_from_checkpoint`."), )
+ " training using `--resume_from_checkpoint`."
+ ),
+ )
parser.add_argument(
"--checkpoints_total_limit",
type=int,
@@ -291,29 +319,24 @@ def parse_args():
help=(
"Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
" See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
- " for more docs"), )
- parser.add_argument(
- "--seed",
- type=int,
- default=None,
- help="A seed for reproducible training.")
+ " for more docs"
+ ),
+ )
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--enable_xformers_memory_efficient_attention",
action="store_true",
- help="Whether or not to use xformers.", )
+ help="Whether or not to use xformers.",
+ )
args = parser.parse_args()
if args.dataset_name is None and args.train_data_dir is None:
- raise ValueError(
- "You must specify either a dataset name from the hub or a train data directory."
- )
+ raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
return args
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -349,8 +372,7 @@ def save_model_hook(models, weights, output_dir):
def load_model_hook(models, input_dir):
if args.use_ema:
- load_model = EMAModel.from_pretrained(
- os.path.join(input_dir, "unet_ema"), UNet2DModel)
+ load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel)
ema_model.load_state_dict(load_model.state_dict())
del load_model
@@ -359,8 +381,7 @@ def load_model_hook(models, input_dir):
model = models.pop()
# load ppdiffusers style into model
- load_model = UNet2DModel.from_pretrained(
- input_dir, subfolder="unet")
+ load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet")
model.register_to_config(**load_model.config)
model.load_state_dict(load_model.state_dict())
@@ -374,21 +395,20 @@ def load_model_hook(models, input_dir):
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO, )
+ level=logging.INFO,
+ )
# Handle the repository creation
if is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
- repo_name = get_full_repo_name(
- Path(args.output_dir).name, token=args.hub_token)
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
create_repo(repo_name, exist_ok=True, token=args.hub_token)
# repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
- with open(os.path.join(args.output_dir, ".gitignore"),
- "w+") as gitignore:
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
@@ -410,14 +430,17 @@ def load_model_hook(models, input_dir):
"DownBlock2D",
"DownBlock2D",
"AttnDownBlock2D",
- "DownBlock2D", ),
+ "DownBlock2D",
+ ),
up_block_types=(
"UpBlock2D",
"AttnUpBlock2D",
"UpBlock2D",
"UpBlock2D",
"UpBlock2D",
- "UpBlock2D", ), )
+ "UpBlock2D",
+ ),
+ )
else:
config = UNet2DModel.load_config(args.model_config_name_or_path)
model = UNet2DModel.from_config(config)
@@ -431,28 +454,30 @@ def load_model_hook(models, input_dir):
inv_gamma=args.ema_inv_gamma,
power=args.ema_power,
model_cls=UNet2DModel,
- model_config=model.config, )
+ model_config=model.config,
+ )
- if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(
- ):
+ if args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
try:
model.enable_xformers_memory_efficient_attention()
except Exception as e:
logger.warn(
"Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}")
+ f" correctly and a GPU is available: {e}"
+ )
# Initialize the scheduler
- accepts_prediction_type = "prediction_type" in set(
- inspect.signature(DDPMScheduler.__init__).parameters.keys())
+ accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
if accepts_prediction_type:
noise_scheduler = DDPMScheduler(
num_train_timesteps=args.ddpm_num_steps,
beta_schedule=args.ddpm_beta_schedule,
- prediction_type=args.prediction_type, )
+ prediction_type=args.prediction_type,
+ )
else:
noise_scheduler = DDPMScheduler(
num_train_timesteps=args.ddpm_num_steps,
- beta_schedule=args.ddpm_beta_schedule, )
+ beta_schedule=args.ddpm_beta_schedule,
+ )
# Get the datasets: you can either provide your own training and evaluation files (see below)
# or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
@@ -464,31 +489,30 @@ def load_model_hook(models, input_dir):
args.dataset_name,
args.dataset_config_name,
cache_dir=args.cache_dir,
- split="train", )
+ split="train",
+ )
else:
dataset = load_dataset(
"imagefolder",
data_dir=args.train_data_dir,
cache_dir=args.cache_dir,
- split="train", )
+ split="train",
+ )
# See more about loading custom images at
# Preprocessing the datasets and DataLoaders creation.
- augmentations = transforms.Compose([
- transforms.Resize(
- args.resolution, interpolation="bilinear"),
- transforms.CenterCrop(args.resolution)
- if args.center_crop else transforms.RandomCrop(args.resolution),
- transforms.RandomHorizontalFlip()
- if args.random_flip else transforms.Lambda(lambda x: x),
- transforms.ToTensor(),
- transforms.Normalize([0.5], [0.5]),
- ])
+ augmentations = transforms.Compose(
+ [
+ transforms.Resize(args.resolution, interpolation="bilinear"),
+ transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+ transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5], [0.5]),
+ ]
+ )
def transform_images(examples):
- images = [
- augmentations(image.convert("RGB")) for image in examples["image"]
- ]
+ images = [augmentations(image.convert("RGB")) for image in examples["image"]]
return {"input": images}
# logger.info(f"Dataset size: {len(dataset)}")
@@ -498,7 +522,8 @@ def transform_images(examples):
dataset,
batch_size=args.train_batch_size,
shuffle=True,
- num_workers=args.dataloader_num_workers, )
+ num_workers=args.dataloader_num_workers,
+ )
if num_processes > 1:
model = paddle.DataParallel(model)
@@ -507,9 +532,9 @@ def transform_images(examples):
lr_scheduler = get_scheduler(
args.lr_scheduler,
learning_rate=args.learning_rate,
- num_warmup_steps=args.lr_warmup_steps *
- args.gradient_accumulation_steps,
- num_training_steps=(len(train_dataloader) * args.num_epochs), )
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=(len(train_dataloader) * args.num_epochs),
+ )
# Initialize the optimizer
optimizer = paddle.optimizer.AdamW(
@@ -519,8 +544,8 @@ def transform_images(examples):
beta2=args.adam_beta2,
weight_decay=args.adam_weight_decay,
epsilon=args.adam_epsilon,
- grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
- if args.max_grad_norm > 0 else None, )
+ grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None,
+ )
if is_main_process:
logger.info("----------- Configuration Arguments -----------")
@@ -530,22 +555,16 @@ def transform_images(examples):
writer = get_report_to(args)
# Prepare everything with our `accelerator`.
- total_batch_size = (args.train_batch_size * num_processes *
- args.gradient_accumulation_steps)
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps)
+ total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
max_train_steps = args.num_epochs * num_update_steps_per_epoch
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(dataset)}")
logger.info(f" Num Epochs = {args.num_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(
- f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {max_train_steps}")
global_step = 0
@@ -554,8 +573,7 @@ def transform_images(examples):
# Train!
for epoch in range(first_epoch, args.num_epochs):
model.train()
- progress_bar = tqdm(
- total=num_update_steps_per_epoch, disable=not is_main_process)
+ progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not is_main_process)
progress_bar.set_description(f"Epoch {epoch}")
for step, batch in enumerate(train_dataloader):
clean_images = batch["input"]
@@ -563,34 +581,30 @@ def transform_images(examples):
noise = paddle.randn(clean_images.shape)
bsz = clean_images.shape[0]
# Sample a random timestep for each image
- timesteps = paddle.randint(
- 0, noise_scheduler.config.num_train_timesteps,
- (bsz, )).cast("int64")
+ timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,)).cast("int64")
# Add noise to the clean images according to the noise magnitude at each timestep
# (this is the forward diffusion process)
- noisy_images = noise_scheduler.add_noise(clean_images, noise,
- timesteps)
+ noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
# Predict the noise residual
model_output = model(noisy_images, timesteps).sample
if args.prediction_type == "epsilon":
- loss = F.mse_loss(model_output,
- noise) # this could have different weights!
+ loss = F.mse_loss(model_output, noise) # this could have different weights!
elif args.prediction_type == "sample":
alpha_t = _extract_into_tensor(
noise_scheduler.alphas_cumprod,
timesteps,
- (clean_images.shape[0], 1, 1, 1), )
+ (clean_images.shape[0], 1, 1, 1),
+ )
snr_weights = alpha_t / (1 - alpha_t)
loss = snr_weights * F.mse_loss(
model_output, clean_images, reduction="none"
) # use SNR weighting from distillation paper
loss = loss.mean()
else:
- raise ValueError(
- f"Unsupported prediction type: {args.prediction_type}")
+ raise ValueError(f"Unsupported prediction type: {args.prediction_type}")
loss.backward()
@@ -607,13 +621,10 @@ def transform_images(examples):
if global_step % args.checkpointing_steps == 0:
if is_main_process:
- save_path = os.path.join(args.output_dir,
- f"checkpoint-{global_step}")
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
if args.use_ema:
- unwrap_model(ema_model).save_pretrained(
- os.path.join(save_path, "unet_ema"))
- unwrap_model(model).save_pretrained(
- os.path.join(save_path, "unet"))
+ unwrap_model(ema_model).save_pretrained(os.path.join(save_path, "unet_ema"))
+ unwrap_model(model).save_pretrained(os.path.join(save_path, "unet"))
logger.info(f"Saved state to {save_path}")
@@ -638,7 +649,8 @@ def transform_images(examples):
ema_model.copy_to(unet.parameters())
pipeline = DDPMPipeline(
unet=unet,
- scheduler=noise_scheduler, )
+ scheduler=noise_scheduler,
+ )
generator = paddle.Generator().manual_seed(0)
# run pipeline in inference (sample random noise and denoise)
@@ -646,7 +658,8 @@ def transform_images(examples):
generator=generator,
batch_size=args.eval_batch_size,
num_inference_steps=args.ddpm_num_inference_steps,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
if args.use_ema:
ema_model.restore(unet.parameters())
@@ -657,13 +670,15 @@ def transform_images(examples):
"test",
images_processed.transpose(0, 3, 1, 2),
epoch,
- dataformats="NHWC", )
+ dataformats="NHWC",
+ )
else:
writer.add_image(
"test",
images_processed.transpose(0, 3, 1, 2),
epoch,
- dataformats="NHWC", )
+ dataformats="NHWC",
+ )
if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
# save the model
@@ -676,7 +691,8 @@ def transform_images(examples):
pipeline = DDPMPipeline(
unet=unet,
- scheduler=noise_scheduler, )
+ scheduler=noise_scheduler,
+ )
pipeline.save_pretrained(args.output_dir)
diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py
index f8c3b7f6ce1f4..f86f792718938 100644
--- a/ppdiffusers/ppdiffusers/__init__.py
+++ b/ppdiffusers/ppdiffusers/__init__.py
@@ -17,13 +17,26 @@
from . import patches
from .configuration_utils import ConfigMixin
from .utils import (
- OptionalDependencyNotAvailable, is_einops_available,
- is_fastdeploy_available, is_inflect_available, is_k_diffusion_available,
- is_k_diffusion_version, is_librosa_available, is_note_seq_available,
- is_paddle_available, is_paddle_version, is_paddlenlp_available,
- is_paddlenlp_version, is_ppxformers_available, is_safetensors_available,
- is_scipy_available, is_torch_available, is_unidecode_available,
- is_visualdl_available, logging)
+ OptionalDependencyNotAvailable,
+ is_einops_available,
+ is_fastdeploy_available,
+ is_inflect_available,
+ is_k_diffusion_available,
+ is_k_diffusion_version,
+ is_librosa_available,
+ is_note_seq_available,
+ is_paddle_available,
+ is_paddle_version,
+ is_paddlenlp_available,
+ is_paddlenlp_version,
+ is_ppxformers_available,
+ is_safetensors_available,
+ is_scipy_available,
+ is_torch_available,
+ is_unidecode_available,
+ is_visualdl_available,
+ logging,
+)
from .version import VERSION as __version__
try:
@@ -41,32 +54,75 @@
from .utils.dummy_paddle_objects import * # noqa F403
else:
from .models import (
- AutoencoderKL, ControlNetModel, LitEma, LVDMAutoencoderKL,
- LVDMUNet3DModel, ModelMixin, MultiAdapter, PriorTransformer, T2IAdapter,
- T5FilmDecoder, Transformer2DModel, UNet1DModel, UNet2DConditionModel,
- UNet2DModel, UNet3DConditionModel, VQModel)
+ AutoencoderKL,
+ ControlNetModel,
+ LitEma,
+ LVDMAutoencoderKL,
+ LVDMUNet3DModel,
+ ModelMixin,
+ MultiAdapter,
+ PriorTransformer,
+ T2IAdapter,
+ T5FilmDecoder,
+ Transformer2DModel,
+ UNet1DModel,
+ UNet2DConditionModel,
+ UNet2DModel,
+ UNet3DConditionModel,
+ VQModel,
+ )
from .optimization import (
- get_constant_schedule, get_constant_schedule_with_warmup,
+ get_constant_schedule,
+ get_constant_schedule_with_warmup,
get_cosine_schedule_with_warmup,
get_cosine_with_hard_restarts_schedule_with_warmup,
get_linear_schedule_with_warmup,
- get_polynomial_decay_schedule_with_warmup, get_scheduler)
+ get_polynomial_decay_schedule_with_warmup,
+ get_scheduler,
+ )
from .pipelines import (
- AudioPipelineOutput, DanceDiffusionPipeline, DDIMPipeline, DDPMPipeline,
- DiffusionPipeline, DiTPipeline, ImagePipelineOutput, KarrasVePipeline,
- LDMPipeline, LDMSuperResolutionPipeline, PNDMPipeline, RePaintPipeline,
- ScoreSdeVePipeline, TextPipelineOutput)
+ AudioPipelineOutput,
+ DanceDiffusionPipeline,
+ DDIMPipeline,
+ DDPMPipeline,
+ DiffusionPipeline,
+ DiTPipeline,
+ ImagePipelineOutput,
+ KarrasVePipeline,
+ LDMPipeline,
+ LDMSuperResolutionPipeline,
+ PNDMPipeline,
+ RePaintPipeline,
+ ScoreSdeVePipeline,
+ TextPipelineOutput,
+ )
from .schedulers import (
- DDIMInverseScheduler, DDIMScheduler, DDPMScheduler,
- DEISMultistepScheduler, DPMSolverMultistepScheduler,
- DPMSolverSinglestepScheduler, DPMSolverUniDiffuserScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, IPNDMScheduler, KarrasVeScheduler,
- KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, PNDMScheduler,
- RePaintScheduler, SchedulerMixin, ScoreSdeVeScheduler, UnCLIPScheduler,
- UniPCMultistepScheduler, VQDiffusionScheduler)
- from .schedulers.preconfig import (PreconfigEulerAncestralDiscreteScheduler,
- PreconfigLMSDiscreteScheduler)
+ DDIMInverseScheduler,
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ DPMSolverUniDiffuserScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ IPNDMScheduler,
+ KarrasVeScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ PNDMScheduler,
+ RePaintScheduler,
+ SchedulerMixin,
+ ScoreSdeVeScheduler,
+ UnCLIPScheduler,
+ UniPCMultistepScheduler,
+ VQDiffusionScheduler,
+ )
+ from .schedulers.preconfig import (
+ PreconfigEulerAncestralDiscreteScheduler,
+ PreconfigLMSDiscreteScheduler,
+ )
from .training_utils import EMAModel
try:
@@ -84,36 +140,58 @@
from .utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403
else:
from .pipelines import (
- AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, AudioLDMPipeline,
- CycleDiffusionPipeline, IFImg2ImgPipeline,
- IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline,
- IFInpaintingSuperResolutionPipeline, IFPipeline,
- IFSuperResolutionPipeline, LDMTextToImagePipeline,
- LVDMTextToVideoPipeline, LVDMUncondPipeline, PaintByExamplePipeline,
- SemanticStableDiffusionPipeline, StableDiffusionAdapterPipeline,
+ AltDiffusionImg2ImgPipeline,
+ AltDiffusionPipeline,
+ AudioLDMPipeline,
+ CycleDiffusionPipeline,
+ IFImg2ImgPipeline,
+ IFImg2ImgSuperResolutionPipeline,
+ IFInpaintingPipeline,
+ IFInpaintingSuperResolutionPipeline,
+ IFPipeline,
+ IFSuperResolutionPipeline,
+ LDMTextToImagePipeline,
+ LVDMTextToVideoPipeline,
+ LVDMUncondPipeline,
+ PaintByExamplePipeline,
+ SemanticStableDiffusionPipeline,
+ StableDiffusionAdapterPipeline,
StableDiffusionAttendAndExcitePipeline,
- StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline,
- StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline,
- StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy,
+ StableDiffusionControlNetPipeline,
+ StableDiffusionDepth2ImgPipeline,
+ StableDiffusionImageVariationPipeline,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipeline,
+ StableDiffusionInpaintPipelineLegacy,
StableDiffusionInstructPix2PixPipeline,
- StableDiffusionLatentUpscalePipeline, StableDiffusionMegaPipeline,
- StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline,
- StableDiffusionPipeline, StableDiffusionPipelineAllinOne,
- StableDiffusionPipelineSafe, StableDiffusionPix2PixZeroPipeline,
- StableDiffusionSAGPipeline, StableDiffusionUpscalePipeline,
- StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline,
- TextToVideoSDPipeline, TextToVideoZeroPipeline,
- UnCLIPImageVariationPipeline, UnCLIPPipeline, UniDiffuserPipeline,
+ StableDiffusionLatentUpscalePipeline,
+ StableDiffusionMegaPipeline,
+ StableDiffusionModelEditingPipeline,
+ StableDiffusionPanoramaPipeline,
+ StableDiffusionPipeline,
+ StableDiffusionPipelineAllinOne,
+ StableDiffusionPipelineSafe,
+ StableDiffusionPix2PixZeroPipeline,
+ StableDiffusionSAGPipeline,
+ StableDiffusionUpscalePipeline,
+ StableUnCLIPImg2ImgPipeline,
+ StableUnCLIPPipeline,
+ TextToVideoSDPipeline,
+ TextToVideoZeroPipeline,
+ UnCLIPImageVariationPipeline,
+ UnCLIPPipeline,
+ UniDiffuserPipeline,
VersatileDiffusionDualGuidedPipeline,
- VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline,
- VersatileDiffusionTextToImagePipeline, VQDiffusionPipeline)
- from .pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
+ VersatileDiffusionImageVariationPipeline,
+ VersatileDiffusionPipeline,
+ VersatileDiffusionTextToImagePipeline,
+ VQDiffusionPipeline,
+ )
+ from .pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel
from .pipelines.unidiffuser.caption_decoder import CaptionDecoder
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_k_diffusion_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403
@@ -121,21 +199,22 @@
from .pipelines import StableDiffusionKDiffusionPipeline
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_fastdeploy_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403
else:
- from .pipelines import (FastDeployCycleDiffusionPipeline,
- FastDeployStableDiffusionControlNetPipeline,
- FastDeployStableDiffusionImageVariationPipeline,
- FastDeployStableDiffusionImg2ImgPipeline,
- FastDeployStableDiffusionInpaintPipeline,
- FastDeployStableDiffusionInpaintPipelineLegacy,
- FastDeployStableDiffusionMegaPipeline,
- FastDeployStableDiffusionPipeline,
- FastDeployStableDiffusionUpscalePipeline)
+ from .pipelines import (
+ FastDeployCycleDiffusionPipeline,
+ FastDeployStableDiffusionControlNetPipeline,
+ FastDeployStableDiffusionImageVariationPipeline,
+ FastDeployStableDiffusionImg2ImgPipeline,
+ FastDeployStableDiffusionInpaintPipeline,
+ FastDeployStableDiffusionInpaintPipelineLegacy,
+ FastDeployStableDiffusionMegaPipeline,
+ FastDeployStableDiffusionPipeline,
+ FastDeployStableDiffusionUpscalePipeline,
+ )
try:
if not (is_paddle_available() and is_librosa_available()):
@@ -146,8 +225,7 @@
from .pipelines import AudioDiffusionPipeline, Mel
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_note_seq_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403
@@ -155,8 +233,7 @@
from .pipelines import SpectrogramDiffusionPipeline
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_einops_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403
diff --git a/ppdiffusers/ppdiffusers/commands/env.py b/ppdiffusers/ppdiffusers/commands/env.py
index a020de6813b7d..0ad95fd647340 100644
--- a/ppdiffusers/ppdiffusers/commands/env.py
+++ b/ppdiffusers/ppdiffusers/commands/env.py
@@ -57,9 +57,7 @@ def run(self):
"Using distributed or parallel set-up in script?": "
",
}
- print(
- "\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n"
- )
+ print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
print(self.format_dict(info))
return info
diff --git a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
index d14e14711dedc..7575e5902a50e 100644
--- a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
+++ b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
@@ -20,10 +20,8 @@
def main():
- parser = ArgumentParser(
- "PPDiffusers CLI tool", usage="ppdiffusers-cli []")
- commands_parser = parser.add_subparsers(
- help="ppdiffusers-cli command helpers")
+ parser = ArgumentParser("PPDiffusers CLI tool", usage="ppdiffusers-cli []")
+ commands_parser = parser.add_subparsers(help="ppdiffusers-cli command helpers")
# Register commands
EnvironmentCommand.register_subcommand(commands_parser)
diff --git a/ppdiffusers/ppdiffusers/configuration_utils.py b/ppdiffusers/ppdiffusers/configuration_utils.py
index 2c5d4e88c84e7..551fb118afa9e 100644
--- a/ppdiffusers/ppdiffusers/configuration_utils.py
+++ b/ppdiffusers/ppdiffusers/configuration_utils.py
@@ -33,9 +33,16 @@
import numpy as np
import paddle
-from .utils import (DIFFUSERS_CACHE, PPDIFFUSERS_CACHE, DummyObject,
- bos_hf_download, deprecate, extract_commit_hash,
- http_user_agent, logging)
+from .utils import (
+ DIFFUSERS_CACHE,
+ PPDIFFUSERS_CACHE,
+ DummyObject,
+ bos_hf_download,
+ deprecate,
+ extract_commit_hash,
+ http_user_agent,
+ logging,
+)
from .utils.constants import FROM_HF_HUB
from .version import VERSION as __version__
@@ -54,36 +61,25 @@ def __init__(self, *args, **kwargs):
self.__frozen = True
def __delitem__(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
def setdefault(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
def pop(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
def update(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``update`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
def __setattr__(self, name, value):
if hasattr(self, "__frozen") and self.__frozen:
- raise Exception(
- f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
super().__setattr__(name, value)
def __setitem__(self, name, value):
if hasattr(self, "__frozen") and self.__frozen:
- raise Exception(
- f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
super().__setitem__(name, value)
@@ -112,9 +108,7 @@ class ConfigMixin:
def register_to_config(self, **kwargs):
if self.config_name is None:
- raise NotImplementedError(
- f"Make sure that {self.__class__} has defined a class name `config_name`"
- )
+ raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
# Special case for `kwargs` used in deprecation warning added to schedulers
# TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
# or solve in a more general way.
@@ -124,9 +118,8 @@ def register_to_config(self, **kwargs):
internal_dict = kwargs
else:
previous_dict = dict(self._internal_dict)
- internal_dict = { ** self._internal_dict, ** kwargs}
- logger.debug(
- f"Updating config from {previous_dict} to {internal_dict}")
+ internal_dict = {**self._internal_dict, **kwargs}
+ logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
self._internal_dict = FrozenDict(internal_dict)
@@ -137,8 +130,7 @@ def __getattr__(self, name: str) -> Any:
https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
"""
- is_in_config = "_internal_dict" in self.__dict__ and hasattr(
- self.__dict__["_internal_dict"], name)
+ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
is_attribute = name in self.__dict__
if is_in_config and not is_attribute:
@@ -147,18 +139,19 @@ def __getattr__(self, name: str) -> Any:
"direct config name access",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
return self._internal_dict[name]
- raise AttributeError(
- f"'{type(self).__name__}' object has no attribute '{name}'")
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
def save_config(
- self,
- save_directory: Union[str, os.PathLike],
- push_to_hub: bool=False,
- to_diffusers=False,
- **kwargs, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ push_to_hub: bool = False,
+ to_diffusers=False,
+ **kwargs,
+ ):
"""
Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
[`~ConfigMixin.from_config`] class method.
@@ -168,9 +161,7 @@ def save_config(
Directory where the configuration JSON file will be saved (will be created if it does not exist).
"""
if os.path.isfile(save_directory):
- raise AssertionError(
- f"Provided path ({save_directory}) should be a directory, not a file"
- )
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
@@ -182,10 +173,11 @@ def save_config(
@classmethod
def from_config(
- cls,
- config: Union[FrozenDict, Dict[str, Any]]=None,
- return_unused_kwargs=False,
- **kwargs, ):
+ cls,
+ config: Union[FrozenDict, Dict[str, Any]] = None,
+ return_unused_kwargs=False,
+ **kwargs,
+ ):
r"""
Instantiate a Python class from a config dictionary
@@ -222,9 +214,7 @@ def from_config(
config = kwargs.pop("pretrained_model_name_or_path")
if config is None:
- raise ValueError(
- "Please make sure to provide a config as the first positional argument."
- )
+ raise ValueError("Please make sure to provide a config as the first positional argument.")
# ======>
if not isinstance(config, dict):
@@ -233,24 +223,27 @@ def from_config(
deprecation_message += (
f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead."
" Otherwise, please make sure to pass a configuration dictionary instead. This functionality will"
- " be removed in v1.0.0.")
+ " be removed in v1.0.0."
+ )
elif "Model" in cls.__name__:
deprecation_message += (
f"If you were trying to load a model, please use {cls}.load_config(...) followed by"
f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary"
- " instead. This functionality will be removed in v1.0.0.")
+ " instead. This functionality will be removed in v1.0.0."
+ )
deprecate(
"config-passed-as-path",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
config, kwargs = cls.load_config(
pretrained_model_name_or_path=config,
return_unused_kwargs=True,
- **kwargs, )
+ **kwargs,
+ )
- init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config,
- **kwargs)
+ init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs)
# Allow dtype to be specified on initialization
if "dtype" in unused_kwargs:
@@ -259,8 +252,7 @@ def from_config(
# add possible deprecated kwargs
for deprecated_kwarg in cls._deprecated_kwargs:
if deprecated_kwarg in unused_kwargs:
- init_dict[deprecated_kwarg] = unused_kwargs.pop(
- deprecated_kwarg)
+ init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg)
# Return model and optionally state and/or unused_kwargs
model = cls(**init_dict)
@@ -269,7 +261,7 @@ def from_config(
model.register_to_config(**hidden_dict)
# add hidden kwargs of compatible classes to unused_kwargs
- unused_kwargs = { ** unused_kwargs, ** hidden_dict}
+ unused_kwargs = {**unused_kwargs, **hidden_dict}
if return_unused_kwargs:
return (model, unused_kwargs)
@@ -280,21 +272,19 @@ def from_config(
def get_config_dict(cls, *args, **kwargs):
deprecation_message = (
f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. This function will be"
- " removed in version v1.0.0")
- deprecate(
- "get_config_dict",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " removed in version v1.0.0"
+ )
+ deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False)
return cls.load_config(*args, **kwargs)
@classmethod
def load_config(
- cls,
- pretrained_model_name_or_path: Union[str, os.PathLike],
- return_unused_kwargs=False,
- return_commit_hash=False,
- **kwargs, ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ cls,
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ return_unused_kwargs=False,
+ return_commit_hash=False,
+ **kwargs,
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
r"""
Instantiate a Python class from a config dictionary
@@ -354,8 +344,9 @@ def load_config(
"""
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
@@ -365,7 +356,7 @@ def load_config(
_ = kwargs.pop("mirror", None)
subfolder = kwargs.pop("subfolder", None)
user_agent = kwargs.pop("user_agent", {})
- user_agent = { ** user_agent, "file_type": "config"}
+ user_agent = {**user_agent, "file_type": "config"}
user_agent = http_user_agent(user_agent)
# new add return_config_file
return_config_file = kwargs.pop("return_config_file", False)
@@ -381,17 +372,13 @@ def load_config(
if os.path.isfile(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
elif os.path.isdir(pretrained_model_name_or_path):
- if os.path.isfile(
- os.path.join(pretrained_model_name_or_path,
- cls.config_name)):
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
# Load from a PyTorch checkpoint
- config_file = os.path.join(pretrained_model_name_or_path,
- cls.config_name)
+ config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
elif subfolder is not None and os.path.isfile(
- os.path.join(pretrained_model_name_or_path, subfolder,
- cls.config_name)):
- config_file = os.path.join(pretrained_model_name_or_path,
- subfolder, cls.config_name)
+ os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
+ ):
+ config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
else:
raise EnvironmentError(
f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
@@ -409,7 +396,8 @@ def load_config(
user_agent=user_agent,
subfolder=subfolder,
revision=revision,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
try:
# Load config dict
@@ -417,23 +405,20 @@ def load_config(
commit_hash = extract_commit_hash(config_file)
except (json.JSONDecodeError, UnicodeDecodeError):
- raise EnvironmentError(
- f"It looks like the config file at '{config_file}' is not a valid JSON file."
- )
+ raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
- if not (return_unused_kwargs or return_commit_hash or
- return_config_file):
+ if not (return_unused_kwargs or return_commit_hash or return_config_file):
return config_dict
- outputs = (config_dict, )
+ outputs = (config_dict,)
if return_unused_kwargs:
- outputs += (kwargs, )
+ outputs += (kwargs,)
if return_commit_hash:
- outputs += (commit_hash, )
+ outputs += (commit_hash,)
if return_config_file:
- outputs += (config_file, )
+ outputs += (config_file,)
return outputs
@@ -462,43 +447,26 @@ def extract_init_dict(cls, config_dict, **kwargs):
ppdiffusers_library = importlib.import_module(__name__.split(".")[0])
if cls.has_compatibles:
- compatible_classes = [
- c for c in cls._get_compatibles()
- if not isinstance(c, DummyObject)
- ]
+ compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]
else:
compatible_classes = []
expected_keys_comp_cls = set()
for c in compatible_classes:
expected_keys_c = cls._get_init_keys(c)
- expected_keys_comp_cls = expected_keys_comp_cls.union(
- expected_keys_c)
- expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(
- cls)
- config_dict = {
- k: v
- for k, v in config_dict.items() if k not in expected_keys_comp_cls
- }
+ expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
+ expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
+ config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
# remove attributes from orig class that cannot be expected
orig_cls_name = config_dict.pop("_class_name", cls.__name__)
- if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library,
- orig_cls_name):
+ if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library, orig_cls_name):
orig_cls = getattr(ppdiffusers_library, orig_cls_name)
- unexpected_keys_from_orig = cls._get_init_keys(
- orig_cls) - expected_keys
- config_dict = {
- k: v
- for k, v in config_dict.items()
- if k not in unexpected_keys_from_orig
- }
+ unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
+ config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
# remove private attributes
- config_dict = {
- k: v
- for k, v in config_dict.items() if not k.startswith("_")
- }
+ config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
# 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
init_dict = {}
@@ -520,7 +488,8 @@ def extract_init_dict(cls, config_dict, **kwargs):
logger.warning(
f"The config attributes {config_dict} were passed to {cls.__name__}, "
"but are not expected and will be ignored. Please verify your "
- f"{cls.config_name} configuration file.")
+ f"{cls.config_name} configuration file."
+ )
     # 5. Give nice info if config attributes are initialized to default because they have not been passed
passed_keys = set(init_dict.keys())
@@ -530,13 +499,10 @@ def extract_init_dict(cls, config_dict, **kwargs):
)
# 6. Define unused keyword arguments
- unused_kwargs = { ** config_dict, ** kwargs}
+ unused_kwargs = {**config_dict, **kwargs}
# 7. Define "hidden" config parameters that were saved for compatible classes
- hidden_config_dict = {
- k: v
- for k, v in original_dict.items() if k not in init_dict
- }
+ hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict}
return init_dict, unused_kwargs, hidden_config_dict
@@ -546,8 +512,7 @@ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
text = reader.read()
data = json.loads(text)
if "_diffusers_version" in data and "_ppdiffusers_version" not in data:
- data["_ppdiffusers_version"] = data.pop("_diffusers_version",
- __version__)
+ data["_ppdiffusers_version"] = data.pop("_diffusers_version", __version__)
if "_diffusers_version" not in data and "_ppdiffusers_version" not in data:
data["_ppdiffusers_version"] = __version__
@@ -581,8 +546,7 @@ def to_json_string(self, to_diffusers=False) -> str:
Returns:
`str`: String containing all the attributes that make up this configuration instance in JSON format.
"""
- config_dict = self._internal_dict if hasattr(self,
- "_internal_dict") else {}
+ config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
config_dict["_class_name"] = self.__class__.__name__
# json
@@ -609,14 +573,12 @@ def to_json_saveable(value):
config_dict.pop("_ignore_files", None)
json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
if to_diffusers:
- json_string = json_string.replace(
- '"ppdiffusers"', '"diffusers"').replace(
- '"paddlenlp.transformers"', '"transformers"')
+ json_string = json_string.replace('"ppdiffusers"', '"diffusers"').replace(
+ '"paddlenlp.transformers"', '"transformers"'
+ )
return json_string
- def to_json_file(self,
- json_file_path: Union[str, os.PathLike],
- to_diffusers=False):
+ def to_json_file(self, json_file_path: Union[str, os.PathLike], to_diffusers=False):
"""
Save this instance to a JSON file.
@@ -641,41 +603,39 @@ def register_to_config(init):
def inner_init(self, *args, **kwargs):
# Ignore private kwargs in the init.
init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
- config_init_kwargs = {
- k: v
- for k, v in kwargs.items() if k.startswith("_")
- }
+ config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
if not isinstance(self, ConfigMixin):
raise RuntimeError(
f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
- "not inherit from `ConfigMixin`.")
+ "not inherit from `ConfigMixin`."
+ )
ignore = getattr(self, "ignore_for_config", [])
# Get positional arguments aligned with kwargs
new_kwargs = {}
signature = inspect.signature(init)
parameters = {
- name: p.default
- for i, (name, p) in enumerate(signature.parameters.items())
- if i > 0 and name not in ignore
+ name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
}
for arg, name in zip(args, parameters.keys()):
new_kwargs[name] = arg
# Then add all kwargs
- new_kwargs.update({
- k: init_kwargs.get(k, default)
- for k, default in parameters.items()
- if k not in ignore and k not in new_kwargs
- })
- new_kwargs = { ** config_init_kwargs, ** new_kwargs}
+ new_kwargs.update(
+ {
+ k: init_kwargs.get(k, default)
+ for k, default in parameters.items()
+ if k not in ignore and k not in new_kwargs
+ }
+ )
+ new_kwargs = {**config_init_kwargs, **new_kwargs}
getattr(self, "register_to_config")(**new_kwargs)
init(self, *args, **init_kwargs)
return inner_init
-def finfo(dtype: paddle.dtype=None):
+def finfo(dtype: paddle.dtype = None):
if dtype is None:
dtype = paddle.get_default_dtype()
@@ -699,10 +659,11 @@ class ModuleUtilsMixin:
"""
def get_extended_attention_mask(
- self,
- attention_mask: paddle.Tensor,
- input_shape: Tuple[int],
- dtype: paddle.float32=None, ) -> paddle.Tensor:
+ self,
+ attention_mask: paddle.Tensor,
+ input_shape: Tuple[int],
+ dtype: paddle.float32 = None,
+ ) -> paddle.Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
@@ -725,14 +686,15 @@ def get_extended_attention_mask(
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
- "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".
- format(input_shape, attention_mask.shape))
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+ input_shape, attention_mask.shape
+ )
+ )
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
- extended_attention_mask = (
- 1.0 - extended_attention_mask) * finfo(dtype).min
+ extended_attention_mask = (1.0 - extended_attention_mask) * finfo(dtype).min
return extended_attention_mask
diff --git a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py
index 81cca5941a71a..730f5b91dba6c 100644
--- a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py
+++ b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py
@@ -40,11 +40,12 @@ class ValueGuidedRLPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- value_function: UNet1DModel,
- unet: UNet1DModel,
- scheduler: DDPMScheduler,
- env, ):
+ self,
+ value_function: UNet1DModel,
+ unet: UNet1DModel,
+ scheduler: DDPMScheduler,
+ env,
+ ):
super().__init__()
self.value_function = value_function
self.unet = unet
@@ -89,14 +90,13 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale):
y = None
for i in self.progress_bar(self.scheduler.timesteps):
# create batch of timesteps to pass into model
- timesteps = paddle.full((batch_size, ), i, dtype=paddle.int64)
+ timesteps = paddle.full((batch_size,), i, dtype=paddle.int64)
for _ in range(n_guide_steps):
with paddle.set_grad_enabled(True):
x.stop_gradient = False
# permute to match dimension for pre-trained models
- y = self.value_function(x.transpose([0, 2, 1]),
- timesteps).sample
+ y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample
grad = paddle.autograd.grad([y.sum()], [x])[0]
posterior_variance = self.scheduler._get_variance(i)
@@ -108,24 +108,17 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale):
x = x + scale * grad
x = self.reset_x0(x, conditions, self.action_dim)
- prev_x = self.unet(x.transpose([0, 2, 1]),
- timesteps).sample.transpose([0, 2, 1])
+ prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1])
# TODO: verify deprecation of this kwarg
- x = self.scheduler.step(
- prev_x, i, x, predict_epsilon=False)["prev_sample"]
+ x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
# apply conditions to the trajectory (set the initial state)
x = self.reset_x0(x, conditions, self.action_dim)
x = self.to_paddle(x)
return x, y
- def __call__(self,
- obs,
- batch_size=64,
- planning_horizon=32,
- n_guide_steps=2,
- scale=0.1):
+ def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
# normalize the observations and create batch dimension
obs = self.normalize(obs, "observations")
obs = obs[None].repeat(batch_size, axis=0)
@@ -144,7 +137,7 @@ def __call__(self,
# sort output trajectories by value
sorted_idx = paddle.argsort(y, 0, descending=True).squeeze()
sorted_values = x[sorted_idx]
- actions = sorted_values[:, :, :self.action_dim]
+ actions = sorted_values[:, :, : self.action_dim]
actions = actions.detach().cpu().numpy()
denorm_actions = self.de_normalize(actions, key="actions")
diff --git a/ppdiffusers/ppdiffusers/image_processor.py b/ppdiffusers/ppdiffusers/image_processor.py
index 3e52c14b439c4..82f9dd5f2c682 100644
--- a/ppdiffusers/ppdiffusers/image_processor.py
+++ b/ppdiffusers/ppdiffusers/image_processor.py
@@ -48,12 +48,13 @@ class VaeImageProcessor(ConfigMixin):
@register_to_config
def __init__(
- self,
- do_resize: bool=True,
- vae_scale_factor: int=8,
- resample: str="lanczos",
- do_normalize: bool=True,
- do_convert_rgb: bool=False, ):
+ self,
+ do_resize: bool = True,
+ vae_scale_factor: int = 8,
+ resample: str = "lanczos",
+ do_normalize: bool = True,
+ do_convert_rgb: bool = False,
+ ):
super().__init__()
@staticmethod
@@ -66,26 +67,20 @@ def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image:
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
# special case for grayscale (single channel) images
- pil_images = [
- Image.fromarray(
- image.squeeze(), mode="L") for image in images
- ]
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
else:
pil_images = [Image.fromarray(image) for image in images]
return pil_images
@staticmethod
- def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]
- ) -> np.ndarray:
+ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
"""
Convert a PIL image or a list of PIL images to numpy arrays.
"""
if not isinstance(images, list):
images = [images]
- images = [
- np.array(image).astype(np.float32) / 255.0 for image in images
- ]
+ images = [np.array(image).astype(np.float32) / 255.0 for image in images]
images = np.stack(images, axis=0)
return images
@@ -132,10 +127,11 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
return image
def resize(
- self,
- image: PIL.Image.Image,
- height: Optional[int]=None,
- width: Optional[int]=None, ) -> PIL.Image.Image:
+ self,
+ image: PIL.Image.Image,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ ) -> PIL.Image.Image:
"""
Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor`
"""
@@ -144,20 +140,18 @@ def resize(
if width is None:
width = image.width
- width, height = (x - x % self.config.vae_scale_factor
- for x in (width, height)
- ) # resize to integer multiple of vae_scale_factor
- image = image.resize(
- (width, height), resample=PIL_INTERPOLATION[self.config.resample])
+ width, height = (
+ x - x % self.config.vae_scale_factor for x in (width, height)
+ ) # resize to integer multiple of vae_scale_factor
+ image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
return image
def preprocess(
- self,
- image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray],
- height: Optional[int]=None,
- width: Optional[int]=None,
- do_normalize: Optional[
- bool]=None, # new added, not exists in diffusers
+ self,
+ image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ do_normalize: Optional[bool] = None, # new added, not exists in diffusers
) -> paddle.Tensor:
"""
Preprocess the image input, accepted formats are PIL images, numpy arrays or paddle tensors"
@@ -165,8 +159,7 @@ def preprocess(
supported_formats = (PIL.Image.Image, np.ndarray, paddle.Tensor)
if isinstance(image, supported_formats):
image = [image]
- elif not (isinstance(image, list) and
- all(isinstance(i, supported_formats) for i in image)):
+ elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)):
raise ValueError(
f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(supported_formats)}"
)
@@ -180,23 +173,19 @@ def preprocess(
image = self.numpy_to_pd(image) # to pd
elif isinstance(image[0], np.ndarray):
- image = (np.concatenate(
- image, axis=0) if image[0].ndim == 4 else np.stack(
- image, axis=0))
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
image = self.numpy_to_pd(image)
_, _, height, width = image.shape
if self.config.do_resize and (
- height % self.config.vae_scale_factor != 0 or
- width % self.config.vae_scale_factor != 0):
+ height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0
+ ):
raise ValueError(
f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}"
f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor"
)
elif isinstance(image[0], paddle.Tensor):
- image = (paddle.concat(
- image, axis=0) if image[0].ndim == 4 else paddle.stack(
- image, axis=0))
+ image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0)
_, channel, height, width = image.shape
# don't need any preprocess if the image is latents
@@ -204,21 +193,21 @@ def preprocess(
return image
if self.config.do_resize and (
- height % self.config.vae_scale_factor != 0 or
- width % self.config.vae_scale_factor != 0):
+ height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0
+ ):
raise ValueError(
f"Currently we only support resizing for PIL image - please resize your paddle tensor to be divisible by {self.config.vae_scale_factor}"
f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor"
)
# expected range [0,1], normalize to [-1,1]
- do_normalize = (self.config.do_normalize
- if do_normalize is None else do_normalize)
+ do_normalize = self.config.do_normalize if do_normalize is None else do_normalize
if image.min() < 0:
warnings.warn(
"Passing `image` as paddle tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
f"when passing as paddle tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
- FutureWarning, )
+ FutureWarning,
+ )
do_normalize = False
if do_normalize:
@@ -227,10 +216,11 @@ def preprocess(
return image
def postprocess(
- self,
- image: paddle.Tensor,
- output_type: str="pil",
- do_denormalize: Optional[List[bool]]=None, ):
+ self,
+ image: paddle.Tensor,
+ output_type: str = "pil",
+ do_denormalize: Optional[List[bool]] = None,
+ ):
if not isinstance(image, paddle.Tensor):
raise ValueError(
f"Input for postprocessing is in incorrect format: {type(image)}. We only support paddle tensor"
@@ -238,12 +228,14 @@ def postprocess(
if output_type not in ["latent", "pd", "np", "pil"]:
deprecation_message = (
f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
- "`pil`, `np`, `pd`, `latent`")
+ "`pil`, `np`, `pd`, `latent`"
+ )
deprecate(
"Unsupported output_type",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
output_type = "np"
if output_type == "latent":
@@ -252,10 +244,9 @@ def postprocess(
if do_denormalize is None:
do_denormalize = [self.config.do_normalize] * image.shape[0]
- image = paddle.stack([
- self.denormalize(image[i]) if do_denormalize[i] else image[i]
- for i in range(image.shape[0])
- ])
+ image = paddle.stack(
+ [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
+ )
if output_type == "pd":
return image
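The image_processor.py hunks above are formatting-only, but they expose the full `VaeImageProcessor` surface: construction, `preprocess` (PIL/NumPy/paddle input to a normalized 4-D tensor) and `postprocess` (tensor back to `pil`, `np`, `pd` or `latent`). A minimal round-trip sketch under those signatures; the import path, the input file, and the assumption that `output_type="pil"` yields a list of PIL images are mine, not part of the diff:

```python
import PIL.Image

from ppdiffusers.image_processor import VaeImageProcessor  # assumed import path

processor = VaeImageProcessor(do_resize=True, vae_scale_factor=8, do_normalize=True)

image = PIL.Image.open("input.png").convert("RGB")  # hypothetical input file
# preprocess: resize to a multiple of vae_scale_factor and map [0, 1] -> [-1, 1],
# returning a 4-D paddle.Tensor suitable for the VAE encoder.
tensor = processor.preprocess(image)

# ... encode/denoise/decode with the pipeline's VAE and UNet here ...

# postprocess: denormalize and convert back; assumed to return a list of PIL
# images when output_type="pil".
pil_images = processor.postprocess(tensor, output_type="pil")
pil_images[0].save("roundtrip.png")
```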
diff --git a/ppdiffusers/ppdiffusers/loaders.py b/ppdiffusers/ppdiffusers/loaders.py
index 934518d67b9d6..da64eb0e6ec9d 100644
--- a/ppdiffusers/ppdiffusers/loaders.py
+++ b/ppdiffusers/ppdiffusers/loaders.py
@@ -24,16 +24,31 @@
from huggingface_hub import hf_hub_download
from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status
-from .models.attention_processor import (CustomDiffusionAttnProcessor,
- CustomDiffusionXFormersAttnProcessor,
- LoRAAttnProcessor)
+from .models.attention_processor import (
+ CustomDiffusionAttnProcessor,
+ CustomDiffusionXFormersAttnProcessor,
+ LoRAAttnProcessor,
+)
from .models.modeling_utils import convert_state_dict
-from .utils import (DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB,
- HF_HUB_OFFLINE, PPDIFFUSERS_CACHE, TEXT_ENCODER_ATTN_MODULE,
- TO_DIFFUSERS, _get_model_file, is_paddlenlp_available,
- is_safetensors_available, is_torch_available, is_torch_file,
- logging, ppdiffusers_url_download, safetensors_load,
- smart_load, torch_load)
+from .utils import (
+ DIFFUSERS_CACHE,
+ FROM_DIFFUSERS,
+ FROM_HF_HUB,
+ HF_HUB_OFFLINE,
+ PPDIFFUSERS_CACHE,
+ TEXT_ENCODER_ATTN_MODULE,
+ TO_DIFFUSERS,
+ _get_model_file,
+ is_paddlenlp_available,
+ is_safetensors_available,
+ is_torch_available,
+ is_torch_file,
+ logging,
+ ppdiffusers_url_download,
+ safetensors_load,
+ smart_load,
+ torch_load,
+)
logger = logging.get_logger(__name__)
@@ -68,11 +83,9 @@ def transpose_state_dict(state_dict, name_mapping=None):
for old_name, new_name in name_mapping.items():
k = k.replace(old_name, new_name)
if v.ndim == 2:
- new_state_dict[k] = v.T.contiguous() if hasattr(
- v, "contiguous") else v.T
+ new_state_dict[k] = v.T.contiguous() if hasattr(v, "contiguous") else v.T
else:
- new_state_dict[k] = v.contiguous() if hasattr(v,
- "contiguous") else v
+ new_state_dict[k] = v.contiguous() if hasattr(v, "contiguous") else v
return new_state_dict
@@ -110,8 +123,7 @@ def map_from(module, state_dict, *args, **kwargs):
all_keys = list(state_dict.keys())
for key in all_keys:
replace_key = remap_key(key, state_dict)
- new_key = key.replace(
- replace_key, f"layers.{module.rev_mapping[replace_key]}")
+ new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}")
state_dict[new_key] = state_dict[key]
del state_dict[key]
@@ -124,10 +136,10 @@ class UNet2DConditionLoadersMixin:
unet_name = UNET_NAME
def load_attn_procs(
- self,
- pretrained_model_name_or_path_or_dict: Union[str, Dict[
- str, paddle.Tensor]],
- **kwargs, ):
+ self,
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]],
+ **kwargs,
+ ):
r"""
Load pretrained attention processor layers into `UNet2DConditionModel`. Attention processor layers have to be
defined in
@@ -186,8 +198,9 @@ def load_attn_procs(
"""
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
@@ -202,8 +215,7 @@ def load_attn_procs(
# See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
network_alpha = kwargs.pop("network_alpha", None)
- if from_diffusers and use_safetensors and not is_safetensors_available(
- ):
+ if from_diffusers and use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
)
@@ -221,13 +233,12 @@ def load_attn_procs(
if from_diffusers:
# Let's first try to load .safetensors weights
if (use_safetensors and weight_name is None) or (
- weight_name is not None and
- weight_name.endswith(".safetensors")):
+ weight_name is not None and weight_name.endswith(".safetensors")
+ ):
try:
model_file = _get_model_file(
pretrained_model_name_or_path_or_dict,
- weights_name=weight_name or
- TORCH_LORA_WEIGHT_NAME_SAFE,
+ weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -237,7 +248,8 @@ def load_attn_procs(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
except Exception:
model_file = None
@@ -255,7 +267,8 @@ def load_attn_procs(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
else:
model_file = _get_model_file(
@@ -270,7 +283,8 @@ def load_attn_procs(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
else:
state_dict = pretrained_model_name_or_path_or_dict
@@ -279,53 +293,42 @@ def load_attn_procs(
attn_processors = {}
is_lora = all("lora" in k for k in state_dict.keys())
- is_custom_diffusion = any("custom_diffusion" in k
- for k in state_dict.keys())
+ is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
if from_diffusers or is_torch_file(model_file):
state_dict = transpose_state_dict(state_dict)
if is_lora:
is_new_lora_format = all(
- key.startswith(self.unet_name) or
- key.startswith(self.text_encoder_name)
- for key in state_dict.keys())
+ key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
+ )
if is_new_lora_format:
# Strip the `"unet"` prefix.
- is_text_encoder_present = any(
- key.startswith(self.text_encoder_name)
- for key in state_dict.keys())
+ is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
if is_text_encoder_present:
warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
warnings.warn(warn_message)
- unet_keys = [
- k for k in state_dict.keys() if k.startswith(self.unet_name)
- ]
- state_dict = {
- k.replace(f"{self.unet_name}.", ""): v
- for k, v in state_dict.items() if k in unet_keys
- }
+ unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
+ state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
lora_grouped_dict = defaultdict(dict)
for key, value in state_dict.items():
- attn_processor_key, sub_key = ".".join(key.split(
- ".")[:-3]), ".".join(key.split(".")[-3:])
+ attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
lora_grouped_dict[attn_processor_key][sub_key] = value.cast(
- dtype="float32") # we must cast this to float32
+ dtype="float32"
+ ) # we must cast this to float32
for key, value_dict in lora_grouped_dict.items():
- rank = value_dict["to_k_lora.down.weight"].shape[
- 1] # 0 -> 1, torch vs paddle nn.Linear
- cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[
- 0] # 1 -> 0, torch vs paddle nn.Linear
- hidden_size = value_dict["to_k_lora.up.weight"].shape[
- 1] # 0 -> 1, torch vs paddle nn.Linear
+ rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear
+ cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear
+ hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear
attn_processors[key] = LoRAAttnProcessor(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
rank=rank,
- network_alpha=network_alpha, )
+ network_alpha=network_alpha,
+ )
attn_processors[key].load_dict(value_dict)
elif is_custom_diffusion:
custom_diffusion_grouped_dict = defaultdict(dict)
@@ -334,16 +337,12 @@ def load_attn_procs(
custom_diffusion_grouped_dict[key] = {}
else:
if "to_out" in key:
- attn_processor_key, sub_key = ".".join(
- key.split(".")[:-3]), ".".join(
- key.split(".")[-3:])
+ attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
else:
- attn_processor_key, sub_key = ".".join(
- key.split(".")[:-2]), ".".join(
- key.split(".")[-2:])
- custom_diffusion_grouped_dict[attn_processor_key][
- sub_key] = value.cast(
- dtype="float32") # we must cast this to float32
+ attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
+ custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value.cast(
+ dtype="float32"
+ ) # we must cast this to float32
for key, value_dict in custom_diffusion_grouped_dict.items():
if len(value_dict) == 0:
@@ -351,44 +350,42 @@ def load_attn_procs(
train_kv=False,
train_q_out=False,
hidden_size=None,
- cross_attention_dim=None, )
+ cross_attention_dim=None,
+ )
else:
- cross_attention_dim = value_dict[
- "to_k_custom_diffusion.weight"].shape[
- 0] # 1 -> 0, torch vs paddle nn.Linear
- hidden_size = value_dict[
- "to_k_custom_diffusion.weight"].shape[
- 1] # 0 -> 1, torch vs paddle nn.Linear
- train_q_out = (True if
- "to_q_custom_diffusion.weight" in value_dict
- else False)
+ cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[
+ 0
+ ] # 1 -> 0, torch vs paddle nn.Linear
+ hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[
+ 1
+ ] # 0 -> 1, torch vs paddle nn.Linear
+ train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
attn_processors[key] = CustomDiffusionAttnProcessor(
train_kv=True,
train_q_out=train_q_out,
hidden_size=hidden_size,
- cross_attention_dim=cross_attention_dim, )
+ cross_attention_dim=cross_attention_dim,
+ )
attn_processors[key].load_dict(value_dict)
else:
raise ValueError(
f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
)
# set correct dtype & device
- attn_processors = {
- k: v.to(dtype=self.dtype)
- for k, v in attn_processors.items()
- }
+ attn_processors = {k: v.to(dtype=self.dtype) for k, v in attn_processors.items()}
# set layers
self.set_attn_processor(attn_processors)
def save_attn_procs(
- self,
- save_directory: Union[str, os.PathLike],
- is_main_process: bool=True,
- weight_name: str=None,
- save_function: Callable=None,
- safe_serialization: bool=False,
- to_diffusers: Optional[bool]=None, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ is_main_process: bool = True,
+ weight_name: str = None,
+ save_function: Callable = None,
+ safe_serialization: bool = False,
+ to_diffusers: Optional[bool] = None,
+ ):
r"""
Save an attention processor to a directory, so that it can be re-loaded using the
`[`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`]` method.
@@ -413,34 +410,33 @@ def save_attn_procs(
"""
if to_diffusers is None:
to_diffusers = TO_DIFFUSERS
- if to_diffusers and safe_serialization and not is_safetensors_available(
- ):
- raise ImportError(
- "`safe_serialization` requires the `safetensors library: `pip install safetensors`."
- )
+ if to_diffusers and safe_serialization and not is_safetensors_available():
+            raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.")
if os.path.isfile(save_directory):
- logger.error(
- f"Provided path ({save_directory}) should be a directory, not a file"
- )
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
is_custom_diffusion = any(
- isinstance(x, (CustomDiffusionAttnProcessor,
- CustomDiffusionXFormersAttnProcessor))
- for (_, x) in self.attn_processors.items())
+ isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
+ for (_, x) in self.attn_processors.items()
+ )
if is_custom_diffusion:
- model_to_save = AttnProcsLayers({
- y: x
- for (y, x) in self.attn_processors.items()
- if isinstance(
- x,
- (
- CustomDiffusionAttnProcessor,
- CustomDiffusionXFormersAttnProcessor, ), )
- })
+ model_to_save = AttnProcsLayers(
+ {
+ y: x
+ for (y, x) in self.attn_processors.items()
+ if isinstance(
+ x,
+ (
+ CustomDiffusionAttnProcessor,
+ CustomDiffusionXFormersAttnProcessor,
+ ),
+ )
+ }
+ )
state_dict = model_to_save.state_dict()
for name, attn in self.attn_processors.items():
if len(attn.state_dict()) == 0:
@@ -452,16 +448,13 @@ def save_attn_procs(
if weight_name is None:
if to_diffusers:
if safe_serialization:
- weight_name = (TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE
- if is_custom_diffusion else
- TORCH_LORA_WEIGHT_NAME_SAFE)
+ weight_name = (
+ TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME_SAFE
+ )
else:
- weight_name = (TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME
- if is_custom_diffusion else
- TORCH_LORA_WEIGHT_NAME)
+ weight_name = TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME
else:
- weight_name = (PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if
- is_custom_diffusion else PADDLE_LORA_WEIGHT_NAME)
+ weight_name = PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else PADDLE_LORA_WEIGHT_NAME
# choose save_function
if save_function is None:
@@ -469,16 +462,13 @@ def save_attn_procs(
if safe_serialization:
if is_torch_available():
_save_function = safetensors.torch.save_file
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
else:
_save_function = safetensors.numpy.save_file
- state_dict = convert_state_dict(
- state_dict, framework="numpy")
+ state_dict = convert_state_dict(state_dict, framework="numpy")
def save_function(weights, filename):
- return _save_function(
- weights, filename, metadata={"format": "pt"})
+ return _save_function(weights, filename, metadata={"format": "pt"})
else:
if not is_torch_available():
@@ -486,8 +476,7 @@ def save_function(weights, filename):
"`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`."
)
save_function = torch.save
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
state_dict = transpose_state_dict(state_dict)
else:
save_function = paddle.save
@@ -495,9 +484,7 @@ def save_function(weights, filename):
# Save the model
save_function(state_dict, os.path.join(save_directory, weight_name))
- logger.info(
- f"Model weights saved in {os.path.join(save_directory, weight_name)}"
- )
+ logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
class TextualInversionLoaderMixin:
@@ -505,9 +492,7 @@ class TextualInversionLoaderMixin:
Mixin class for loading textual inversion tokens and embeddings to the tokenizer and text encoder.
"""
- def maybe_convert_prompt(self,
- prompt: Union[str, List[str]],
- tokenizer: "PretrainedTokenizer"):
+ def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PretrainedTokenizer"):
r"""
Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
@@ -533,9 +518,7 @@ def maybe_convert_prompt(self,
return prompts
- def _maybe_convert_prompt(self,
- prompt: str,
- tokenizer: "PretrainedTokenizer"):
+ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"):
r"""
Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
@@ -563,10 +546,11 @@ def _maybe_convert_prompt(self,
return prompt
def load_textual_inversion(
- self,
- pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]],
- token: Optional[str]=None,
- **kwargs, ):
+ self,
+ pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]],
+ token: Optional[str] = None,
+ **kwargs,
+ ):
r"""
Load textual inversion embeddings into the text encoder of stable diffusion pipelines. Both `diffusers` and
`Automatic1111` formats are supported (see example below).
@@ -643,20 +627,21 @@ def load_textual_inversion(
image.save("character.png")
```
"""
- if not hasattr(self, "tokenizer") or not isinstance(
- self.tokenizer, PretrainedTokenizer):
+ if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PretrainedTokenizer):
raise ValueError(
f"{self.__class__.__name__} requires `self.tokenizer` of type `PretrainedTokenizer` for calling"
- f" `{self.load_textual_inversion.__name__}`")
+ f" `{self.load_textual_inversion.__name__}`"
+ )
- if not hasattr(self, "text_encoder") or not isinstance(
- self.text_encoder, PretrainedModel):
+ if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder, PretrainedModel):
raise ValueError(
f"{self.__class__.__name__} requires `self.text_encoder` of type `PretrainedModel` for calling"
- f" `{self.load_textual_inversion.__name__}`")
+ f" `{self.load_textual_inversion.__name__}`"
+ )
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
@@ -668,8 +653,7 @@ def load_textual_inversion(
weight_name = kwargs.pop("weight_name", None)
use_safetensors = kwargs.pop("use_safetensors", None)
- if from_diffusers and use_safetensors and not is_safetensors_available(
- ):
+ if from_diffusers and use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
)
@@ -685,13 +669,12 @@ def load_textual_inversion(
# Let's first try to load .safetensors weights
if from_diffusers:
if (use_safetensors and weight_name is None) or (
- weight_name is not None and
- weight_name.endswith(".safetensors")):
+ weight_name is not None and weight_name.endswith(".safetensors")
+ ):
try:
model_file = _get_model_file(
pretrained_model_name_or_path,
- weights_name=weight_name or
- TORCH_TEXT_INVERSION_NAME_SAFE,
+ weights_name=weight_name or TORCH_TEXT_INVERSION_NAME_SAFE,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -701,7 +684,8 @@ def load_textual_inversion(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = safetensors_load(model_file)
except Exception:
model_file = None
@@ -719,7 +703,8 @@ def load_textual_inversion(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = torch_load(model_file)
else:
model_file = _get_model_file(
@@ -734,7 +719,8 @@ def load_textual_inversion(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
if is_torch_file(model_file):
try:
state_dict = safetensors_load(model_file)
@@ -759,9 +745,7 @@ def load_textual_inversion(
embedding = state_dict["string_to_param"]["*"]
if token is not None and loaded_token != token:
- logger.warn(
- f"The loaded token: {loaded_token} is overwritten by the passed token {token}."
- )
+ logger.warn(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.")
else:
token = loaded_token
@@ -795,14 +779,11 @@ def load_textual_inversion(
is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1
if is_multi_vector:
- tokens = [token] + [
- f"{token}_{i}" for i in range(1, embedding.shape[0])
- ]
+ tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
embeddings = [e for e in embedding] # noqa: C416
else:
tokens = [token]
- embeddings = [embedding[0]] if len(
- embedding.shape) > 1 else [embedding]
+ embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding]
# add tokens and get ids
self.tokenizer.add_tokens(tokens)
@@ -812,8 +793,7 @@ def load_textual_inversion(
self.text_encoder.resize_token_embeddings(len(self.tokenizer))
with paddle.no_grad():
for token_id, embedding in zip(token_ids, embeddings):
- self.text_encoder.get_input_embeddings().weight[
- token_id] = embedding
+ self.text_encoder.get_input_embeddings().weight[token_id] = embedding
logger.info(f"Loaded textual inversion embedding for {token}.")
@@ -830,10 +810,10 @@ class LoraLoaderMixin:
unet_name = UNET_NAME
def load_lora_weights(
- self,
- pretrained_model_name_or_path_or_dict: Union[str, Dict[
- str, paddle.Tensor]],
- **kwargs, ):
+ self,
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]],
+ **kwargs,
+ ):
r"""
Load pretrained attention processor layers (such as LoRA) into [`UNet2DConditionModel`] and
[`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)).
@@ -885,8 +865,9 @@ def load_lora_weights(
# Load the main state dict first which has the LoRA layers for either of
# UNet and text encoder or both.
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
@@ -901,8 +882,7 @@ def load_lora_weights(
# set lora scale to a reasonable default
self._lora_scale = 1.0
- if from_diffusers and use_safetensors and not is_safetensors_available(
- ):
+ if from_diffusers and use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
)
@@ -920,13 +900,12 @@ def load_lora_weights(
if from_diffusers:
# Let's first try to load .safetensors weights
if (use_safetensors and weight_name is None) or (
- weight_name is not None and
- weight_name.endswith(".safetensors")):
+ weight_name is not None and weight_name.endswith(".safetensors")
+ ):
try:
model_file = _get_model_file(
pretrained_model_name_or_path_or_dict,
- weights_name=weight_name or
- TORCH_LORA_WEIGHT_NAME_SAFE,
+ weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -936,7 +915,8 @@ def load_lora_weights(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
except Exception:
model_file = None
@@ -954,7 +934,8 @@ def load_lora_weights(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
else:
model_file = _get_model_file(
@@ -969,7 +950,8 @@ def load_lora_weights(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
else:
state_dict = pretrained_model_name_or_path_or_dict
@@ -979,45 +961,39 @@ def load_lora_weights(
# Convert kohya-ss Style LoRA attn procs to ppdiffusers attn procs
network_alpha = None
- if all((k.startswith("lora_te_") or k.startswith("lora_unet_"))
- for k in state_dict.keys()):
- state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(
- state_dict)
+ if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()):
+ state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(state_dict)
from_diffusers = True
# If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
# then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
# their prefixes.
keys = list(state_dict.keys())
- if all(
- key.startswith(self.unet_name) or
- key.startswith(self.text_encoder_name) for key in keys):
+ if all(key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in keys):
# Load the layers corresponding to UNet.
unet_keys = [k for k in keys if k.startswith(self.unet_name)]
logger.info(f"Loading {self.unet_name}.")
unet_lora_state_dict = {
- k.replace(f"{self.unet_name}.", ""): v
- for k, v in state_dict.items() if k in unet_keys
+ k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys
}
self.unet.load_attn_procs(
unet_lora_state_dict,
network_alpha=network_alpha,
- from_diffusers=from_diffusers, )
+ from_diffusers=from_diffusers,
+ )
# Load the layers corresponding to text encoder and make necessary adjustments.
- text_encoder_keys = [
- k for k in keys if k.startswith(self.text_encoder_name)
- ]
+ text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)]
text_encoder_lora_state_dict = {
- k.replace(f"{self.text_encoder_name}.", ""): v
- for k, v in state_dict.items() if k in text_encoder_keys
+ k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
}
if len(text_encoder_lora_state_dict) > 0:
logger.info(f"Loading {self.text_encoder_name}.")
attn_procs_text_encoder = self._load_text_encoder_attn_procs(
text_encoder_lora_state_dict,
network_alpha=network_alpha,
- from_diffusers=from_diffusers, )
+ from_diffusers=from_diffusers,
+ )
self._modify_text_encoder(attn_procs_text_encoder)
# save lora attn procs of text encoder so that it can be easily retrieved
@@ -1026,13 +1002,9 @@ def load_lora_weights(
# Otherwise, we're dealing with the old format. This means the `state_dict` should only
# contain the module names of the `unet` as its keys WITHOUT any prefix.
elif not all(
- key.startswith(self.unet_name) or
- key.startswith(self.text_encoder_name)
- for key in state_dict.keys()):
- self.unet.load_attn_procs(
- state_dict,
- network_alpha=network_alpha,
- from_diffusers=from_diffusers)
+ key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
+ ):
+ self.unet.load_attn_procs(state_dict, network_alpha=network_alpha, from_diffusers=from_diffusers)
warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`."
warnings.warn(warn_message)
@@ -1050,15 +1022,13 @@ def text_encoder_lora_attn_procs(self):
def _remove_text_encoder_monkey_patch(self):
# Loop over the nn.MultiHeadAttention module of text_encoder
- for name, attn_module in self.text_encoder.named_sublayers(
- include_self=True):
+ for name, attn_module in self.text_encoder.named_sublayers(include_self=True):
if name.endswith(TEXT_ENCODER_ATTN_MODULE):
# Loop over the LoRA layers
for (
- _,
- text_encoder_attr,
- ) in self._lora_attn_processor_attr_to_text_encoder_attr.items(
- ):
+ _,
+ text_encoder_attr,
+ ) in self._lora_attn_processor_attr_to_text_encoder_attr.items():
# Retrieve the q/k/v/out projection of nn.MultiHeadAttention
module = attn_module.get_sublayer(text_encoder_attr)
if hasattr(module, "old_forward"):
@@ -1071,8 +1041,7 @@ def _remove_text_encoder_monkey_patch(self):
# del processor
delattr(attn_module, "processor")
- def _modify_text_encoder(self,
- attn_processors: Dict[str, LoRAAttnProcessor]):
+ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]):
r"""
Monkey-patches the forward passes of attention modules of the text encoder.
@@ -1085,19 +1054,16 @@ def _modify_text_encoder(self,
self._remove_text_encoder_monkey_patch()
# Loop over the nn.MultiHeadAttention module of text_encoder
- for name, attn_module in self.text_encoder.named_sublayers(
- include_self=True):
+ for name, attn_module in self.text_encoder.named_sublayers(include_self=True):
if name.endswith(TEXT_ENCODER_ATTN_MODULE):
# Loop over the LoRA layers
for (
- attn_proc_attr,
- text_encoder_attr,
- ) in self._lora_attn_processor_attr_to_text_encoder_attr.items(
- ):
+ attn_proc_attr,
+ text_encoder_attr,
+ ) in self._lora_attn_processor_attr_to_text_encoder_attr.items():
# Retrieve the q/k/v/out projection of nn.MultiHeadAttention and its corresponding LoRA layer.
module = attn_module.get_sublayer(text_encoder_attr)
- lora_layer = attn_processors[name].get_sublayer(
- attn_proc_attr)
+ lora_layer = attn_processors[name].get_sublayer(attn_proc_attr)
# save old_forward to module that can be used to remove monkey-patch
old_forward = module.old_forward = module.forward
@@ -1105,8 +1071,7 @@ def _modify_text_encoder(self,
# for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060
def make_new_forward(old_forward, lora_layer):
def new_forward(x):
- result = old_forward(
- x) + self.lora_scale * lora_layer(x)
+ result = old_forward(x) + self.lora_scale * lora_layer(x)
return result
return new_forward
@@ -1127,10 +1092,10 @@ def _lora_attn_processor_attr_to_text_encoder_attr(self):
}
def _load_text_encoder_attn_procs(
- self,
- pretrained_model_name_or_path_or_dict: Union[str, Dict[
- str, paddle.Tensor]],
- **kwargs, ):
+ self,
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]],
+ **kwargs,
+ ):
r"""
Load pretrained attention processor layers for
[`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
@@ -1184,8 +1149,9 @@ def _load_text_encoder_attn_procs(
"""
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
@@ -1198,8 +1164,7 @@ def _load_text_encoder_attn_procs(
use_safetensors = kwargs.pop("use_safetensors", None)
network_alpha = kwargs.pop("network_alpha", None)
- if from_diffusers and use_safetensors and not is_safetensors_available(
- ):
+ if from_diffusers and use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
)
@@ -1215,13 +1180,12 @@ def _load_text_encoder_attn_procs(
if from_diffusers:
# Let's first try to load .safetensors weights
if (use_safetensors and weight_name is None) or (
- weight_name is not None and
- weight_name.endswith(".safetensors")):
+ weight_name is not None and weight_name.endswith(".safetensors")
+ ):
try:
model_file = _get_model_file(
pretrained_model_name_or_path_or_dict,
- weights_name=weight_name or
- TORCH_LORA_WEIGHT_NAME_SAFE,
+ weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -1231,7 +1195,8 @@ def _load_text_encoder_attn_procs(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
except Exception:
model_file = None
@@ -1249,7 +1214,8 @@ def _load_text_encoder_attn_procs(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
else:
model_file = _get_model_file(
@@ -1264,7 +1230,8 @@ def _load_text_encoder_attn_procs(
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
state_dict = smart_load(model_file)
else:
state_dict = pretrained_model_name_or_path_or_dict
@@ -1275,55 +1242,48 @@ def _load_text_encoder_attn_procs(
is_lora = all("lora" in k for k in state_dict.keys())
if from_diffusers or is_torch_file(model_file):
- state_dict = transpose_state_dict(
- state_dict, name_mapping={".encoder.": ".transformer."})
+ state_dict = transpose_state_dict(state_dict, name_mapping={".encoder.": ".transformer."})
if is_lora:
lora_grouped_dict = defaultdict(dict)
for key, value in state_dict.items():
- attn_processor_key, sub_key = ".".join(key.split(
- ".")[:-3]), ".".join(key.split(".")[-3:])
+ attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
lora_grouped_dict[attn_processor_key][sub_key] = value.cast(
- dtype="float32") # we must cast this to float32
+ dtype="float32"
+ ) # we must cast this to float32
for key, value_dict in lora_grouped_dict.items():
- rank = value_dict["to_k_lora.down.weight"].shape[
- 1] # 0 -> 1, torch vs paddle nn.Linear
- cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[
- 0] # 1 -> 0, torch vs paddle nn.Linear
- hidden_size = value_dict["to_k_lora.up.weight"].shape[
- 1] # 0 -> 1, torch vs paddle nn.Linear
+ rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear
+ cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear
+ hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear
attn_processors[key] = LoRAAttnProcessor(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
rank=rank,
- network_alpha=network_alpha, )
+ network_alpha=network_alpha,
+ )
attn_processors[key].load_dict(value_dict)
else:
- raise ValueError(
- f"{model_file} does not seem to be in the correct format expected by LoRA training."
- )
+ raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.")
# set correct dtype & device
- attn_processors = {
- k: v.to(dtype=self.text_encoder.dtype)
- for k, v in attn_processors.items()
- }
+ attn_processors = {k: v.to(dtype=self.text_encoder.dtype) for k, v in attn_processors.items()}
return attn_processors
@classmethod
def save_lora_weights(
- self,
- save_directory: Union[str, os.PathLike],
- unet_lora_layers: Dict[str, nn.Layer]=None,
- text_encoder_lora_layers: Dict[str, nn.Layer]=None,
- is_main_process: bool=True,
- weight_name: str=None,
- save_function: Callable=None,
- safe_serialization: bool=False,
- to_diffusers: Optional[bool]=None, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ unet_lora_layers: Dict[str, nn.Layer] = None,
+ text_encoder_lora_layers: Dict[str, nn.Layer] = None,
+ is_main_process: bool = True,
+ weight_name: str = None,
+ save_function: Callable = None,
+ safe_serialization: bool = False,
+ to_diffusers: Optional[bool] = None,
+ ):
r"""
Save the LoRA parameters corresponding to the UNet and the text encoder.
Arguments:
@@ -1347,16 +1307,11 @@ def save_lora_weights(
"""
if to_diffusers is None:
to_diffusers = TO_DIFFUSERS
- if to_diffusers and safe_serialization and not is_safetensors_available(
- ):
- raise ImportError(
- "`safe_serialization` requires the `safetensors library: `pip install safetensors`."
- )
+ if to_diffusers and safe_serialization and not is_safetensors_available():
+            raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.")
if os.path.isfile(save_directory):
- logger.error(
- f"Provided path ({save_directory}) should be a directory, not a file"
- )
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
@@ -1372,8 +1327,7 @@ def save_lora_weights(
if text_encoder_lora_layers is not None:
text_encoder_lora_state_dict = {
f"{self.text_encoder_name}.{module_name}": param
- for module_name, param in text_encoder_lora_layers.state_dict()
- .items()
+ for module_name, param in text_encoder_lora_layers.state_dict().items()
}
state_dict.update(text_encoder_lora_state_dict)
# TODO junnyu, rename paramaters.
@@ -1394,16 +1348,13 @@ def save_lora_weights(
if safe_serialization:
if is_torch_available():
_save_function = safetensors.torch.save_file
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
else:
_save_function = safetensors.numpy.save_file
- state_dict = convert_state_dict(
- state_dict, framework="numpy")
+ state_dict = convert_state_dict(state_dict, framework="numpy")
def save_function(weights, filename):
- return _save_function(
- weights, filename, metadata={"format": "pt"})
+ return _save_function(weights, filename, metadata={"format": "pt"})
else:
if not is_torch_available():
@@ -1411,17 +1362,13 @@ def save_function(weights, filename):
"`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`."
)
save_function = torch.save
- state_dict = convert_state_dict(
- state_dict, framework="torch")
- state_dict = transpose_state_dict(
- state_dict, name_mapping={".transformer.": ".encoder."})
+ state_dict = convert_state_dict(state_dict, framework="torch")
+ state_dict = transpose_state_dict(state_dict, name_mapping={".transformer.": ".encoder."})
else:
save_function = paddle.save
save_function(state_dict, os.path.join(save_directory, weight_name))
- logger.info(
- f"Model weights saved in {os.path.join(save_directory, weight_name)}"
- )
+ logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
def _convert_kohya_lora_to_diffusers(self, state_dict):
unet_state_dict = {}
@@ -1442,62 +1389,36 @@ def _convert_kohya_lora_to_diffusers(self, state_dict):
raise ValueError("Network alpha is not consistent")
if lora_name.startswith("lora_unet_"):
- diffusers_name = key.replace("lora_unet_", "").replace("_",
- ".")
- diffusers_name = diffusers_name.replace("down.blocks",
- "down_blocks")
- diffusers_name = diffusers_name.replace("mid.block",
- "mid_block")
- diffusers_name = diffusers_name.replace("up.blocks",
- "up_blocks")
- diffusers_name = diffusers_name.replace(
- "transformer.blocks", "transformer_blocks")
- diffusers_name = diffusers_name.replace("to.q.lora",
- "to_q_lora")
- diffusers_name = diffusers_name.replace("to.k.lora",
- "to_k_lora")
- diffusers_name = diffusers_name.replace("to.v.lora",
- "to_v_lora")
- diffusers_name = diffusers_name.replace("to.out.0.lora",
- "to_out_lora")
+ diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
+ diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
+ diffusers_name = diffusers_name.replace("mid.block", "mid_block")
+ diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
+ diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
+ diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
+ diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
+ diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
+ diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
if "transformer_blocks" in diffusers_name:
if "attn1" in diffusers_name or "attn2" in diffusers_name:
- diffusers_name = diffusers_name.replace(
- "attn1", "attn1.processor")
- diffusers_name = diffusers_name.replace(
- "attn2", "attn2.processor")
+ diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
+ diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
unet_state_dict[diffusers_name] = value
- unet_state_dict[diffusers_name.replace(
- ".down.", ".up.")] = state_dict[lora_name_up]
+ unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
elif lora_name.startswith("lora_te_"):
- diffusers_name = key.replace("lora_te_", "").replace("_",
- ".")
- diffusers_name = diffusers_name.replace("text.model",
- "text_model")
- diffusers_name = diffusers_name.replace("self.attn",
- "self_attn")
- diffusers_name = diffusers_name.replace("q.proj.lora",
- "to_q_lora")
- diffusers_name = diffusers_name.replace("k.proj.lora",
- "to_k_lora")
- diffusers_name = diffusers_name.replace("v.proj.lora",
- "to_v_lora")
- diffusers_name = diffusers_name.replace("out.proj.lora",
- "to_out_lora")
+ diffusers_name = key.replace("lora_te_", "").replace("_", ".")
+ diffusers_name = diffusers_name.replace("text.model", "text_model")
+ diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+ diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+ diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+ diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+ diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
if "self_attn" in diffusers_name:
te_state_dict[diffusers_name] = value
- te_state_dict[diffusers_name.replace(
- ".down.", ".up.")] = state_dict[lora_name_up]
+ te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
- unet_state_dict = {
- f"{UNET_NAME}.{module_name}": params
- for module_name, params in unet_state_dict.items()
- }
- te_state_dict = {
- f"{TEXT_ENCODER_NAME}.{module_name}": params
- for module_name, params in te_state_dict.items()
- }
- new_state_dict = { ** unet_state_dict, ** te_state_dict}
+ unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()}
+ te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()}
+ new_state_dict = {**unet_state_dict, **te_state_dict}
return new_state_dict, network_alpha
@@ -1582,12 +1503,14 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
```
"""
# import here to avoid circular dependency
- from .pipelines.stable_diffusion.convert_from_ckpt import \
- download_from_original_stable_diffusion_ckpt
+ from .pipelines.stable_diffusion.convert_from_ckpt import (
+ download_from_original_stable_diffusion_ckpt,
+ )
from_hf_hub = "huggingface.co" in pretrained_model_link_or_path or "hf.co"
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
resume_download = kwargs.pop("resume_download", False)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
@@ -1631,22 +1554,20 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
pretrained_model_link_or_path = str(pretrained_model_link_or_path)
if os.path.isfile(pretrained_model_link_or_path):
checkpoint_path = pretrained_model_link_or_path
- elif pretrained_model_link_or_path.startswith(
- "http://") or pretrained_model_link_or_path.startswith(
- "https://"):
+ elif pretrained_model_link_or_path.startswith("http://") or pretrained_model_link_or_path.startswith(
+ "https://"
+ ):
# HF Hub models
- if any(p in pretrained_model_link_or_path
- for p in ["huggingface.co", "hf.co"]):
+ if any(p in pretrained_model_link_or_path for p in ["huggingface.co", "hf.co"]):
# remove huggingface url
for prefix in [
- "https://huggingface.co/",
- "huggingface.co/",
- "hf.co/",
- "https://hf.co/",
+ "https://huggingface.co/",
+ "huggingface.co/",
+ "hf.co/",
+ "https://hf.co/",
]:
if pretrained_model_link_or_path.startswith(prefix):
- pretrained_model_link_or_path = pretrained_model_link_or_path[
- len(prefix):]
+ pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
# Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
ckpt_path = Path(pretrained_model_link_or_path)
@@ -1656,10 +1577,10 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
file_path = str(Path().joinpath(*ckpt_path.parts[2:]))
if file_path.startswith("blob/"):
- file_path = file_path[len("blob/"):]
+ file_path = file_path[len("blob/") :]
if file_path.startswith("main/"):
- file_path = file_path[len("main/"):]
+ file_path = file_path[len("main/") :]
checkpoint_path = hf_hub_download(
repo_id,
@@ -1670,17 +1591,18 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
local_files_only=local_files_only,
use_auth_token=use_auth_token,
revision=revision,
- force_download=force_download, )
+ force_download=force_download,
+ )
else:
checkpoint_path = ckpt_path
else:
checkpoint_path = ppdiffusers_url_download(
pretrained_model_link_or_path,
cache_dir=cache_dir,
- filename=http_file_name(pretrained_model_link_or_path)
- .strip('"'),
+ filename=http_file_name(pretrained_model_link_or_path).strip('"'),
force_download=force_download,
- resume_download=resume_download, )
+ resume_download=resume_download,
+ )
else:
checkpoint_path = pretrained_model_link_or_path
@@ -1697,18 +1619,20 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
upcast_attention=upcast_attention,
load_safety_checker=load_safety_checker,
prediction_type=prediction_type,
- paddle_dtype=paddle_dtype, )
+ paddle_dtype=paddle_dtype,
+ )
return pipe
def http_file_name(
- url: str,
- *,
- proxies=None,
- headers: Optional[Dict[str, str]]=None,
- timeout=10.0,
- max_retries=0, ):
+ url: str,
+ *,
+ proxies=None,
+ headers: Optional[Dict[str, str]] = None,
+ timeout=10.0,
+ max_retries=0,
+):
"""
Get a remote file name.
"""
@@ -1720,7 +1644,8 @@ def http_file_name(
proxies=proxies,
headers=headers,
timeout=timeout,
- max_retries=max_retries, )
+ max_retries=max_retries,
+ )
hf_raise_for_status(r)
displayed_name = url.split("/")[-1]
content_disposition = r.headers.get("Content-Disposition")
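The loaders.py hunks above likewise only reflow the `UNet2DConditionLoadersMixin`, `TextualInversionLoaderMixin`, `LoraLoaderMixin` and `from_ckpt` code paths. A hedged sketch of how a pipeline built on these mixins is typically driven; the pipeline class, repo ids and file names below are placeholders, and only the method names and keyword arguments visible in the diff are relied on:

```python
from ppdiffusers import StableDiffusionPipeline  # assumed pipeline exposing the mixins above

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# LoRA layers for the UNet and/or text encoder; from_diffusers=True takes the
# torch -> paddle transpose path (transpose_state_dict) shown above.
pipe.load_lora_weights(
    "some-user/some-lora-repo",  # placeholder repo id
    weight_name="pytorch_lora_weights.safetensors",
    from_diffusers=True,
)

# Textual inversion embedding bound to an explicit placeholder token.
pipe.load_textual_inversion("some-user/some-embedding", token="<my-concept>")

# Single-file checkpoints (local path or URL) go through the from_ckpt classmethod.
pipe_from_ckpt = StableDiffusionPipeline.from_ckpt(
    "https://huggingface.co/some-user/some-model/blob/main/model.safetensors"  # placeholder URL
)

image = pipe("a photo of <my-concept> riding a bicycle").images[0]
image.save("lora_ti_sample.png")
```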
diff --git a/ppdiffusers/ppdiffusers/models/__init__.py b/ppdiffusers/ppdiffusers/models/__init__.py
index 3269f70a0217e..19d5a1b254b83 100644
--- a/ppdiffusers/ppdiffusers/models/__init__.py
+++ b/ppdiffusers/ppdiffusers/models/__init__.py
@@ -14,8 +14,11 @@
# limitations under the License.
# flake8: noqa
-from ..utils.import_utils import (OptionalDependencyNotAvailable,
- is_einops_available, is_paddle_available)
+from ..utils.import_utils import (
+ OptionalDependencyNotAvailable,
+ is_einops_available,
+ is_paddle_available,
+)
if is_paddle_available():
from .adapter import MultiAdapter, T2IAdapter
diff --git a/ppdiffusers/ppdiffusers/models/adapter.py b/ppdiffusers/ppdiffusers/models/adapter.py
index f51292032a59c..639118f29b348 100644
--- a/ppdiffusers/ppdiffusers/models/adapter.py
+++ b/ppdiffusers/ppdiffusers/models/adapter.py
@@ -22,15 +22,7 @@
class BottleneckResnetBlock(paddle.nn.Layer):
- def __init__(self,
- in_c,
- mid_c,
- out_c,
- down,
- ksize=3,
- sk=False,
- use_conv=True,
- proj_ksize=1):
+ def __init__(self, in_c, mid_c, out_c, down, ksize=3, sk=False, use_conv=True, proj_ksize=1):
super().__init__()
ps = ksize // 2
proj_pad = proj_ksize // 2
@@ -40,7 +32,8 @@ def __init__(self,
out_channels=mid_c,
kernel_size=proj_ksize,
stride=1,
- padding=proj_pad, )
+ padding=proj_pad,
+ )
else:
self.conv1 = None
if out_c != mid_c:
@@ -49,29 +42,27 @@ def __init__(self,
out_channels=out_c,
kernel_size=proj_ksize,
stride=1,
- padding=proj_pad, )
+ padding=proj_pad,
+ )
else:
self.conv2 = None
- self.block1 = paddle.nn.Conv2D(
- in_channels=mid_c,
- out_channels=mid_c,
- kernel_size=3,
- stride=1,
- padding=1)
+ self.block1 = paddle.nn.Conv2D(in_channels=mid_c, out_channels=mid_c, kernel_size=3, stride=1, padding=1)
self.act = paddle.nn.ReLU()
self.block2 = paddle.nn.Conv2D(
in_channels=mid_c,
out_channels=mid_c,
kernel_size=ksize,
stride=1,
- padding=ps, )
+ padding=ps,
+ )
if sk is False:
self.conv_shortcut = paddle.nn.Conv2D(
in_channels=in_c,
out_channels=mid_c,
kernel_size=ksize,
stride=1,
- padding=ps, )
+ padding=ps,
+ )
else:
self.conv_shortcut = None
self.down = down
@@ -136,20 +127,20 @@ class T2IAdapter(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- block_out_channels: List[int]=[320, 640, 1280, 1280],
- block_mid_channels: Optional[List[int]]=None,
- num_res_blocks: int=3,
- channels_in: int=3,
- kernel_size: int=3,
- proj_kernel_size: int=1,
- res_block_skip: bool=True,
- use_conv: bool=False,
- input_scale_factor: int=8, ):
+ self,
+ block_out_channels: List[int] = [320, 640, 1280, 1280],
+ block_mid_channels: Optional[List[int]] = None,
+ num_res_blocks: int = 3,
+ channels_in: int = 3,
+ kernel_size: int = 3,
+ proj_kernel_size: int = 1,
+ res_block_skip: bool = True,
+ use_conv: bool = False,
+ input_scale_factor: int = 8,
+ ):
super(T2IAdapter, self).__init__()
self.num_downsample_blocks = len(block_out_channels)
- self.unshuffle = paddle.nn.PixelUnshuffle(
- downscale_factor=input_scale_factor)
+ self.unshuffle = paddle.nn.PixelUnshuffle(downscale_factor=input_scale_factor)
self.num_res_blocks = num_res_blocks
self.body = []
if block_mid_channels is None:
@@ -166,7 +157,9 @@ def __init__(
ksize=kernel_size,
proj_ksize=proj_kernel_size,
sk=res_block_skip,
- use_conv=use_conv, ))
+ use_conv=use_conv,
+ )
+ )
elif j == num_res_blocks - 1:
self.body.append(
BottleneckResnetBlock(
@@ -177,7 +170,9 @@ def __init__(
ksize=kernel_size,
proj_ksize=proj_kernel_size,
sk=res_block_skip,
- use_conv=use_conv, ))
+ use_conv=use_conv,
+ )
+ )
else:
self.body.append(
BottleneckResnetBlock(
@@ -188,7 +183,9 @@ def __init__(
ksize=kernel_size,
proj_ksize=proj_kernel_size,
sk=res_block_skip,
- use_conv=use_conv, ))
+ use_conv=use_conv,
+ )
+ )
self.body = paddle.nn.LayerList(sublayers=self.body)
if block_mid_channels[0] == block_out_channels[0]:
self.conv_in = paddle.nn.Conv2D(
@@ -196,14 +193,16 @@ def __init__(
out_channels=block_mid_channels[0],
kernel_size=3,
stride=1,
- padding=1, )
+ padding=1,
+ )
else:
self.conv_in = paddle.nn.Conv2D(
in_channels=channels_in * input_scale_factor**2,
out_channels=block_mid_channels[0],
kernel_size=proj_kernel_size,
stride=1,
- padding=proj_kernel_size // 2, )
+ padding=proj_kernel_size // 2,
+ )
def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
"""
@@ -241,9 +240,7 @@ def __init__(self, adapters: List[T2IAdapter]):
self.num_adapter = len(adapters)
self.adapters = paddle.nn.LayerList(sublayers=adapters)
- def forward(
- self, xs: paddle.Tensor,
- adapter_weights: Optional[List[float]]=None) -> List[paddle.Tensor]:
+ def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = None) -> List[paddle.Tensor]:
"""
Args:
xs (`torch.Tensor`):
@@ -254,8 +251,7 @@ def forward(
them together.
"""
if adapter_weights is None:
- adapter_weights = paddle.to_tensor([1 / self.num_adapter] *
- self.num_adapter)
+ adapter_weights = paddle.to_tensor([1 / self.num_adapter] * self.num_adapter)
else:
adapter_weights = paddle.to_tensor(adapter_weights)
if xs.shape[1] % self.num_adapter != 0:
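The adapter.py changes above keep the same `T2IAdapter` constructor and `MultiAdapter.forward` contract, only reformatted. A minimal sketch under those signatures; the conditioning tensor size is illustrative, and the classes are imported through the `models/__init__.py` re-export shown earlier:

```python
import paddle

from ppdiffusers.models import MultiAdapter, T2IAdapter

# Defaults mirror the @register_to_config signature above.
adapter = T2IAdapter(channels_in=3, block_out_channels=[320, 640, 1280, 1280], input_scale_factor=8)

# A 512x512 RGB conditioning map (sketch, depth, pose, ...), batch of one.
cond = paddle.rand([1, 3, 512, 512])
features = adapter(cond)  # list of multi-scale paddle.Tensor feature maps
print([tuple(f.shape) for f in features])

# MultiAdapter packs the per-adapter inputs along the channel axis and scales each
# adapter's features by adapter_weights before adding them together.
multi = MultiAdapter([adapter, T2IAdapter(channels_in=3)])
fused = multi(paddle.concat([cond, cond], axis=1), adapter_weights=[0.7, 0.3])
```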
diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py
index 47ae9ef9aa303..199e115810a3e 100644
--- a/ppdiffusers/ppdiffusers/models/attention.py
+++ b/ppdiffusers/ppdiffusers/models/attention.py
@@ -24,7 +24,7 @@
from .embeddings import CombinedTimestepLabelEmbeddings
-def drop_path(input, drop_prob: float=0.0, training: bool=False):
+def drop_path(input, drop_prob: float = 0.0, training: bool = False):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
@@ -37,8 +37,7 @@ def drop_path(input, drop_prob: float=0.0, training: bool=False):
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
- shape = (input.shape[0], ) + (1, ) * (
- input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = (input / keep_prob) * random_tensor
@@ -48,7 +47,7 @@ def drop_path(input, drop_prob: float=0.0, training: bool=False):
class DropPath(nn.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
- def __init__(self, drop_prob: Optional[float]=None) -> None:
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
@@ -61,12 +60,13 @@ def extra_repr(self) -> str:
class Mlp(nn.Layer):
def __init__(
- self,
- in_features,
- hidden_features=None,
- out_features=None,
- act_layer=nn.GELU,
- drop=0.0, ):
+ self,
+ in_features,
+ hidden_features=None,
+ out_features=None,
+ act_layer=nn.GELU,
+ drop=0.0,
+ ):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
@@ -103,22 +103,21 @@ class AttentionBlock(nn.Layer):
# IMPORTANT;TODO(Patrick, William) - this class will be deprecated soon. Do not use it anymore
def __init__(
- self,
- channels: int,
- num_head_channels: Optional[int]=None,
- norm_num_groups: int=32,
- rescale_output_factor: float=1.0,
- eps: float=1e-5, ):
+ self,
+ channels: int,
+ num_head_channels: Optional[int] = None,
+ norm_num_groups: int = 32,
+ rescale_output_factor: float = 1.0,
+ eps: float = 1e-5,
+ ):
super().__init__()
self.channels = channels
- self.num_heads = (channels // num_head_channels
- if num_head_channels is not None else 1)
+ self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
self.head_size = self.channels // self.num_heads
self.scale = 1 / math.sqrt(self.channels / self.num_heads)
- self.group_norm = nn.GroupNorm(
- num_channels=channels, num_groups=norm_num_groups, epsilon=eps)
+ self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, epsilon=eps)
# define q,k,v as linear layers
self.query = nn.Linear(channels, channels)
@@ -132,10 +131,7 @@ def __init__(
self._use_2_5_attn = True
self._attention_op = None
- def reshape_heads_to_batch_dim(self,
- tensor,
- transpose=True,
- merge_head_and_batch=False):
+ def reshape_heads_to_batch_dim(self, tensor, transpose=True, merge_head_and_batch=False):
tensor = tensor.reshape([0, 0, self.num_heads, self.head_size])
# currently we do not use `unmerge_head_and_batch`
if transpose or merge_head_and_batch:
@@ -145,15 +141,11 @@ def reshape_heads_to_batch_dim(self,
tensor = tensor.flatten(0, 1)
return tensor
- def reshape_batch_dim_to_heads(self,
- tensor,
- transpose=True,
- unmerge_head_and_batch=False):
+ def reshape_batch_dim_to_heads(self, tensor, transpose=True, unmerge_head_and_batch=False):
# currently we do not use `unmerge_head_and_batch`
if unmerge_head_and_batch:
seq_len = tensor.shape[1]
- tensor = tensor.reshape(
- [-1, self.num_heads, seq_len, self.head_size])
+ tensor = tensor.reshape([-1, self.num_heads, seq_len, self.head_size])
if transpose or unmerge_head_and_batch:
tensor = tensor.transpose([0, 2, 1, 3])
@@ -162,9 +154,10 @@ def reshape_batch_dim_to_heads(self,
return tensor
def set_use_memory_efficient_attention_xformers(
- self,
- use_memory_efficient_attention_xformers: bool,
- attention_op: Optional[str]=None, ):
+ self,
+ use_memory_efficient_attention_xformers: bool,
+ attention_op: Optional[str] = None,
+ ):
# remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045
# if self.head_size > 128 and attention_op == "flash":
# attention_op = "cutlass"
@@ -176,18 +169,15 @@ def set_use_memory_efficient_attention_xformers(
else:
try:
_ = F.scaled_dot_product_attention_(
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- attention_op=attention_op, )
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ attention_op=attention_op,
+ )
except Exception as e:
raise e
- self._use_memory_efficient_attention_xformers = (
- use_memory_efficient_attention_xformers)
+ self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
self._attention_op = attention_op
def forward(self, hidden_states):
@@ -197,8 +187,7 @@ def forward(self, hidden_states):
# norm
hidden_states = self.group_norm(hidden_states)
- hidden_states = hidden_states.reshape(
- [batch, channel, height * width]).transpose([0, 2, 1])
+ hidden_states = hidden_states.reshape([batch, channel, height * width]).transpose([0, 2, 1])
# proj to q, k, v
query_proj = self.query(hidden_states)
@@ -206,14 +195,14 @@ def forward(self, hidden_states):
value_proj = self.value(hidden_states)
query_proj = self.reshape_heads_to_batch_dim(
- query_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ query_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
key_proj = self.reshape_heads_to_batch_dim(
- key_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ key_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
value_proj = self.reshape_heads_to_batch_dim(
- value_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ value_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
if self._use_memory_efficient_attention_xformers:
hidden_states = F.scaled_dot_product_attention_(
@@ -224,25 +213,22 @@ def forward(self, hidden_states):
scale=self.scale,
dropout_p=0.0,
training=self.training,
- attention_op=self._attention_op, )
+ attention_op=self._attention_op,
+ )
else:
- attention_scores = (paddle.matmul(
- query_proj, key_proj, transpose_y=True) * self.scale)
- attention_probs = F.softmax(
- attention_scores.cast("float32"),
- axis=-1).cast(attention_scores.dtype)
+ attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale
+ attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype)
hidden_states = paddle.matmul(attention_probs, value_proj)
# reshape hidden_states
hidden_states = self.reshape_batch_dim_to_heads(
- hidden_states,
- transpose=not self._use_memory_efficient_attention_xformers)
+ hidden_states, transpose=not self._use_memory_efficient_attention_xformers
+ )
# compute next hidden_states
hidden_states = self.proj_attn(hidden_states)
- hidden_states = hidden_states.transpose([0, 2, 1]).reshape(
- [batch, channel, height, width])
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch, channel, height, width])
# res connect and rescale
hidden_states = (hidden_states + residual) / self.rescale_output_factor
@@ -271,31 +257,29 @@ class BasicTransformerBlock(nn.Layer):
"""
def __init__(
- self,
- dim: int,
- num_attention_heads: int,
- attention_head_dim: int,
- dropout=0.0,
- cross_attention_dim: Optional[int]=None,
- activation_fn: str="geglu",
- num_embeds_ada_norm: Optional[int]=None,
- attention_bias: bool=False,
- only_cross_attention: bool=False,
- double_self_attention: bool=False,
- upcast_attention: bool=False,
- norm_elementwise_affine: bool=True,
- norm_type: str="layer_norm",
- final_dropout: bool=False, ):
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ attention_bias: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_elementwise_affine: bool = True,
+ norm_type: str = "layer_norm",
+ final_dropout: bool = False,
+ ):
super().__init__()
self.only_cross_attention = only_cross_attention
- self.use_ada_layer_norm_zero = (
- num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
- self.use_ada_layer_norm = (
- num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
- if norm_type in ("ada_norm", "ada_norm_zero"
- ) and num_embeds_ada_norm is None:
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
raise ValueError(
f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
@@ -320,22 +304,21 @@ def __init__(
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
- cross_attention_dim=cross_attention_dim
- if only_cross_attention else None,
- upcast_attention=upcast_attention, )
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+ upcast_attention=upcast_attention,
+ )
# 2. Cross-Attn
if cross_attention_dim is not None or double_self_attention:
# We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
# I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
# the second cross attention block.
- self.norm2 = (AdaLayerNorm(dim, num_embeds_ada_norm)
- if self.use_ada_layer_norm else
- nn.LayerNorm(dim, **norm_kwargs))
+ self.norm2 = (
+ AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim, **norm_kwargs)
+ )
self.attn2 = Attention(
query_dim=dim,
- cross_attention_dim=cross_attention_dim
- if not double_self_attention else None,
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
@@ -352,46 +335,45 @@ def __init__(
dim,
dropout=dropout,
activation_fn=activation_fn,
- final_dropout=final_dropout, )
+ final_dropout=final_dropout,
+ )
def forward(
- self,
- hidden_states,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- timestep=None,
- cross_attention_kwargs=None,
- class_labels=None, ):
+ self,
+ hidden_states,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ timestep=None,
+ cross_attention_kwargs=None,
+ class_labels=None,
+ ):
# Notice that normalization is always applied before the real computation in the following blocks.
# 1. Self-Attention
if self.use_ada_layer_norm:
norm_hidden_states = self.norm1(hidden_states, timestep)
elif self.use_ada_layer_norm_zero:
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
- hidden_states,
- timestep,
- class_labels,
- hidden_dtype=hidden_states.dtype)
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
else:
norm_hidden_states = self.norm1(hidden_states)
- cross_attention_kwargs = (cross_attention_kwargs if
- cross_attention_kwargs is not None else {})
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
attn_output = self.attn1(
norm_hidden_states,
- encoder_hidden_states=encoder_hidden_states
- if self.only_cross_attention else None,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
if self.use_ada_layer_norm_zero:
attn_output = gate_msa.unsqueeze(1) * attn_output
hidden_states = attn_output + hidden_states
if self.attn2 is not None:
- norm_hidden_states = (self.norm2(hidden_states, timestep)
- if self.use_ada_layer_norm else
- self.norm2(hidden_states))
+ norm_hidden_states = (
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+ )
# TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
# prepare attention mask here
@@ -400,15 +382,15 @@ def forward(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
hidden_states = attn_output + hidden_states
# 3. Feed-forward
norm_hidden_states = self.norm3(hidden_states)
if self.use_ada_layer_norm_zero:
- norm_hidden_states = (norm_hidden_states *
- (1 + scale_mlp[:, None]) + shift_mlp[:, None])
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
ff_output = self.ff(norm_hidden_states)
@@ -434,13 +416,14 @@ class FeedForward(nn.Layer):
"""
def __init__(
- self,
- dim: int,
- dim_out: Optional[int]=None,
- mult: int=4,
- dropout: float=0.0,
- activation_fn: str="geglu",
- final_dropout: bool=False, ):
+ self,
+ dim: int,
+ dim_out: Optional[int] = None,
+ mult: int = 4,
+ dropout: float = 0.0,
+ activation_fn: str = "geglu",
+ final_dropout: bool = False,
+ ):
super().__init__()
inner_dim = int(dim * mult)
dim_out = dim_out if dim_out is not None else dim
@@ -476,7 +459,7 @@ class GELU(nn.Layer):
GELU activation function with tanh approximation support with `approximate="tanh"`.
"""
- def __init__(self, dim_in: int, dim_out: int, approximate: str="none"):
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
super().__init__()
self.proj = nn.Linear(dim_in, dim_out)
self.approximate = approximate
@@ -552,22 +535,17 @@ class AdaLayerNormZero(nn.Layer):
def __init__(self, embedding_dim, num_embeddings):
super().__init__()
- self.emb = CombinedTimestepLabelEmbeddings(num_embeddings,
- embedding_dim)
+ self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
self.silu = nn.Silu()
- self.linear = nn.Linear(
- embedding_dim, 6 * embedding_dim, bias_attr=True)
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True)
# elementwise_affine=False
norm_kwargs = {"weight_attr": False, "bias_attr": False}
self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs)
def forward(self, x, timestep, class_labels, hidden_dtype=None):
- emb = self.linear(
- self.silu(
- self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(
- 6, axis=1)
+ emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
@@ -578,12 +556,13 @@ class AdaGroupNorm(nn.Layer):
"""
def __init__(
- self,
- embedding_dim: int,
- out_dim: int,
- num_groups: int,
- act_fn: Optional[str]=None,
- eps: float=1e-5, ):
+ self,
+ embedding_dim: int,
+ out_dim: int,
+ num_groups: int,
+ act_fn: Optional[str] = None,
+ eps: float = 1e-5,
+ ):
super().__init__()
self.num_groups = num_groups
self.eps = eps
@@ -600,8 +579,7 @@ def __init__(
self.linear = nn.Linear(embedding_dim, out_dim * 2)
# elementwise_affine=False
norm_kwargs = {"weight_attr": False, "bias_attr": False}
- self.group_norm = nn.GroupNorm(
- num_groups, out_dim, epsilon=eps, **norm_kwargs)
+ self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs)
self.group_norm.weight = None
self.group_norm.bias = None
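For orientation while reading the attention.py hunks above: the `drop_path` helper (and the `DropPath` layer that wraps it) implements stochastic depth, zeroing the residual branch for a randomly chosen subset of samples and rescaling the survivors so the expected activation is unchanged. A minimal self-contained sketch of the same computation, illustrative only and not part of the patch:

import paddle

def drop_path_sketch(x: paddle.Tensor, drop_prob: float = 0.1, training: bool = True) -> paddle.Tensor:
    # same math as the reformatted helper: one Bernoulli(keep_prob) draw per sample,
    # broadcast over the remaining dims, then rescaled by 1 / keep_prob
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = paddle.floor(keep_prob + paddle.rand(mask_shape, dtype=x.dtype))
    return (x / keep_prob) * mask

x = paddle.randn([4, 77, 320])
y = drop_path_sketch(x, drop_prob=0.2)  # on average, 20% of the samples come back as all zeros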
diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py
index 506c08b6c76b0..e2c4770f3398a 100644
--- a/ppdiffusers/ppdiffusers/models/attention_processor.py
+++ b/ppdiffusers/ppdiffusers/models/attention_processor.py
@@ -40,27 +40,27 @@ class Attention(nn.Layer):
"""
def __init__(
- self,
- query_dim: int,
- cross_attention_dim: Optional[int]=None,
- heads: int=8,
- dim_head: int=64,
- dropout: float=0.0,
- bias=False,
- upcast_attention: bool=False,
- upcast_softmax: bool=False,
- cross_attention_norm: Optional[str]=None,
- cross_attention_norm_num_groups: int=32,
- added_kv_proj_dim: Optional[int]=None,
- norm_num_groups: Optional[int]=None,
- out_bias: bool=True,
- scale_qk: bool=True,
- only_cross_attention: bool=False,
- processor: Optional["AttnProcessor"]=None, ):
+ self,
+ query_dim: int,
+ cross_attention_dim: Optional[int] = None,
+ heads: int = 8,
+ dim_head: int = 64,
+ dropout: float = 0.0,
+ bias=False,
+ upcast_attention: bool = False,
+ upcast_softmax: bool = False,
+ cross_attention_norm: Optional[str] = None,
+ cross_attention_norm_num_groups: int = 32,
+ added_kv_proj_dim: Optional[int] = None,
+ norm_num_groups: Optional[int] = None,
+ out_bias: bool = True,
+ scale_qk: bool = True,
+ only_cross_attention: bool = False,
+ processor: Optional["AttnProcessor"] = None,
+ ):
super().__init__()
inner_dim = dim_head * heads
- cross_attention_dim = (cross_attention_dim if
- cross_attention_dim is not None else query_dim)
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
self.upcast_attention = upcast_attention
self.upcast_softmax = upcast_softmax
@@ -82,10 +82,7 @@ def __init__(
)
if norm_num_groups is not None:
- self.group_norm = nn.GroupNorm(
- num_channels=query_dim,
- num_groups=norm_num_groups,
- epsilon=1e-5)
+ self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, epsilon=1e-5)
else:
self.group_norm = None
@@ -107,7 +104,8 @@ def __init__(
self.norm_cross = nn.GroupNorm(
num_channels=norm_cross_num_channels,
num_groups=cross_attention_norm_num_groups,
- epsilon=1e-5, )
+ epsilon=1e-5,
+ )
else:
raise ValueError(
f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
@@ -117,10 +115,8 @@ def __init__(
if not self.only_cross_attention:
# only relevant for the `AddedKVProcessor` classes
- self.to_k = nn.Linear(
- cross_attention_dim, inner_dim, bias_attr=bias)
- self.to_v = nn.Linear(
- cross_attention_dim, inner_dim, bias_attr=bias)
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias)
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias)
else:
self.to_k = None
self.to_v = None
@@ -140,15 +136,17 @@ def __init__(
self.set_processor(processor)
def set_use_memory_efficient_attention_xformers(
- self,
- use_memory_efficient_attention_xformers: bool,
- attention_op: Optional[str]=None, ):
+ self,
+ use_memory_efficient_attention_xformers: bool,
+ attention_op: Optional[str] = None,
+ ):
is_lora = hasattr(self, "processor") and isinstance(
- self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor))
+ self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor)
+ )
is_custom_diffusion = hasattr(self, "processor") and isinstance(
self.processor,
- (CustomDiffusionAttnProcessor,
- CustomDiffusionXFormersAttnProcessor), )
+ (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor),
+ )
is_added_kv = self.added_kv_proj_dim is not None
if use_memory_efficient_attention_xformers:
# if self.added_kv_proj_dim is not None:
@@ -167,13 +165,11 @@ def set_use_memory_efficient_attention_xformers(
try:
# Make sure we can run the memory efficient attention
_ = F.scaled_dot_product_attention_(
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- attention_op=attention_op, )
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ attention_op=attention_op,
+ )
except Exception as e:
raise e
# remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045
@@ -184,7 +180,8 @@ def set_use_memory_efficient_attention_xformers(
hidden_size=self.processor.hidden_size,
cross_attention_dim=self.processor.cross_attention_dim,
rank=self.processor.rank,
- attention_op=attention_op, )
+ attention_op=attention_op,
+ )
# we must cast dtype
processor.to(dtype=self.dtype)
processor.load_dict(self.processor.state_dict())
@@ -194,13 +191,13 @@ def set_use_memory_efficient_attention_xformers(
train_q_out=self.processor.train_q_out,
hidden_size=self.processor.hidden_size,
cross_attention_dim=self.processor.cross_attention_dim,
- attention_op=attention_op, )
+ attention_op=attention_op,
+ )
# we must cast dtype
processor.to(dtype=self.dtype)
processor.load_dict(self.processor.state_dict())
elif is_added_kv:
- processor = XFormersAttnAddedKVProcessor(
- attention_op=attention_op)
+ processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
else:
processor = XFormersAttnProcessor(attention_op=attention_op)
else:
@@ -208,7 +205,8 @@ def set_use_memory_efficient_attention_xformers(
processor = LoRAAttnProcessor(
hidden_size=self.processor.hidden_size,
cross_attention_dim=self.processor.cross_attention_dim,
- rank=self.processor.rank, )
+ rank=self.processor.rank,
+ )
# we must cast dtype
processor.to(dtype=self.dtype)
processor.load_dict(self.processor.state_dict())
@@ -217,7 +215,8 @@ def set_use_memory_efficient_attention_xformers(
train_kv=self.processor.train_kv,
train_q_out=self.processor.train_q_out,
hidden_size=self.processor.hidden_size,
- cross_attention_dim=self.processor.cross_attention_dim, )
+ cross_attention_dim=self.processor.cross_attention_dim,
+ )
# we must cast dtype
processor.to(dtype=self.dtype)
processor.load_dict(self.processor.state_dict())
@@ -230,9 +229,7 @@ def set_use_memory_efficient_attention_xformers(
def set_attention_slice(self, slice_size):
if slice_size is not None and slice_size > self.sliceable_head_dim:
- raise ValueError(
- f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}."
- )
+ raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
if slice_size is not None and self.added_kv_proj_dim is not None:
processor = SlicedAttnAddedKVProcessor(slice_size)
@@ -248,22 +245,19 @@ def set_attention_slice(self, slice_size):
def set_processor(self, processor: "AttnProcessor"):
# if current processor is in `self._sub_layers` and if passed `processor` is not, we need to
# pop `processor` from `self._sub_layers`
- if (hasattr(self, "processor") and
- isinstance(self.processor, nn.Layer) and
- not isinstance(processor, nn.Layer)):
- logger.info(
- f"You are removing possibly trained weights of {self.processor} with {processor}"
- )
+ if hasattr(self, "processor") and isinstance(self.processor, nn.Layer) and not isinstance(processor, nn.Layer):
+ logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
self._sub_layers.pop("processor")
self.processor = processor
def forward(
- self,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
# The `Attention` class can call different attention processors / attention functions
# here we simply pass along all tensors to the selected processor class
# For standard processors that are defined here, `**cross_attention_kwargs` is empty
@@ -272,14 +266,14 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
def batch_to_head_dim(self, tensor, transpose=True, in_dim=4):
if in_dim == 3:
head_size = self.heads
batch_size, seq_len, dim = tensor.shape
- tensor = tensor.reshape(
- [batch_size // head_size, head_size, seq_len, dim])
+ tensor = tensor.reshape([batch_size // head_size, head_size, seq_len, dim])
if transpose:
tensor = tensor.transpose([0, 2, 1, 3])
tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]])
@@ -301,8 +295,7 @@ def get_attention_scores(self, query, key, attention_mask=None):
query = query.cast(paddle.float32)
key = key.cast(paddle.float32)
- attention_scores = paddle.matmul(
- query, key, transpose_y=True) * self.scale
+ attention_scores = paddle.matmul(query, key, transpose_y=True) * self.scale
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
@@ -317,12 +310,7 @@ def get_attention_scores(self, query, key, attention_mask=None):
return attention_probs
- def prepare_attention_mask(self,
- attention_mask,
- target_length,
- batch_size=None,
- out_dim=4,
- transpose=True):
+ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=4, transpose=True):
if batch_size is None:
deprecate(
"batch_size=None",
@@ -331,7 +319,8 @@ def prepare_attention_mask(self,
"Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
" attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
" `prepare_attention_mask` when preparing the attention_mask."
- ), )
+ ),
+ )
batch_size = 1
num_heads = self.heads
@@ -339,21 +328,15 @@ def prepare_attention_mask(self,
return attention_mask
if attention_mask.shape[-1] != target_length:
- attention_mask = F.pad(attention_mask, (0, target_length),
- value=0.0,
- data_format="NCL")
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0, data_format="NCL")
if out_dim == 3:
if attention_mask.shape[0] < batch_size * num_heads:
- attention_mask = attention_mask.repeat_interleave(
- num_heads, axis=0)
+ attention_mask = attention_mask.repeat_interleave(num_heads, axis=0)
elif out_dim == 4:
attention_mask = attention_mask.unsqueeze(1)
if attention_mask.shape[0] < batch_size * num_heads:
- attention_mask = attention_mask.repeat_interleave(
- num_heads, axis=1)
- attention_mask = paddle.reshape(
- attention_mask,
- [batch_size, num_heads, -1, attention_mask.shape[-1]])
+ attention_mask = attention_mask.repeat_interleave(num_heads, axis=1)
+ attention_mask = paddle.reshape(attention_mask, [batch_size, num_heads, -1, attention_mask.shape[-1]])
if attention_mask.ndim == 4:
if not transpose:
@@ -361,9 +344,7 @@ def prepare_attention_mask(self,
return attention_mask
def norm_encoder_hidden_states(self, encoder_hidden_states):
- assert (
- self.norm_cross is not None
- ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+ assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
if isinstance(self.norm_cross, nn.LayerNorm):
encoder_hidden_states = self.norm_cross(encoder_hidden_states)
@@ -384,24 +365,23 @@ def norm_encoder_hidden_states(self, encoder_hidden_states):
class AttnProcessor:
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -427,9 +407,7 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None):
super().__init__()
if rank > min(in_features, out_features):
- raise ValueError(
- f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}"
- )
+ raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
self.down = nn.Linear(in_features, rank, bias_attr=False)
self.up = nn.Linear(rank, out_features, bias_attr=False)
@@ -469,39 +447,31 @@ class LoRAAttnProcessor(nn.Layer):
Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
"""
- def __init__(self,
- hidden_size,
- cross_attention_dim=None,
- rank=4,
- network_alpha=None):
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
super().__init__()
self.hidden_size = hidden_size
self.cross_attention_dim = cross_attention_dim
self.rank = rank
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank,
- network_alpha)
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size,
- hidden_size, rank, network_alpha)
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size,
- hidden_size, rank, network_alpha)
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank,
- network_alpha)
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- scale=1.0,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ scale=1.0,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
query = attn.head_to_batch_dim(query)
@@ -509,13 +479,10 @@ def __call__(
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(
- encoder_hidden_states)
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(
- encoder_hidden_states)
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
@@ -525,8 +492,7 @@ def __call__(
hidden_states = attn.batch_to_head_dim(hidden_states)
# linear proj
- hidden_states = attn.to_out[0](
- hidden_states) + scale * self.to_out_lora(hidden_states)
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
@@ -535,13 +501,14 @@ def __call__(
class CustomDiffusionAttnProcessor(nn.Layer):
def __init__(
- self,
- train_kv=True,
- train_q_out=True,
- hidden_size=None,
- cross_attention_dim=None,
- out_bias=True,
- dropout=0.0, ):
+ self,
+ train_kv=True,
+ train_q_out=True,
+ hidden_size=None,
+ cross_attention_dim=None,
+ out_bias=True,
+ dropout=0.0,
+ ):
super().__init__()
self.train_kv = train_kv
self.train_q_out = train_q_out
@@ -551,35 +518,26 @@ def __init__(
# `_custom_diffusion` id for easy serialization and loading.
if self.train_kv:
- self.to_k_custom_diffusion = nn.Linear(
- cross_attention_dim or hidden_size,
- hidden_size,
- bias_attr=False)
- self.to_v_custom_diffusion = nn.Linear(
- cross_attention_dim or hidden_size,
- hidden_size,
- bias_attr=False)
+ self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False)
+ self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False)
if self.train_q_out:
- self.to_q_custom_diffusion = nn.Linear(
- hidden_size, hidden_size, bias_attr=False)
+ self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False)
self.to_out_custom_diffusion = nn.LayerList([])
- self.to_out_custom_diffusion.append(
- nn.Linear(
- hidden_size, hidden_size, bias_attr=out_bias))
+ self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias))
self.to_out_custom_diffusion.append(nn.Dropout(dropout))
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
if self.train_q_out:
query = self.to_q_custom_diffusion(hidden_states)
else:
@@ -591,8 +549,7 @@ def __call__(
else:
crossattn = True
if attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
if self.train_kv:
key = self.to_k_custom_diffusion(encoder_hidden_states)
@@ -631,40 +588,35 @@ def __call__(
class AttnAddedKVProcessor:
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
residual = hidden_states
- hidden_states = hidden_states.reshape(
- [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
- [0, 2, 1])
+ hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
+ [0, 2, 1]
+ )
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- hidden_states = attn.group_norm(hidden_states.transpose(
- [0, 2, 1])).transpose([0, 2, 1])
+ hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
query = attn.to_q(hidden_states)
query = attn.head_to_batch_dim(query)
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
- encoder_hidden_states_value_proj = attn.add_v_proj(
- encoder_hidden_states)
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(
- encoder_hidden_states_key_proj)
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(
- encoder_hidden_states_value_proj)
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
if not attn.only_cross_attention:
key = attn.to_k(hidden_states)
@@ -672,8 +624,7 @@ def __call__(
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2)
- value = paddle.concat(
- [encoder_hidden_states_value_proj, value], axis=2)
+ value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2)
else:
key = encoder_hidden_states_key_proj
value = encoder_hidden_states_value_proj
@@ -687,53 +638,47 @@ def __call__(
# dropout
hidden_states = attn.to_out[1](hidden_states)
- hidden_states = hidden_states.transpose(
- [0, 2, 1]).reshape(residual.shape)
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
hidden_states = hidden_states + residual
return hidden_states
class XFormersAttnAddedKVProcessor:
- def __init__(self, attention_op: Optional[str]=None):
+ def __init__(self, attention_op: Optional[str] = None):
assert attention_op in [None, "auto", "cutlass", "flash"]
self.attention_op = attention_op
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
residual = hidden_states
- hidden_states = hidden_states.reshape(
- [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
- [0, 2, 1])
+ hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
+ [0, 2, 1]
+ )
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size, transpose=False)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- hidden_states = attn.group_norm(hidden_states.transpose(
- [0, 2, 1])).transpose([0, 2, 1])
+ hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
query = attn.to_q(hidden_states)
query = attn.head_to_batch_dim(query, transpose=False)
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
- encoder_hidden_states_value_proj = attn.add_v_proj(
- encoder_hidden_states)
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(
- encoder_hidden_states_key_proj, transpose=False)
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(
- encoder_hidden_states_value_proj, transpose=False)
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, transpose=False)
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, transpose=False)
if not attn.only_cross_attention:
key = attn.to_k(hidden_states)
@@ -741,8 +686,7 @@ def __call__(
key = attn.head_to_batch_dim(key, transpose=False)
value = attn.head_to_batch_dim(value, transpose=False)
key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1)
- value = paddle.concat(
- [encoder_hidden_states_value_proj, value], axis=1)
+ value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1)
else:
key = encoder_hidden_states_key_proj
value = encoder_hidden_states_value_proj
@@ -755,7 +699,8 @@ def __call__(
scale=attn.scale,
dropout_p=0.0,
training=attn.training,
- attention_op=self.attention_op, )
+ attention_op=self.attention_op,
+ )
hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
# linear proj
@@ -763,39 +708,37 @@ def __call__(
# dropout
hidden_states = attn.to_out[1](hidden_states)
- hidden_states = hidden_states.transpose(
- [0, 2, 1]).reshape(residual.shape)
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
hidden_states = hidden_states + residual
return hidden_states
class XFormersAttnProcessor:
- def __init__(self, attention_op: Optional[str]=None):
+ def __init__(self, attention_op: Optional[str] = None):
assert attention_op in [None, "auto", "cutlass", "flash"]
self.attention_op = attention_op
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
-
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size, transpose=False)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -813,7 +756,8 @@ def __call__(
scale=attn.scale,
dropout_p=0.0,
training=attn.training,
- attention_op=self.attention_op, )
+ attention_op=self.attention_op,
+ )
# hidden_states = hidden_states.cast(query.dtype)
hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
@@ -847,12 +791,13 @@ class LoRAXFormersAttnProcessor(nn.Layer):
"""
def __init__(
- self,
- hidden_size,
- cross_attention_dim,
- rank=4,
- attention_op: Optional[str]=None,
- network_alpha=None, ):
+ self,
+ hidden_size,
+ cross_attention_dim,
+ rank=4,
+ attention_op: Optional[str] = None,
+ network_alpha=None,
+ ):
super().__init__()
self.hidden_size = hidden_size
@@ -860,28 +805,24 @@ def __init__(
self.rank = rank
self.attention_op = attention_op
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank,
- network_alpha)
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size,
- hidden_size, rank, network_alpha)
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size,
- hidden_size, rank, network_alpha)
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank,
- network_alpha)
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- scale=1.0,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size, transpose=False)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ scale=1.0,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
query = attn.head_to_batch_dim(query, transpose=False)
@@ -889,13 +830,10 @@ def __call__(
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(
- encoder_hidden_states)
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(
- encoder_hidden_states)
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
key = attn.head_to_batch_dim(key, transpose=False)
value = attn.head_to_batch_dim(value, transpose=False)
@@ -908,13 +846,13 @@ def __call__(
scale=attn.scale,
dropout_p=0.0,
training=attn.training,
- attention_op=self.attention_op, )
+ attention_op=self.attention_op,
+ )
hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
# linear proj
- hidden_states = attn.to_out[0](
- hidden_states) + scale * self.to_out_lora(hidden_states)
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
@@ -923,14 +861,15 @@ def __call__(
class CustomDiffusionXFormersAttnProcessor(nn.Layer):
def __init__(
- self,
- train_kv=True,
- train_q_out=False,
- hidden_size=None,
- cross_attention_dim=None,
- out_bias=True,
- dropout=0.0,
- attention_op: Optional[str]=None, ):
+ self,
+ train_kv=True,
+ train_q_out=False,
+ hidden_size=None,
+ cross_attention_dim=None,
+ out_bias=True,
+ dropout=0.0,
+ attention_op: Optional[str] = None,
+ ):
super().__init__()
assert attention_op in [None, "auto", "cutlass", "flash"]
self.train_kv = train_kv
@@ -942,36 +881,27 @@ def __init__(
# `_custom_diffusion` id for easy serialization and loading.
if self.train_kv:
- self.to_k_custom_diffusion = nn.Linear(
- cross_attention_dim or hidden_size,
- hidden_size,
- bias_attr=False)
- self.to_v_custom_diffusion = nn.Linear(
- cross_attention_dim or hidden_size,
- hidden_size,
- bias_attr=False)
+ self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False)
+ self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False)
if self.train_q_out:
- self.to_q_custom_diffusion = nn.Linear(
- hidden_size, hidden_size, bias_attr=False)
+ self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False)
self.to_out_custom_diffusion = nn.LayerList([])
- self.to_out_custom_diffusion.append(
- nn.Linear(
- hidden_size, hidden_size, bias_attr=out_bias))
+ self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias))
self.to_out_custom_diffusion.append(nn.Dropout(dropout))
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
-
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size, transpose=False)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
if self.train_q_out:
query = self.to_q_custom_diffusion(hidden_states)
@@ -984,8 +914,7 @@ def __call__(
else:
crossattn = True
if attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
if self.train_kv:
key = self.to_k_custom_diffusion(encoder_hidden_states)
@@ -1013,7 +942,8 @@ def __call__(
scale=attn.scale,
dropout_p=0.0,
training=attn.training,
- attention_op=self.attention_op, )
+ attention_op=self.attention_op,
+ )
# hidden_states = hidden_states.cast(query.dtype)
hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
@@ -1035,17 +965,17 @@ def __init__(self, slice_size):
self.slice_size = slice_size
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
- batch_size, sequence_length, _ = (hidden_states.shape
- if encoder_hidden_states is None else
- encoder_hidden_states.shape)
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size, out_dim=3)
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=3)
query = attn.to_q(hidden_states)
query = attn.head_to_batch_dim(query)
@@ -1053,8 +983,7 @@ def __call__(
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -1067,27 +996,23 @@ def __call__(
batch_size_attention = query.shape[0]
query_len = query.shape[1]
- hidden_states = paddle.zeros(
- (batch_size_attention, query_len, attn.head_dim), dtype=query.dtype)
+ hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype)
for i in range(batch_size_attention // self.slice_size):
start_idx = i * self.slice_size
end_idx = (i + 1) * self.slice_size
query_slice = query[start_idx:end_idx]
key_slice = key[start_idx:end_idx]
- attn_mask_slice = (attention_mask[start_idx:end_idx]
- if attention_mask is not None else None)
+ attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
- attn_slice = attn.get_attention_scores(query_slice, key_slice,
- attn_mask_slice)
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx])
hidden_states[start_idx:end_idx] = attn_slice
# reshape back to [bs, num_heads, seqlen, head_dim]
- hidden_states = hidden_states.reshape(
- [-1, attn.heads, query_len, attn.head_dim])
+ hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim])
hidden_states = attn.batch_to_head_dim(hidden_states)
# linear proj
@@ -1103,42 +1028,37 @@ def __init__(self, slice_size):
self.slice_size = slice_size
def __call__(
- self,
- attn: "Attention",
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- **cross_attention_kwargs, ):
+ self,
+ attn: "Attention",
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ **cross_attention_kwargs,
+ ):
residual = hidden_states
- hidden_states = hidden_states.reshape(
- [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
- [0, 2, 1])
+ hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
+ [0, 2, 1]
+ )
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size, out_dim=3)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=3)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- hidden_states = attn.group_norm(hidden_states.transpose(
- [0, 2, 1])).transpose([0, 2, 1])
+ hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
query = attn.to_q(hidden_states)
query = attn.head_to_batch_dim(query)
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
- encoder_hidden_states_value_proj = attn.add_v_proj(
- encoder_hidden_states)
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(
- encoder_hidden_states_key_proj)
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(
- encoder_hidden_states_value_proj)
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
if not attn.only_cross_attention:
key = attn.to_k(hidden_states)
@@ -1146,8 +1066,7 @@ def __call__(
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2)
- value = paddle.concat(
- [encoder_hidden_states_value_proj, value], axis=2)
+ value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2)
else:
key = encoder_hidden_states_key_proj
value = encoder_hidden_states_value_proj
@@ -1159,8 +1078,7 @@ def __call__(
batch_size_attention = query.shape[0]
query_len = query.shape[1]
- hidden_states = paddle.zeros(
- (batch_size_attention, query_len, attn.head_dim), dtype=query.dtype)
+ hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype)
for i in range(batch_size_attention // self.slice_size):
start_idx = i * self.slice_size
@@ -1168,19 +1086,16 @@ def __call__(
query_slice = query[start_idx:end_idx]
key_slice = key[start_idx:end_idx]
- attn_mask_slice = (attention_mask[start_idx:end_idx]
- if attention_mask is not None else None)
+ attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
- attn_slice = attn.get_attention_scores(query_slice, key_slice,
- attn_mask_slice)
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx])
hidden_states[start_idx:end_idx] = attn_slice
# reshape back to [bs, num_heads, seqlen, head_dim]
- hidden_states = hidden_states.reshape(
- [-1, attn.heads, query_len, attn.head_dim])
+ hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim])
hidden_states = attn.batch_to_head_dim(hidden_states)
@@ -1189,8 +1104,7 @@ def __call__(
# dropout
hidden_states = attn.to_out[1](hidden_states)
- hidden_states = hidden_states.transpose(
- [0, 2, 1]).reshape(residual.shape)
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
hidden_states = hidden_states + residual
return hidden_states
@@ -1200,9 +1114,17 @@ def __call__(
AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor
LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor
AttentionProcessor = Union[
- AttnProcessor, AttnProcessor2_5, XFormersAttnProcessor, SlicedAttnProcessor,
- AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5,
- XFormersAttnAddedKVProcessor, LoRAAttnProcessor, LoRAXFormersAttnProcessor,
- LoRAAttnProcessor2_5, CustomDiffusionAttnProcessor,
+ AttnProcessor,
+ AttnProcessor2_5,
+ XFormersAttnProcessor,
+ SlicedAttnProcessor,
+ AttnAddedKVProcessor,
+ SlicedAttnAddedKVProcessor,
+ AttnAddedKVProcessor2_5,
+ XFormersAttnAddedKVProcessor,
+ LoRAAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_5,
+ CustomDiffusionAttnProcessor,
CustomDiffusionXFormersAttnProcessor,
]
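The attention_processor.py hunks above only restructure code; the processor-swapping API is unchanged. As a rough usage sketch (not part of the patch; the import path, constructor arguments, and default-processor behavior are assumed from the file shown rather than verified against the package):

import paddle
from ppdiffusers.models.attention_processor import Attention, XFormersAttnProcessor

# a standalone cross-attention layer using the default processor
attn = Attention(query_dim=320, cross_attention_dim=768, heads=8, dim_head=40)

hidden_states = paddle.randn([1, 4096, 320])        # image tokens
encoder_hidden_states = paddle.randn([1, 77, 768])  # text tokens
out = attn(hidden_states, encoder_hidden_states=encoder_hidden_states)  # -> [1, 4096, 320]

# the memory-efficient path is opted into by swapping processors, e.g. on a GPU build:
# attn.set_processor(XFormersAttnProcessor(attention_op="auto"))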
diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
index 3d3b531d927e3..69d1b0fb98bb2 100644
--- a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
+++ b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
@@ -69,29 +69,30 @@ class AutoencoderKL(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- in_channels: int=3,
- out_channels: int=3,
- down_block_types: Tuple[str]=("DownEncoderBlock2D", ),
- down_block_out_channels: Tuple[int]=None,
- up_block_types: Tuple[str]=("UpDecoderBlock2D", ),
- up_block_out_channels: Tuple[int]=None,
- block_out_channels: Tuple[int]=(64, ),
- layers_per_block: int=1,
- act_fn: str="silu",
- latent_channels: int=4,
- norm_num_groups: int=32,
- sample_size: int=32,
- scaling_factor: float=0.18215, ):
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+ down_block_out_channels: Tuple[int] = None,
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+ up_block_out_channels: Tuple[int] = None,
+ block_out_channels: Tuple[int] = (64,),
+ layers_per_block: int = 1,
+ act_fn: str = "silu",
+ latent_channels: int = 4,
+ norm_num_groups: int = 32,
+ sample_size: int = 32,
+ scaling_factor: float = 0.18215,
+ ):
super().__init__()
# if down_block_out_channels is not given, we will use block_out_channels
- _down_block_out_channels = (self.config.block_out_channels
- if down_block_out_channels is None else
- self.config.down_block_out_channels)
+ _down_block_out_channels = (
+ self.config.block_out_channels if down_block_out_channels is None else self.config.down_block_out_channels
+ )
# if up_block_out_channels is not given, we will use block_out_channels
- _up_block_out_channels = (self.config.block_out_channels
- if up_block_out_channels is None else
- self.config.up_block_out_channels)
+ _up_block_out_channels = (
+ self.config.block_out_channels if up_block_out_channels is None else self.config.up_block_out_channels
+ )
# pass init params to Encoder
self.encoder = Encoder(
@@ -102,7 +103,8 @@ def __init__(
layers_per_block=layers_per_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
- double_z=True, )
+ double_z=True,
+ )
# pass init params to Decoder
self.decoder = Decoder(
@@ -112,7 +114,8 @@ def __init__(
block_out_channels=_up_block_out_channels,
layers_per_block=layers_per_block,
norm_num_groups=norm_num_groups,
- act_fn=act_fn, )
+ act_fn=act_fn,
+ )
self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1)
self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1)
@@ -122,18 +125,19 @@ def __init__(
# only relevant if vae tiling is enabled
self.tile_sample_min_size = self.config.sample_size
- sample_size = (self.config.sample_size[0]
- if isinstance(self.config.sample_size, (list, tuple))
- else self.config.sample_size)
- self.tile_latent_min_size = int(sample_size /
- (2**(len(_up_block_out_channels) - 1)))
+ sample_size = (
+ self.config.sample_size[0]
+ if isinstance(self.config.sample_size, (list, tuple))
+ else self.config.sample_size
+ )
+ self.tile_latent_min_size = int(sample_size / (2 ** (len(_up_block_out_channels) - 1)))
self.tile_overlap_factor = 0.25
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (Encoder, Decoder)):
module.gradient_checkpointing = value
- def enable_tiling(self, use_tiling: bool=True):
+ def enable_tiling(self, use_tiling: bool = True):
r"""
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful to save a large amount of memory and to allow
@@ -163,12 +167,10 @@ def disable_slicing(self):
self.use_slicing = False
@apply_forward_hook
- def encode(self, x: paddle.Tensor,
- return_dict: bool=True) -> AutoencoderKLOutput:
+ def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
# TODO junnyu, support float16
x = x.cast(self.encoder.conv_in.weight.dtype)
- if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or
- x.shape[-2] > self.tile_sample_min_size):
+ if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
return self.tiled_encode(x, return_dict=return_dict)
h = self.encoder(x)
@@ -176,57 +178,49 @@ def encode(self, x: paddle.Tensor,
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
- return (posterior, )
+ return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
- def _decode(self, z: paddle.Tensor,
- return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]:
- if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or
- z.shape[-2] > self.tile_latent_min_size):
+ def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
+ if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
return self.tiled_decode(z, return_dict=return_dict)
z = self.post_quant_conv(z)
dec = self.decoder(z)
if not return_dict:
- return (dec, )
+ return (dec,)
return DecoderOutput(sample=dec)
@apply_forward_hook
- def decode(self, z: paddle.Tensor,
- return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]:
+ def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
# TODO junnyu, add this to support pure fp16
z = z.cast(self.post_quant_conv.weight.dtype)
if self.use_slicing and z.shape[0] > 1:
            # split/chunk: paddle and pytorch may differ slightly here
- decoded_slices = [
- self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])
- ]
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])]
decoded = paddle.concat(decoded_slices)
else:
decoded = self._decode(z).sample
if not return_dict:
- return (decoded, )
+ return (decoded,)
return DecoderOutput(sample=decoded)
def blend_v(self, a, b, blend_extent):
for y in range(min(a.shape[2], b.shape[2], blend_extent)):
- b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (
- 1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
+ b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
return b
def blend_h(self, a, b, blend_extent):
for x in range(min(a.shape[3], b.shape[3], blend_extent)):
- b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (
- 1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
+ b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
return b
- def tiled_encode(self, x: paddle.Tensor,
- return_dict: bool=True) -> AutoencoderKLOutput:
+ def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
r"""Encode a batch of images using a tiled encoder.
Args:
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
@@ -237,8 +231,7 @@ def tiled_encode(self, x: paddle.Tensor,
x (`paddle.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`AutoencoderKLOutput`] instead of a plain tuple.
"""
- overlap_size = int(self.tile_sample_min_size *
- (1 - self.tile_overlap_factor))
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
row_limit = self.tile_latent_min_size - blend_extent
@@ -247,8 +240,12 @@ def tiled_encode(self, x: paddle.Tensor,
for i in range(0, x.shape[2], overlap_size):
row = []
for j in range(0, x.shape[3], overlap_size):
- tile = x[:, :, i:i + self.tile_sample_min_size, j:j +
- self.tile_sample_min_size, ]
+ tile = x[
+ :,
+ :,
+ i : i + self.tile_sample_min_size,
+ j : j + self.tile_sample_min_size,
+ ]
tile = self.encoder(tile)
tile = self.quant_conv(tile)
row.append(tile)
@@ -270,13 +267,11 @@ def tiled_encode(self, x: paddle.Tensor,
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
- return (posterior, )
+ return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
- def tiled_decode(
- self, z: paddle.Tensor,
- return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]:
+ def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
r"""Decode a batch of images using a tiled decoder.
Args:
When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several
@@ -288,8 +283,7 @@ def tiled_decode(
`True`):
Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
"""
- overlap_size = int(self.tile_latent_min_size *
- (1 - self.tile_overlap_factor))
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
row_limit = self.tile_sample_min_size - blend_extent
@@ -299,8 +293,12 @@ def tiled_decode(
for i in range(0, z.shape[2], overlap_size):
row = []
for j in range(0, z.shape[3], overlap_size):
- tile = z[:, :, i:i + self.tile_latent_min_size, j:j +
- self.tile_latent_min_size, ]
+ tile = z[
+ :,
+ :,
+ i : i + self.tile_latent_min_size,
+ j : j + self.tile_latent_min_size,
+ ]
tile = self.post_quant_conv(tile)
decoded = self.decoder(tile)
row.append(decoded)
@@ -320,17 +318,17 @@ def tiled_decode(
dec = paddle.concat(result_rows, axis=2)
if not return_dict:
- return (dec, )
+ return (dec,)
return DecoderOutput(sample=dec)
def forward(
- self,
- sample: paddle.Tensor,
- sample_posterior: bool=False,
- return_dict: bool=True,
- generator: Optional[paddle.Generator]=None, ) -> Union[
- DecoderOutput, paddle.Tensor]:
+ self,
+ sample: paddle.Tensor,
+ sample_posterior: bool = False,
+ return_dict: bool = True,
+ generator: Optional[paddle.Generator] = None,
+ ) -> Union[DecoderOutput, paddle.Tensor]:
r"""
Args:
sample (`paddle.Tensor`): Input sample.
@@ -348,6 +346,6 @@ def forward(
dec = self.decode(z).sample
if not return_dict:
- return (dec, )
+ return (dec,)
return DecoderOutput(sample=dec)
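The tiled encode/decode path above stitches overlapping tiles back together by cross-fading the overlap region with blend_v/blend_h. Below is a minimal NumPy sketch of the blend_v weighting; the tile shapes and blend_extent=4 are invented for illustration and are not taken from this diff.

# Illustrative sketch of the blend_v cross-fade used by the tiled VAE above.
import numpy as np

def blend_v(a, b, blend_extent):
    # a, b: tiles shaped (batch, channels, height, width); fade a's bottom rows into b's top rows
    for y in range(min(a.shape[2], b.shape[2], blend_extent)):
        b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
    return b

a = np.zeros((1, 1, 8, 4))  # upper tile (all zeros); shape chosen only for the demo
b = np.ones((1, 1, 8, 4))   # lower tile (all ones)
print(blend_v(a, b, blend_extent=4)[0, 0, :4, 0])  # [0. 0.25 0.5 0.75] -- linear ramp across the overlap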
diff --git a/ppdiffusers/ppdiffusers/models/controlnet.py b/ppdiffusers/ppdiffusers/models/controlnet.py
index 6662f2904992c..2ac640f58f21e 100644
--- a/ppdiffusers/ppdiffusers/models/controlnet.py
+++ b/ppdiffusers/ppdiffusers/models/controlnet.py
@@ -25,8 +25,12 @@
from .attention_processor import AttentionProcessor, AttnProcessor
from .embeddings import TimestepEmbedding, Timesteps
from .modeling_utils import ModelMixin
-from .unet_2d_blocks import (CrossAttnDownBlock2D, DownBlock2D,
- UNetMidBlock2DCrossAttn, get_down_block)
+from .unet_2d_blocks import (
+ CrossAttnDownBlock2D,
+ DownBlock2D,
+ UNetMidBlock2DCrossAttn,
+ get_down_block,
+)
from .unet_2d_condition import UNet2DConditionModel
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -54,37 +58,31 @@ class ControlNetConditioningEmbedding(nn.Layer):
"""
def __init__(
- self,
- conditioning_embedding_channels: int,
- conditioning_channels: int=3,
- block_out_channels: Tuple[int]=(16, 32, 96, 256), ):
+ self,
+ conditioning_embedding_channels: int,
+ conditioning_channels: int = 3,
+ block_out_channels: Tuple[int] = (16, 32, 96, 256),
+ ):
super().__init__()
- self.conv_in = nn.Conv2D(
- conditioning_channels,
- block_out_channels[0],
- kernel_size=3,
- padding=1)
+ self.conv_in = nn.Conv2D(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
self.blocks = nn.LayerList([])
for i in range(len(block_out_channels) - 1):
channel_in = block_out_channels[i]
channel_out = block_out_channels[i + 1]
- self.blocks.append(
- nn.Conv2D(
- channel_in, channel_in, kernel_size=3, padding=1))
- self.blocks.append(
- nn.Conv2D(
- channel_in, channel_out, kernel_size=3, padding=1,
- stride=2))
+ self.blocks.append(nn.Conv2D(channel_in, channel_in, kernel_size=3, padding=1))
+ self.blocks.append(nn.Conv2D(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
self.conv_out = zero_module(
nn.Conv2D(
block_out_channels[-1],
conditioning_embedding_channels,
kernel_size=3,
- padding=1, ))
+ padding=1,
+ )
+ )
def forward(self, conditioning):
embedding = self.conv_in(conditioning)
@@ -104,36 +102,37 @@ class ControlNetModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- in_channels: int=4,
- flip_sin_to_cos: bool=True,
- freq_shift: int=0,
- down_block_types: Tuple[str]=(
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "DownBlock2D", ),
- only_cross_attention: Union[bool, Tuple[bool]]=False,
- block_out_channels: Tuple[int]=(320, 640, 1280, 1280),
- layers_per_block: int=2,
- downsample_padding: int=1,
- mid_block_scale_factor: float=1,
- act_fn: str="silu",
- norm_num_groups: Optional[int]=32,
- norm_eps: float=1e-5,
- cross_attention_dim: int=1280,
- attention_head_dim: Union[int, Tuple[int]]=8,
- use_linear_projection: bool=False,
- class_embed_type: Optional[str]=None,
- num_class_embeds: Optional[int]=None,
- upcast_attention: bool=False,
- resnet_time_scale_shift: str="default",
- projection_class_embeddings_input_dim: Optional[int]=None,
- controlnet_conditioning_channel_order: str="rgb",
- conditioning_embedding_out_channels: Optional[Tuple[int]]=(16, 32,
- 96, 256),
- global_pool_conditions: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int = 4,
+ flip_sin_to_cos: bool = True,
+ freq_shift: int = 0,
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "DownBlock2D",
+ ),
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: int = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: Optional[int] = 32,
+ norm_eps: float = 1e-5,
+ cross_attention_dim: int = 1280,
+ attention_head_dim: Union[int, Tuple[int]] = 8,
+ use_linear_projection: bool = False,
+ class_embed_type: Optional[str] = None,
+ num_class_embeds: Optional[int] = None,
+ upcast_attention: bool = False,
+ resnet_time_scale_shift: str = "default",
+ projection_class_embeddings_input_dim: Optional[int] = None,
+ controlnet_conditioning_channel_order: str = "rgb",
+ conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
+ global_pool_conditions: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
# Check inputs
@@ -142,16 +141,12 @@ def __init__(
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- only_cross_attention,
- bool) and len(only_cross_attention) != len(down_block_types):
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- attention_head_dim,
- int) and len(attention_head_dim) != len(down_block_types):
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
)
@@ -163,27 +158,26 @@ def __init__(
in_channels,
block_out_channels[0],
kernel_size=conv_in_kernel,
- padding=conv_in_padding, )
+ padding=conv_in_padding,
+ )
# time
time_embed_dim = block_out_channels[0] * 4
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos,
- freq_shift)
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
timestep_input_dim = block_out_channels[0]
self.time_embedding = TimestepEmbedding(
timestep_input_dim,
time_embed_dim,
- act_fn=act_fn, )
+ act_fn=act_fn,
+ )
# class embedding
if class_embed_type is None and num_class_embeds is not None:
- self.class_embedding = nn.Embedding(num_class_embeds,
- time_embed_dim)
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
elif class_embed_type == "timestep":
- self.class_embedding = TimestepEmbedding(timestep_input_dim,
- time_embed_dim)
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
elif class_embed_type == "identity":
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
elif class_embed_type == "projection":
@@ -198,25 +192,24 @@ def __init__(
# Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
# When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
# As a result, `TimestepEmbedding` can be passed arbitrary vectors.
- self.class_embedding = TimestepEmbedding(
- projection_class_embeddings_input_dim, time_embed_dim)
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
else:
self.class_embedding = None
# control net conditioning embedding
self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
conditioning_embedding_channels=block_out_channels[0],
- block_out_channels=conditioning_embedding_out_channels, )
+ block_out_channels=conditioning_embedding_out_channels,
+ )
self.down_blocks = nn.LayerList([])
self.controlnet_down_blocks = nn.LayerList([])
if isinstance(only_cross_attention, bool):
- only_cross_attention = [only_cross_attention] * len(
- down_block_types)
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
if isinstance(attention_head_dim, int):
- attention_head_dim = (attention_head_dim, ) * len(down_block_types)
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
# pre_temb_act_fun opt
self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity
@@ -233,8 +226,7 @@ def __init__(
# down
output_channel = block_out_channels[0]
- controlnet_block = nn.Conv2D(
- output_channel, output_channel, kernel_size=1)
+ controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1)
controlnet_block = zero_module(controlnet_block)
self.controlnet_down_blocks.append(controlnet_block)
@@ -260,27 +252,24 @@ def __init__(
only_cross_attention=only_cross_attention[i],
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=self.
- resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity,
+ )
self.down_blocks.append(down_block)
for _ in range(layers_per_block):
- controlnet_block = nn.Conv2D(
- output_channel, output_channel, kernel_size=1)
+ controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1)
controlnet_block = zero_module(controlnet_block)
self.controlnet_down_blocks.append(controlnet_block)
if not is_final_block:
- controlnet_block = nn.Conv2D(
- output_channel, output_channel, kernel_size=1)
+ controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1)
controlnet_block = zero_module(controlnet_block)
self.controlnet_down_blocks.append(controlnet_block)
# mid
mid_block_channel = block_out_channels[-1]
- controlnet_block = nn.Conv2D(
- mid_block_channel, mid_block_channel, kernel_size=1)
+ controlnet_block = nn.Conv2D(mid_block_channel, mid_block_channel, kernel_size=1)
controlnet_block = zero_module(controlnet_block)
self.controlnet_mid_block = controlnet_block
@@ -296,16 +285,17 @@ def __init__(
resnet_groups=norm_num_groups,
use_linear_projection=use_linear_projection,
upcast_attention=upcast_attention,
- resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity,
+ )
@classmethod
def from_unet(
- cls,
- unet: UNet2DConditionModel,
- controlnet_conditioning_channel_order: str="rgb",
- conditioning_embedding_out_channels: Optional[Tuple[int]]=(16, 32,
- 96, 256),
- load_weights_from_unet: bool=True, ):
+ cls,
+ unet: UNet2DConditionModel,
+ controlnet_conditioning_channel_order: str = "rgb",
+ conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
+ load_weights_from_unet: bool = True,
+ ):
r"""
Instantiate Controlnet class from UNet2DConditionModel.
Parameters:
@@ -333,22 +323,19 @@ def from_unet(
num_class_embeds=unet.config.num_class_embeds,
upcast_attention=unet.config.upcast_attention,
resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
- projection_class_embeddings_input_dim=unet.config.
- projection_class_embeddings_input_dim,
+ projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
- resnet_pre_temb_non_linearity=unet.config.
- resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=unet.config.resnet_pre_temb_non_linearity,
+ )
if load_weights_from_unet:
controlnet.conv_in.load_dict(unet.conv_in.state_dict())
controlnet.time_proj.load_dict(unet.time_proj.state_dict())
- controlnet.time_embedding.load_dict(unet.time_embedding.state_dict(
- ))
+ controlnet.time_embedding.load_dict(unet.time_embedding.state_dict())
if controlnet.class_embedding:
- controlnet.class_embedding.load_dict(
- unet.class_embedding.state_dict())
+ controlnet.class_embedding.load_dict(unet.class_embedding.state_dict())
controlnet.down_blocks.load_dict(unet.down_blocks.state_dict())
controlnet.mid_block.load_dict(unet.mid_block.state_dict())
@@ -365,16 +352,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
# set recursively
processors = {}
- def fn_recursive_add_processors(
- name: str,
- module: nn.Layer,
- processors: Dict[str, AttentionProcessor]):
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
if hasattr(module, "set_processor"):
processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
- fn_recursive_add_processors(f"{name}.{sub_name}", child,
- processors)
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
@@ -383,9 +366,7 @@ def fn_recursive_add_processors(
return processors
- def set_attn_processor(self,
- processor: Union[AttentionProcessor, Dict[
- str, AttentionProcessor]]):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Parameters:
`processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
@@ -409,8 +390,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
- fn_recursive_attn_processor(f"{name}.{sub_name}", child,
- processor)
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
@@ -457,8 +437,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
# make smallest slice possible
slice_size = num_sliceable_layers * [1]
- slice_size = (num_sliceable_layers * [slice_size]
- if not isinstance(slice_size, list) else slice_size)
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
if len(slice_size) != len(sliceable_head_dims):
raise ValueError(
@@ -470,14 +449,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
size = slice_size[i]
dim = sliceable_head_dims[i]
if size is not None and size > dim:
- raise ValueError(
- f"size {size} has to be smaller or equal to {dim}.")
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
# Recursively walk through all the children.
# Any children which exposes the set_attention_slice method
# gets the message
- def fn_recursive_set_attention_slice(module: nn.Layer,
- slice_size: List[int]):
+ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
if hasattr(module, "set_attention_slice"):
module.set_attention_slice(slice_size.pop())
@@ -493,18 +470,19 @@ def _set_gradient_checkpointing(self, module, value=False):
module.gradient_checkpointing = value
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- controlnet_cond: paddle.Tensor,
- conditioning_scale: Union[List[float], float]=1.0,
- class_labels: Optional[paddle.Tensor]=None,
- timestep_cond: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- guess_mode: bool=False,
- return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ controlnet_cond: paddle.Tensor,
+ conditioning_scale: Union[List[float], float] = 1.0,
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guess_mode: bool = False,
+ return_dict: bool = True,
+ ) -> Union[ControlNetOutput, Tuple]:
# TODO junnyu, add this to support pure fp16
sample = sample.cast(self.dtype)
@@ -517,9 +495,7 @@ def forward(
elif channel_order == "bgr":
controlnet_cond = paddle.flip(controlnet_cond, axis=[1])
else:
- raise ValueError(
- f"unknown `controlnet_conditioning_channel_order`: {channel_order}"
- )
+ raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
# prepare attention_mask
if attention_mask is not None:
@@ -534,7 +510,11 @@ def forward(
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps.expand([sample.shape[0], ])
+ timesteps = timesteps.expand(
+ [
+ sample.shape[0],
+ ]
+ )
t_emb = self.time_proj(timesteps)
@@ -547,8 +527,7 @@ def forward(
if self.class_embedding is not None:
if class_labels is None:
- raise ValueError(
- "class_labels should be provided when num_class_embeds > 0")
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
# maybe cast it to float16
class_labels = class_labels.cast(self.dtype)
@@ -572,20 +551,19 @@ def forward(
sample += controlnet_cond
# 3. down
- down_block_res_samples = (sample, )
+ down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
- if (hasattr(downsample_block, "has_cross_attention") and
- downsample_block.has_cross_attention):
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
sample, res_samples = downsample_block(
hidden_states=sample,
temb=emb,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
else:
- sample, res_samples = downsample_block(
- hidden_states=sample, temb=emb)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
down_block_res_samples += res_samples
@@ -596,16 +574,16 @@ def forward(
emb,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
# 5. Control net blocks
controlnet_down_block_res_samples = ()
- for down_block_res_sample, controlnet_block in zip(
- down_block_res_samples, self.controlnet_down_blocks):
+ for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
down_block_res_sample = controlnet_block(down_block_res_sample)
- controlnet_down_block_res_samples += (down_block_res_sample, )
+ controlnet_down_block_res_samples += (down_block_res_sample,)
down_block_res_samples = controlnet_down_block_res_samples
@@ -613,45 +591,34 @@ def forward(
# 6. scaling
if guess_mode:
- scales = paddle.logspace(
- -1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0
+ scales = paddle.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0
scales *= conditioning_scale
- down_block_res_samples = [
- sample * scale
- for sample, scale in zip(down_block_res_samples, scales)
- ]
+ down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
mid_block_res_sample *= scales[-1] # last one
else:
# add conditioning_scale https://github.com/huggingface/diffusers/pull/2627
if isinstance(conditioning_scale, (float, int)):
- down_block_res_samples = [
- sample * conditioning_scale
- for sample in down_block_res_samples
- ]
+ down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
mid_block_res_sample *= conditioning_scale
else:
down_block_res_samples = [
- sample * ccs
- for sample, ccs in zip(down_block_res_samples,
- conditioning_scale[:-1])
+ sample * ccs for sample, ccs in zip(down_block_res_samples, conditioning_scale[:-1])
]
mid_block_res_sample *= conditioning_scale[-1]
if self.config.global_pool_conditions:
down_block_res_samples = [
- paddle.mean(
- sample, axis=(2, 3), keepdim=True)
- for sample in down_block_res_samples
+ paddle.mean(sample, axis=(2, 3), keepdim=True) for sample in down_block_res_samples
]
- mid_block_res_sample = paddle.mean(
- mid_block_res_sample, axis=(2, 3), keepdim=True)
+ mid_block_res_sample = paddle.mean(mid_block_res_sample, axis=(2, 3), keepdim=True)
if not return_dict:
return (down_block_res_samples, mid_block_res_sample)
return ControlNetOutput(
down_block_res_samples=down_block_res_samples,
- mid_block_res_sample=mid_block_res_sample, )
+ mid_block_res_sample=mid_block_res_sample,
+ )
def zero_module(module):
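In the guess-mode branch of ControlNetModel.forward above, the residuals are weighted with log-spaced scales so the shallow down blocks contribute less conditioning than the mid block. A small NumPy sketch of that weighting follows; the residual count of 12 is the usual figure for a Stable Diffusion ControlNet and is an assumption here, not something stated in the diff.

# Illustrative sketch of the guess-mode scaling (paddle.logspace(-1, 0, N + 1) above).
import numpy as np

num_down_residuals = 12                              # assumed count, for illustration only
scales = np.logspace(-1, 0, num_down_residuals + 1)  # 0.1 ... 1.0, one extra scale for the mid block
down_scales, mid_scale = scales[:-1], scales[-1]
print(round(down_scales[0], 3), round(mid_scale, 3))  # 0.1 1.0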
diff --git a/ppdiffusers/ppdiffusers/models/cross_attention.py b/ppdiffusers/ppdiffusers/models/cross_attention.py
index 06660a99f385d..10911591e9f36 100644
--- a/ppdiffusers/ppdiffusers/models/cross_attention.py
+++ b/ppdiffusers/ppdiffusers/models/cross_attention.py
@@ -15,17 +15,21 @@
from .attention_processor import AttentionProcessor # noqa: F401
from .attention_processor import AttnProcessor2_5 # noqa: F401
from .attention_processor import Attention, AttnAddedKVProcessor
-from .attention_processor import \
- AttnProcessor as AttnProcessorRename # noqa: F401
+from .attention_processor import AttnProcessor as AttnProcessorRename # noqa: F401
from .attention_processor import (
- LoRAAttnProcessor, LoRALinearLayer, LoRAXFormersAttnProcessor,
- SlicedAttnAddedKVProcessor, SlicedAttnProcessor, XFormersAttnProcessor)
+ LoRAAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ SlicedAttnAddedKVProcessor,
+ SlicedAttnProcessor,
+ XFormersAttnProcessor,
+)
deprecate(
"cross_attention",
"0.18.0",
"Importing from cross_attention is deprecated. Please import from diffusers.models.attention_processor instead.",
- standard_warn=False, )
+ standard_warn=False,
+)
AttnProcessor = AttentionProcessor
@@ -33,86 +37,54 @@
class CrossAttention(Attention):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class CrossAttnProcessor(AttnProcessorRename):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class LoRACrossAttnProcessor(LoRAAttnProcessor):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class CrossAttnAddedKVProcessor(AttnAddedKVProcessor):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class XFormersCrossAttnProcessor(XFormersAttnProcessor):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class LoRAXFormersCrossAttnProcessor(LoRAXFormersAttnProcessor):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class SlicedCrossAttnProcessor(SlicedAttnProcessor):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
class SlicedCrossAttnAddedKVProcessor(SlicedAttnAddedKVProcessor):
def __init__(self, *args, **kwargs):
deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
- deprecate(
- "cross_attention",
- "0.18.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False)
super().__init__(*args, **kwargs)
diff --git a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
index d1f6482176d0d..d6f680e81fc62 100644
--- a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
@@ -47,35 +47,40 @@ class DualTransformer2DModel(nn.Layer):
"""
def __init__(
- self,
- num_attention_heads: int=16,
- attention_head_dim: int=88,
- in_channels: Optional[int]=None,
- num_layers: int=1,
- dropout: float=0.0,
- norm_num_groups: int=32,
- cross_attention_dim: Optional[int]=None,
- attention_bias: bool=False,
- sample_size: Optional[int]=None,
- num_vector_embeds: Optional[int]=None,
- activation_fn: str="geglu",
- num_embeds_ada_norm: Optional[int]=None, ):
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ ):
super().__init__()
- self.transformers = nn.LayerList([
- Transformer2DModel(
- num_attention_heads=num_attention_heads,
- attention_head_dim=attention_head_dim,
- in_channels=in_channels,
- num_layers=num_layers,
- dropout=dropout,
- norm_num_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
- attention_bias=attention_bias,
- sample_size=sample_size,
- num_vector_embeds=num_vector_embeds,
- activation_fn=activation_fn,
- num_embeds_ada_norm=num_embeds_ada_norm, ) for _ in range(2)
- ])
+ self.transformers = nn.LayerList(
+ [
+ Transformer2DModel(
+ num_attention_heads=num_attention_heads,
+ attention_head_dim=attention_head_dim,
+ in_channels=in_channels,
+ num_layers=num_layers,
+ dropout=dropout,
+ norm_num_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim,
+ attention_bias=attention_bias,
+ sample_size=sample_size,
+ num_vector_embeds=num_vector_embeds,
+ activation_fn=activation_fn,
+ num_embeds_ada_norm=num_embeds_ada_norm,
+ )
+ for _ in range(2)
+ ]
+ )
# Variables that can be set by a pipeline:
@@ -91,13 +96,14 @@ def __init__(
self.transformer_index_for_condition = [1, 0]
def forward(
- self,
- hidden_states,
- encoder_hidden_states,
- timestep=None,
- attention_mask=None,
- cross_attention_kwargs=None,
- return_dict: bool=True, ):
+ self,
+ hidden_states,
+ encoder_hidden_states,
+ timestep=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ return_dict: bool = True,
+ ):
"""
Args:
hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`.
@@ -125,23 +131,22 @@ def forward(
# attention_mask is not used yet
for i in range(2):
# for each of the two transformers, pass the corresponding condition tokens
- condition_state = encoder_hidden_states[:, tokens_start:tokens_start
- + self.condition_lengths[i]]
+ condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
transformer_index = self.transformer_index_for_condition[i]
encoded_state = self.transformers[transformer_index](
input_states,
encoder_hidden_states=condition_state,
timestep=timestep,
cross_attention_kwargs=cross_attention_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
encoded_states.append(encoded_state - input_states)
tokens_start += self.condition_lengths[i]
- output_states = encoded_states[0] * self.mix_ratio + encoded_states[
- 1] * (1 - self.mix_ratio)
+ output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
output_states = output_states + input_states
if not return_dict:
- return (output_states, )
+ return (output_states,)
return Transformer2DModelOutput(sample=output_states)
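DualTransformer2DModel above runs two transformers on the same input, keeps each one's residual (output minus input), and blends the residuals before re-adding the input. A toy NumPy sketch of that mixing; mix_ratio=0.5 and the array shapes are assumed for illustration.

# Illustrative sketch of the residual mixing done in DualTransformer2DModel.forward above.
import numpy as np

input_states = np.full((2, 4), 1.0)   # shared hidden states (toy values)
residual_a = np.full((2, 4), 0.2)     # transformer 0 output minus input
residual_b = np.full((2, 4), -0.4)    # transformer 1 output minus input
mix_ratio = 0.5                       # assumed value

output_states = residual_a * mix_ratio + residual_b * (1 - mix_ratio) + input_states
print(output_states[0, 0])            # 0.9 = 1.0 + 0.5*0.2 + 0.5*(-0.4)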
diff --git a/ppdiffusers/ppdiffusers/models/ema.py b/ppdiffusers/ppdiffusers/models/ema.py
index 1d88a8a18c498..b42e0c2ad02ad 100644
--- a/ppdiffusers/ppdiffusers/models/ema.py
+++ b/ppdiffusers/ppdiffusers/models/ema.py
@@ -34,14 +34,11 @@ def __init__(self, model, decay=0.9999, use_num_upates=True):
raise ValueError("Decay must be between 0 and 1")
self.m_name2s_name = {}
- self.register_buffer(
- "decay", paddle.to_tensor(
- decay, dtype=paddle.float32))
+ self.register_buffer("decay", paddle.to_tensor(decay, dtype=paddle.float32))
self.register_buffer(
"num_updates",
- paddle.to_tensor(
- 0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor(
- -1, dtype=paddle.int64), )
+ paddle.to_tensor(0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor(-1, dtype=paddle.int64),
+ )
for name, p in model.named_parameters():
if not p.stop_gradient:
@@ -57,8 +54,7 @@ def forward(self, model):
if self.num_updates >= 0:
self.num_updates += 1
- decay = min(self.decay,
- (1 + self.num_updates) / (10 + self.num_updates))
+ decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
one_minus_decay = 1.0 - decay
@@ -79,8 +75,7 @@ def copy_to(self, model):
shadow_params = dict(self.named_buffers())
for key in m_param:
if not m_param[key].stop_gradient:
- m_param[key].copy_(shadow_params[self.m_name2s_name[key]],
- False)
+ m_param[key].copy_(shadow_params[self.m_name2s_name[key]], False)
else:
assert key not in self.m_name2s_name
@@ -91,9 +86,7 @@ def store(self, parameters):
parameters: Iterable of `EagerParamBase`; the parameters to be
temporarily stored.
"""
- self.collected_params = [
- param.detach().cpu().clone() for param in parameters
- ]
+ self.collected_params = [param.detach().cpu().clone() for param in parameters]
def restore(self, parameters):
"""
diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index 9527cf3ae055b..4c38ff3d44a98 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -21,12 +21,13 @@
def get_timestep_embedding(
- timesteps: paddle.Tensor,
- embedding_dim: int,
- flip_sin_to_cos: bool=False,
- downscale_freq_shift: float=1,
- scale: float=1,
- max_period: int=10000, ):
+ timesteps: paddle.Tensor,
+ embedding_dim: int,
+ flip_sin_to_cos: bool = False,
+ downscale_freq_shift: float = 1,
+ scale: float = 1,
+ max_period: int = 10000,
+):
"""
This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
@@ -38,8 +39,7 @@ def get_timestep_embedding(
assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
half_dim = embedding_dim // 2
- exponent = -math.log(max_period) * paddle.arange(
- start=0, end=half_dim, dtype="float32")
+ exponent = -math.log(max_period) * paddle.arange(start=0, end=half_dim, dtype="float32")
exponent = exponent / (half_dim - downscale_freq_shift)
@@ -62,10 +62,7 @@ def get_timestep_embedding(
return emb
-def get_2d_sincos_pos_embed(embed_dim,
- grid_size,
- cls_token=False,
- extra_tokens=0):
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
"""
grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
[1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
@@ -78,8 +75,7 @@ def get_2d_sincos_pos_embed(embed_dim,
grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token and extra_tokens > 0:
- pos_embed = np.concatenate(
- [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
return pos_embed
@@ -88,10 +84,8 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
raise ValueError("embed_dim must be divisible by 2")
# use half of dimensions to encode grid_h
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2,
- grid[0]) # (H*W, D/2)
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2,
- grid[1]) # (H*W, D/2)
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
@@ -122,16 +116,17 @@ class PatchEmbed(nn.Layer):
"""2D Image to Patch Embedding"""
def __init__(
- self,
- height=224,
- width=224,
- patch_size=16,
- in_channels=3,
- embed_dim=768,
- layer_norm=False,
- flatten=True,
- bias=True,
- add_pos_embed=True, ):
+ self,
+ height=224,
+ width=224,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ add_pos_embed=True,
+ ):
super().__init__()
num_patches = (height // patch_size) * (width // patch_size)
@@ -143,22 +138,22 @@ def __init__(
embed_dim,
kernel_size=(patch_size, patch_size),
stride=patch_size,
- bias_attr=bias, )
+ bias_attr=bias,
+ )
if layer_norm:
# elementwise_affine=False -> weight_attr=False, bias_attr=False
- self.norm = nn.LayerNorm(
- embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False)
+ self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False)
else:
self.norm = None
self.add_pos_embed = add_pos_embed
if add_pos_embed:
- pos_embed = get_2d_sincos_pos_embed(embed_dim,
- int(num_patches**0.5))
+ pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
self.register_buffer(
"pos_embed",
paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0),
- persistable=False, )
+ persistable=False,
+ )
def forward(self, latent):
latent = self.proj(latent)
@@ -174,20 +169,20 @@ def forward(self, latent):
class TimestepEmbedding(nn.Layer):
def __init__(
- self,
- in_channels: int,
- time_embed_dim: int,
- act_fn: str="silu",
- out_dim: int=None,
- post_act_fn: Optional[str]=None,
- cond_proj_dim=None, ):
+ self,
+ in_channels: int,
+ time_embed_dim: int,
+ act_fn: str = "silu",
+ out_dim: int = None,
+ post_act_fn: Optional[str] = None,
+ cond_proj_dim=None,
+ ):
super().__init__()
self.linear_1 = nn.Linear(in_channels, time_embed_dim)
if cond_proj_dim is not None:
- self.cond_proj = nn.Linear(
- cond_proj_dim, in_channels, bias_attr=False)
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias_attr=False)
else:
self.cond_proj = None
@@ -198,9 +193,7 @@ def __init__(
elif act_fn == "gelu":
self.act = nn.GELU()
else:
- raise ValueError(
- f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'"
- )
+ raise ValueError(f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'")
if out_dim is not None:
time_embed_dim_out = out_dim
@@ -217,9 +210,7 @@ def __init__(
elif post_act_fn == "gelu":
self.post_act = nn.GELU()
else:
- raise ValueError(
- f"{post_act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'"
- )
+ raise ValueError(f"{post_act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'")
def forward(self, sample, condition=None):
if condition is not None:
@@ -237,10 +228,7 @@ def forward(self, sample, condition=None):
class Timesteps(nn.Layer):
- def __init__(self,
- num_channels: int,
- flip_sin_to_cos: bool,
- downscale_freq_shift: float):
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
super().__init__()
self.num_channels = num_channels
self.flip_sin_to_cos = flip_sin_to_cos
@@ -251,7 +239,8 @@ def forward(self, timesteps):
timesteps,
self.num_channels,
flip_sin_to_cos=self.flip_sin_to_cos,
- downscale_freq_shift=self.downscale_freq_shift, )
+ downscale_freq_shift=self.downscale_freq_shift,
+ )
return t_emb
@@ -259,20 +248,21 @@ class GaussianFourierProjection(nn.Layer):
"""Gaussian Fourier embeddings for noise levels."""
def __init__(
- self,
- embedding_size: int=256,
- scale: float=1.0,
- set_W_to_weight=True,
- log=True,
- flip_sin_to_cos=False, ):
+ self,
+ embedding_size: int = 256,
+ scale: float = 1.0,
+ set_W_to_weight=True,
+ log=True,
+ flip_sin_to_cos=False,
+ ):
super().__init__()
- self.register_buffer("weight", paddle.randn((embedding_size, )) * scale)
+ self.register_buffer("weight", paddle.randn((embedding_size,)) * scale)
self.log = log
self.flip_sin_to_cos = flip_sin_to_cos
if set_W_to_weight:
# to delete later
- self.register_buffer("W", paddle.randn((embedding_size, )) * scale)
+ self.register_buffer("W", paddle.randn((embedding_size,)) * scale)
self.weight = self.W
@@ -285,11 +275,9 @@ def forward(self, x):
x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
if self.flip_sin_to_cos:
- out = paddle.concat(
- [paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1)
+ out = paddle.concat([paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1)
else:
- out = paddle.concat(
- [paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1)
+ out = paddle.concat([paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1)
return out
@@ -318,11 +306,12 @@ class ImagePositionalEmbeddings(nn.Layer):
"""
def __init__(
- self,
- num_embed: int,
- height: int,
- width: int,
- embed_dim: int, ):
+ self,
+ num_embed: int,
+ height: int,
+ width: int,
+ embed_dim: int,
+ ):
super().__init__()
self.height = height
@@ -337,14 +326,12 @@ def __init__(
def forward(self, index):
emb = self.emb(index)
- height_emb = self.height_emb(
- paddle.arange(self.height).reshape([1, self.height]))
+ height_emb = self.height_emb(paddle.arange(self.height).reshape([1, self.height]))
# 1 x H x D -> 1 x H x 1 x D
height_emb = height_emb.unsqueeze(2)
- width_emb = self.width_emb(
- paddle.arange(self.width).reshape([1, self.width]))
+ width_emb = self.width_emb(paddle.arange(self.width).reshape([1, self.width]))
# 1 x W x D -> 1 x 1 x W x D
width_emb = width_emb.unsqueeze(1)
@@ -354,7 +341,7 @@ def forward(self, index):
# 1 x H x W x D -> 1 x L xD
pos_emb = pos_emb.reshape([1, self.height * self.width, -1])
- emb = emb + pos_emb[:, :emb.shape[1], :]
+ emb = emb + pos_emb[:, : emb.shape[1], :]
return emb
@@ -372,8 +359,7 @@ class LabelEmbedding(nn.Layer):
def __init__(self, num_classes, hidden_size, dropout_prob):
super().__init__()
use_cfg_embedding = dropout_prob > 0
- self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding,
- hidden_size)
+ self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
self.num_classes = num_classes
self.dropout_prob = dropout_prob
@@ -382,7 +368,12 @@ def token_drop(self, labels, force_drop_ids=None):
Drops labels to enable classifier-free guidance.
"""
if force_drop_ids is None:
- drop_ids = (paddle.rand((labels.shape[0], ), ) < self.dropout_prob)
+ drop_ids = (
+ paddle.rand(
+ (labels.shape[0],),
+ )
+ < self.dropout_prob
+ )
else:
drop_ids = paddle.to_tensor(force_drop_ids == 1)
labels = paddle.where(drop_ids, self.num_classes, labels)
@@ -400,17 +391,13 @@ class CombinedTimestepLabelEmbeddings(nn.Layer):
def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
super().__init__()
- self.time_proj = Timesteps(
- num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
- self.timestep_embedder = TimestepEmbedding(
- in_channels=256, time_embed_dim=embedding_dim)
- self.class_embedder = LabelEmbedding(num_classes, embedding_dim,
- class_dropout_prob)
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+ self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob)
def forward(self, timestep, class_labels, hidden_dtype=None):
timesteps_proj = self.time_proj(timestep)
- timesteps_emb = self.timestep_embedder(
- timesteps_proj.cast(hidden_dtype)) # (N, D)
+ timesteps_emb = self.timestep_embedder(timesteps_proj.cast(hidden_dtype)) # (N, D)
class_labels = self.class_embedder(class_labels) # (N, D)
@@ -420,8 +407,7 @@ def forward(self, timestep, class_labels, hidden_dtype=None):
class TextTimeEmbedding(nn.Layer):
- def __init__(self, encoder_dim: int, time_embed_dim: int,
- num_heads: int=64):
+ def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64):
super().__init__()
self.norm1 = nn.LayerNorm(encoder_dim)
self.pool = AttentionPooling(num_heads, encoder_dim)
@@ -443,8 +429,8 @@ def __init__(self, num_heads, embed_dim, dtype=None):
super().__init__()
self.positional_embedding = self.create_parameter(
(1, embed_dim),
- default_initializer=nn.initializer.Assign(
- paddle.randn((1, embed_dim)) / embed_dim**0.5), )
+ default_initializer=nn.initializer.Assign(paddle.randn((1, embed_dim)) / embed_dim**0.5),
+ )
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
@@ -466,8 +452,7 @@ def shape(x):
x = x.transpose([0, 2, 1])
return x
- class_token = x.mean(
- axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype)
+ class_token = x.mean(axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype)
x = paddle.concat([class_token, x], axis=1) # (bs, length+1, width)
# (bs*n_heads, class_token_length, dim_per_head)
@@ -478,10 +463,9 @@ def shape(x):
# (bs*n_heads, class_token_length, length+class_token_length):
weight = paddle.einsum(
- "bct,bcs->bts", q * self.scale,
- k * self.scale) # More stable with f16 than dividing afterwards
- weight = nn.functional.softmax(
- weight.cast("float32"), axis=-1).cast(weight.dtype)
+ "bct,bcs->bts", q * self.scale, k * self.scale
+ ) # More stable with f16 than dividing afterwards
+ weight = nn.functional.softmax(weight.cast("float32"), axis=-1).cast(weight.dtype)
# (bs*n_heads, dim_per_head, class_token_length)
a = paddle.einsum("bts,bcs->bct", weight, v)
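get_timestep_embedding above produces the standard DDPM-style sinusoidal embedding. The NumPy sketch below mirrors the same computation under the default arguments (downscale_freq_shift=1, scale=1, max_period=10000, sin placed before cos); the zero-padding applied for odd embedding dimensions is omitted.

# Illustrative sketch of the sinusoidal timestep embedding computed above.
import math
import numpy as np

def timestep_embedding(timesteps, embedding_dim, max_period=10000):
    half_dim = embedding_dim // 2
    exponent = -math.log(max_period) * np.arange(half_dim, dtype=np.float32)
    exponent = exponent / (half_dim - 1)          # downscale_freq_shift = 1
    args = timesteps[:, None].astype(np.float32) * np.exp(exponent)[None, :]
    return np.concatenate([np.sin(args), np.cos(args)], axis=-1)

emb = timestep_embedding(np.array([0, 500, 999]), embedding_dim=8)
print(emb.shape)  # (3, 8)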
diff --git a/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py b/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py
index 192173d39afdf..d3a3befd29063 100644
--- a/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py
+++ b/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py
@@ -39,8 +39,9 @@ def hinge_d_loss(logits_real, logits_fake):
def vanilla_d_loss(logits_real, logits_fake):
d_loss = 0.5 * (
- paddle.mean(x=paddle.nn.functional.softplus(x=-logits_real)) +
- paddle.mean(x=paddle.nn.functional.softplus(x=logits_fake)))
+ paddle.mean(x=paddle.nn.functional.softplus(x=-logits_real))
+ + paddle.mean(x=paddle.nn.functional.softplus(x=logits_fake))
+ )
return d_loss
@@ -52,41 +53,34 @@ def Normalize(in_channels, norm_type="group"):
num_channels=in_channels,
epsilon=1e-06,
weight_attr=None,
- bias_attr=None, )
+ bias_attr=None,
+ )
elif norm_type == "batch":
return paddle.nn.SyncBatchNorm(in_channels)
class ResBlock(paddle.nn.Layer):
def __init__(
- self,
- in_channels,
- out_channels=None,
- conv_shortcut=False,
- dropout=0.0,
- norm_type="group",
- padding_type="replicate", ):
+ self,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ dropout=0.0,
+ norm_type="group",
+ padding_type="replicate",
+ ):
super().__init__()
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
self.out_channels = out_channels
self.use_conv_shortcut = conv_shortcut
self.norm1 = Normalize(in_channels, norm_type)
- self.conv1 = SamePadConv3d(
- in_channels, out_channels, kernel_size=3, padding_type=padding_type)
+ self.conv1 = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type)
self.dropout = paddle.nn.Dropout(p=dropout)
self.norm2 = Normalize(in_channels, norm_type)
- self.conv2 = SamePadConv3d(
- out_channels,
- out_channels,
- kernel_size=3,
- padding_type=padding_type)
+ self.conv2 = SamePadConv3d(out_channels, out_channels, kernel_size=3, padding_type=padding_type)
if self.in_channels != self.out_channels:
- self.conv_shortcut = SamePadConv3d(
- in_channels,
- out_channels,
- kernel_size=3,
- padding_type=padding_type)
+ self.conv_shortcut = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type)
def forward(self, x):
h = x
@@ -103,18 +97,19 @@ def forward(self, x):
class SamePadConv3d(paddle.nn.Layer):
def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size,
- stride=1,
- bias=True,
- padding_type="replicate", ):
+ self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ bias=True,
+ padding_type="replicate",
+ ):
super().__init__()
if isinstance(kernel_size, int):
- kernel_size = (kernel_size, ) * 3
+ kernel_size = (kernel_size,) * 3
if isinstance(stride, int):
- stride = (stride, ) * 3
+ stride = (stride,) * 3
total_pad = tuple([(k - s) for k, s in zip(kernel_size, stride)])
pad_input = []
for p in total_pad[::-1]:
@@ -128,31 +123,31 @@ def __init__(
kernel_size=kernel_size,
stride=stride,
padding=0,
- bias_attr=bias, )
+ bias_attr=bias,
+ )
self.weight = self.conv.weight
def forward(self, x):
return self.conv(
- paddle.nn.functional.pad(x=x,
- pad=self.pad_input,
- mode=self.padding_type,
- data_format="NCDHW"))
+ paddle.nn.functional.pad(x=x, pad=self.pad_input, mode=self.padding_type, data_format="NCDHW")
+ )
class SamePadConvTranspose3d(paddle.nn.Layer):
def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size,
- stride=1,
- bias=True,
- padding_type="replicate", ):
+ self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ bias=True,
+ padding_type="replicate",
+ ):
super().__init__()
if isinstance(kernel_size, int):
- kernel_size = (kernel_size, ) * 3
+ kernel_size = (kernel_size,) * 3
if isinstance(stride, int):
- stride = (stride, ) * 3
+ stride = (stride,) * 3
total_pad = tuple([(k - s) for k, s in zip(kernel_size, stride)])
pad_input = []
for p in total_pad[::-1]:
@@ -166,45 +161,38 @@ def __init__(
kernel_size=kernel_size,
stride=stride,
padding=tuple([(k - 1) for k in kernel_size]),
- bias_attr=bias, )
+ bias_attr=bias,
+ )
def forward(self, x):
return self.convt(
- paddle.nn.functional.pad(x=x,
- pad=self.pad_input,
- mode=self.padding_type,
- data_format="NCDHW"))
+ paddle.nn.functional.pad(x=x, pad=self.pad_input, mode=self.padding_type, data_format="NCDHW")
+ )
class Encoder(paddle.nn.Layer):
def __init__(
- self,
- n_hiddens,
- downsample,
- z_channels,
- double_z,
- image_channel=3,
- norm_type="group",
- padding_type="replicate", ):
+ self,
+ n_hiddens,
+ downsample,
+ z_channels,
+ double_z,
+ image_channel=3,
+ norm_type="group",
+ padding_type="replicate",
+ ):
super().__init__()
n_times_downsample = np.array([int(math.log2(d)) for d in downsample])
self.conv_blocks = paddle.nn.LayerList()
max_ds = n_times_downsample.max()
- self.conv_first = SamePadConv3d(
- image_channel, n_hiddens, kernel_size=3, padding_type=padding_type)
+ self.conv_first = SamePadConv3d(image_channel, n_hiddens, kernel_size=3, padding_type=padding_type)
for i in range(max_ds):
block = paddle.nn.Layer()
in_channels = n_hiddens * 2**i
- out_channels = n_hiddens * 2**(i + 1)
+ out_channels = n_hiddens * 2 ** (i + 1)
stride = tuple([(2 if d > 0 else 1) for d in n_times_downsample])
- block.down = SamePadConv3d(
- in_channels,
- out_channels,
- 4,
- stride=stride,
- padding_type=padding_type)
- block.res = ResBlock(
- out_channels, out_channels, norm_type=norm_type)
+ block.down = SamePadConv3d(in_channels, out_channels, 4, stride=stride, padding_type=padding_type)
+ block.res = ResBlock(out_channels, out_channels, norm_type=norm_type)
self.conv_blocks.append(block)
n_times_downsample -= 1
self.final_block = paddle.nn.Sequential(
@@ -215,7 +203,9 @@ def __init__(
2 * z_channels if double_z else z_channels,
kernel_size=3,
stride=1,
- padding_type=padding_type, ), )
+ padding_type=padding_type,
+ ),
+ )
self.out_channels = out_channels
def forward(self, x):
@@ -228,12 +218,7 @@ def forward(self, x):
class Decoder(paddle.nn.Layer):
- def __init__(self,
- n_hiddens,
- upsample,
- z_channels,
- image_channel,
- norm_type="group"):
+ def __init__(self, n_hiddens, upsample, z_channels, image_channel, norm_type="group"):
super().__init__()
n_times_upsample = np.array([int(math.log2(d)) for d in upsample])
max_us = n_times_upsample.max()
@@ -241,20 +226,15 @@ def __init__(self,
self.conv_blocks = paddle.nn.LayerList()
for i in range(max_us):
block = paddle.nn.Layer()
- in_channels = in_channels if i == 0 else n_hiddens * 2**(
- max_us - i + 1)
- out_channels = n_hiddens * 2**(max_us - i)
+ in_channels = in_channels if i == 0 else n_hiddens * 2 ** (max_us - i + 1)
+ out_channels = n_hiddens * 2 ** (max_us - i)
us = tuple([(2 if d > 0 else 1) for d in n_times_upsample])
- block.up = SamePadConvTranspose3d(
- in_channels, out_channels, 4, stride=us)
- block.res1 = ResBlock(
- out_channels, out_channels, norm_type=norm_type)
- block.res2 = ResBlock(
- out_channels, out_channels, norm_type=norm_type)
+ block.up = SamePadConvTranspose3d(in_channels, out_channels, 4, stride=us)
+ block.res1 = ResBlock(out_channels, out_channels, norm_type=norm_type)
+ block.res2 = ResBlock(out_channels, out_channels, norm_type=norm_type)
self.conv_blocks.append(block)
n_times_upsample -= 1
- self.conv_out = SamePadConv3d(
- out_channels, image_channel, kernel_size=3)
+ self.conv_out = SamePadConv3d(out_channels, image_channel, kernel_size=3)
def forward(self, x):
h = x
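
Note (not part of the diff): the Encoder hunk above derives the number of downsampling blocks as the largest per-axis log2 of `downsample`, and each block strides an axis by 2 only while that axis still has halvings left. A standalone Python restatement of that loop, assuming the default `downsample=(4, 8, 8)` used by the VAE config later in this patch:

import math
import numpy as np

# Per-block strides chosen exactly as in Encoder.__init__ above:
# stride 2 on an axis while it still has log2(downsample) halvings remaining.
def encoder_strides(downsample=(4, 8, 8)):
    n_times = np.array([int(math.log2(d)) for d in downsample])
    strides = []
    for _ in range(int(n_times.max())):
        strides.append(tuple(2 if int(d) > 0 else 1 for d in n_times))
        n_times -= 1
    return strides

print(encoder_strides((4, 8, 8)))
# [(2, 2, 2), (2, 2, 2), (1, 2, 2)] -> time downsampled 4x, height/width 8x
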
diff --git a/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py b/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py
index 7a934de7f6224..acc73c41c8fdd 100644
--- a/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py
+++ b/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py
@@ -17,8 +17,9 @@
from paddle.distributed.fleet.utils import recompute
try:
- from paddle.incubate.nn.memory_efficient_attention import \
- memory_efficient_attention # noqa
+ from paddle.incubate.nn.memory_efficient_attention import ( # noqa
+ memory_efficient_attention,
+ )
_ppxformers_available = True
except:
@@ -30,8 +31,15 @@
from einops import rearrange, repeat
from ..utils.initializer_utils import constant_, xavier_uniform_
-from .lvdm_util import (GEGLU, Normalize, conv_nd, default, exists,
- normalization, zero_module)
+from .lvdm_util import (
+ GEGLU,
+ Normalize,
+ conv_nd,
+ default,
+ exists,
+ normalization,
+ zero_module,
+)
def finfo(dtype):
@@ -53,15 +61,19 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
super().__init__()
inner_dim = int(dim * mult)
dim_out = default(dim_out, dim)
- project_in = (paddle.nn.Sequential(
- paddle.nn.Linear(
- in_features=dim, out_features=inner_dim),
- paddle.nn.GELU(), ) if not glu else GEGLU(dim, inner_dim))
+ project_in = (
+ paddle.nn.Sequential(
+ paddle.nn.Linear(in_features=dim, out_features=inner_dim),
+ paddle.nn.GELU(),
+ )
+ if not glu
+ else GEGLU(dim, inner_dim)
+ )
self.net = paddle.nn.Sequential(
project_in,
paddle.nn.Dropout(p=dropout),
- paddle.nn.Linear(
- in_features=inner_dim, out_features=dim_out), )
+ paddle.nn.Linear(in_features=inner_dim, out_features=dim_out),
+ )
def forward(self, x):
return self.net(x)
@@ -74,19 +86,19 @@ def __init__(self, num_units, max_relative_position):
super().__init__()
self.num_units = num_units
self.max_relative_position = max_relative_position
- self.embeddings_table = paddle.nn.Parameter(
- paddle.empty(shape=[max_relative_position * 2 + 1, num_units]))
+ self.embeddings_table = paddle.nn.Parameter(paddle.empty(shape=[max_relative_position * 2 + 1, num_units]))
xavier_uniform_(self.embeddings_table)
def forward(self, length_q, length_k):
- device = self.embeddings_table.place
+ # device = self.embeddings_table.place
range_vec_q = paddle.arange(end=length_q)
range_vec_k = paddle.arange(end=length_k)
distance_mat = range_vec_k[(None), :] - range_vec_q[:, (None)]
distance_mat_clipped = paddle.clip(
x=distance_mat,
min=-self.max_relative_position,
- max=self.max_relative_position, )
+ max=self.max_relative_position,
+ )
final_mat = distance_mat_clipped + self.max_relative_position
final_mat = final_mat.astype(dtype="int64")
embeddings = self.embeddings_table[final_mat]
@@ -95,15 +107,16 @@ def forward(self, length_q, length_k):
class TemporalCrossAttention(paddle.nn.Layer):
def __init__(
- self,
- query_dim,
- context_dim=None,
- heads=8,
- dim_head=64,
- dropout=0.0,
- use_relative_position=False,
- temporal_length=None,
- **kwargs, ):
+ self,
+ query_dim,
+ context_dim=None,
+ heads=8,
+ dim_head=64,
+ dropout=0.0,
+ use_relative_position=False,
+ temporal_length=None,
+ **kwargs,
+ ):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, query_dim)
@@ -112,22 +125,17 @@ def __init__(
self.heads = heads
self.temporal_length = temporal_length
self.use_relative_position = use_relative_position
- self.to_q = paddle.nn.Linear(
- in_features=query_dim, out_features=inner_dim, bias_attr=False)
- self.to_k = paddle.nn.Linear(
- in_features=context_dim, out_features=inner_dim, bias_attr=False)
- self.to_v = paddle.nn.Linear(
- in_features=context_dim, out_features=inner_dim, bias_attr=False)
+ self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False)
+ self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
+ self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
self.to_out = paddle.nn.Sequential(
- paddle.nn.Linear(
- in_features=inner_dim, out_features=query_dim),
- paddle.nn.Dropout(p=dropout), )
+ paddle.nn.Linear(in_features=inner_dim, out_features=query_dim),
+ paddle.nn.Dropout(p=dropout),
+ )
if use_relative_position:
assert temporal_length is not None
- self.relative_position_k = RelativePosition(
- num_units=dim_head, max_relative_position=temporal_length)
- self.relative_position_v = RelativePosition(
- num_units=dim_head, max_relative_position=temporal_length)
+ self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
+ self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
constant_(self.to_q.weight, 0)
constant_(self.to_k.weight, 0)
constant_(self.to_v.weight, 0)
@@ -162,32 +170,23 @@ def forward(self, x, context=None, mask=None):
class CrossAttention(paddle.nn.Layer):
- def __init__(self,
- query_dim,
- context_dim=None,
- heads=8,
- dim_head=64,
- dropout=0.0,
- **kwargs):
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, query_dim)
self.scale = dim_head**-0.5
self.heads = heads
- self.to_q = paddle.nn.Linear(
- in_features=query_dim, out_features=inner_dim, bias_attr=False)
- self.to_k = paddle.nn.Linear(
- in_features=context_dim, out_features=inner_dim, bias_attr=False)
- self.to_v = paddle.nn.Linear(
- in_features=context_dim, out_features=inner_dim, bias_attr=False)
+ self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False)
+ self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
+ self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
self.to_out = paddle.nn.Sequential(
- paddle.nn.Linear(
- in_features=inner_dim, out_features=query_dim),
- paddle.nn.Dropout(p=dropout), )
+ paddle.nn.Linear(in_features=inner_dim, out_features=query_dim),
+ paddle.nn.Dropout(p=dropout),
+ )
def forward(self, x, context=None, mask=None):
h = self.heads
- b = x.shape[0]
+ # b = x.shape[0]
q = self.to_q(x)
context = default(context, x)
k = self.to_k(context)
@@ -206,13 +205,7 @@ def forward(self, x, context=None, mask=None):
class MemoryEfficientCrossAttention(paddle.nn.Layer):
- def __init__(self,
- query_dim,
- context_dim=None,
- heads=8,
- dim_head=64,
- dropout=0.0,
- **kwargs):
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs):
super().__init__()
print(
f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using {heads} heads."
@@ -221,16 +214,13 @@ def __init__(self,
context_dim = default(context_dim, query_dim)
self.heads = heads
self.dim_head = dim_head
- self.to_q = paddle.nn.Linear(
- in_features=query_dim, out_features=inner_dim, bias_attr=False)
- self.to_k = paddle.nn.Linear(
- in_features=context_dim, out_features=inner_dim, bias_attr=False)
- self.to_v = paddle.nn.Linear(
- in_features=context_dim, out_features=inner_dim, bias_attr=False)
+ self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False)
+ self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
+ self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
self.to_out = paddle.nn.Sequential(
- paddle.nn.Linear(
- in_features=inner_dim, out_features=query_dim),
- paddle.nn.Dropout(p=dropout), )
+ paddle.nn.Linear(in_features=inner_dim, out_features=query_dim),
+ paddle.nn.Dropout(p=dropout),
+ )
self.attention_op = "cutlass"
def forward(self, x, context=None, mask=None):
@@ -239,8 +229,7 @@ def forward(self, x, context=None, mask=None):
k = self.to_k(context)
v = self.to_v(context)
b, _, _ = q.shape
- q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]),
- (q, k, v))
+ q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]), (q, k, v))
out = F.scaled_dot_product_attention_(
q,
k,
@@ -248,7 +237,8 @@ def forward(self, x, context=None, mask=None):
attn_mask=None,
dropout_p=0.0,
attention_op=self.attention_op,
- training=True, )
+ training=True,
+ )
if exists(mask):
raise NotImplementedError
out = out.reshape([0, 0, self.heads * self.dim_head])
@@ -261,63 +251,46 @@ class BasicTransformerBlockST(paddle.nn.Layer):
"""
def __init__(
- self,
- dim,
- n_heads,
- d_head,
- dropout=0.0,
- context_dim=None,
- gated_ff=True,
- checkpoint=True,
- temporal_length=None,
- use_relative_position=True,
- **kwargs, ):
+ self,
+ dim,
+ n_heads,
+ d_head,
+ dropout=0.0,
+ context_dim=None,
+ gated_ff=True,
+ checkpoint=True,
+ temporal_length=None,
+ use_relative_position=True,
+ **kwargs,
+ ):
super().__init__()
if _ppxformers_available:
self.attn1 = MemoryEfficientCrossAttention(
- query_dim=dim,
- heads=n_heads,
- dim_head=d_head,
- dropout=dropout,
- **kwargs)
+ query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs
+ )
self.attn2 = MemoryEfficientCrossAttention(
query_dim=dim,
context_dim=context_dim,
heads=n_heads,
dim_head=d_head,
dropout=dropout,
- **kwargs, )
+ **kwargs,
+ )
else:
- self.attn1 = CrossAttention(
- query_dim=dim,
- heads=n_heads,
- dim_head=d_head,
- dropout=dropout,
- **kwargs)
+ self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs)
self.attn2 = CrossAttention(
query_dim=dim,
context_dim=context_dim,
heads=n_heads,
dim_head=d_head,
dropout=dropout,
- **kwargs, )
+ **kwargs,
+ )
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
- self.norm1 = paddle.nn.LayerNorm(
- normalized_shape=dim,
- epsilon=1e-05,
- weight_attr=None,
- bias_attr=None)
- self.norm2 = paddle.nn.LayerNorm(
- normalized_shape=dim,
- epsilon=1e-05,
- weight_attr=None,
- bias_attr=None)
- self.norm3 = paddle.nn.LayerNorm(
- normalized_shape=dim,
- epsilon=1e-05,
- weight_attr=None,
- bias_attr=None)
+ self.norm1 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
+ self.norm2 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
+ self.norm3 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
self.checkpoint = checkpoint
self.attn1_tmp = TemporalCrossAttention(
query_dim=dim,
@@ -326,7 +299,8 @@ def __init__(
dropout=dropout,
temporal_length=temporal_length,
use_relative_position=use_relative_position,
- **kwargs, )
+ **kwargs,
+ )
self.attn2_tmp = TemporalCrossAttention(
query_dim=dim,
heads=n_heads,
@@ -335,17 +309,10 @@ def __init__(
context_dim=None,
temporal_length=temporal_length,
use_relative_position=use_relative_position,
- **kwargs, )
- self.norm4 = paddle.nn.LayerNorm(
- normalized_shape=dim,
- epsilon=1e-05,
- weight_attr=None,
- bias_attr=None)
- self.norm5 = paddle.nn.LayerNorm(
- normalized_shape=dim,
- epsilon=1e-05,
- weight_attr=None,
- bias_attr=None)
+ **kwargs,
+ )
+ self.norm4 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
+ self.norm5 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
def forward(self, x, context=None, **kwargs):
if self.checkpoint:
@@ -366,8 +333,7 @@ def _forward(self, x, context=None, mask=None):
if context is not None:
context_ = []
for i in range(context.shape[0]):
- context_.append(context[i].unsqueeze(axis=0).tile(
- repeat_times=[t, 1, 1]))
+ context_.append(context[i].unsqueeze(axis=0).tile(repeat_times=[t, 1, 1]))
context_ = paddle.concat(x=context_, axis=0)
else:
context_ = None
@@ -389,16 +355,17 @@ class SpatialTemporalTransformer(paddle.nn.Layer):
"""
def __init__(
- self,
- in_channels,
- n_heads,
- d_head,
- depth=1,
- dropout=0.0,
- context_dim=None,
- temporal_length=None,
- use_relative_position=True,
- **kwargs, ):
+ self,
+ in_channels,
+ n_heads,
+ d_head,
+ depth=1,
+ dropout=0.0,
+ context_dim=None,
+ temporal_length=None,
+ use_relative_position=True,
+ **kwargs,
+ ):
super().__init__()
self.in_channels = in_channels
inner_dim = n_heads * d_head
@@ -408,25 +375,32 @@ def __init__(
out_channels=inner_dim,
kernel_size=1,
stride=1,
- padding=0, )
- self.transformer_blocks = paddle.nn.LayerList(sublayers=[
- BasicTransformerBlockST(
- inner_dim,
- n_heads,
- d_head,
- dropout=dropout,
- context_dim=context_dim,
- temporal_length=temporal_length,
- use_relative_position=use_relative_position,
- **kwargs, ) for d in range(depth)
- ])
+ padding=0,
+ )
+ self.transformer_blocks = paddle.nn.LayerList(
+ sublayers=[
+ BasicTransformerBlockST(
+ inner_dim,
+ n_heads,
+ d_head,
+ dropout=dropout,
+ context_dim=context_dim,
+ temporal_length=temporal_length,
+ use_relative_position=use_relative_position,
+ **kwargs,
+ )
+ for d in range(depth)
+ ]
+ )
self.proj_out = zero_module(
paddle.nn.Conv3D(
in_channels=inner_dim,
out_channels=in_channels,
kernel_size=1,
stride=1,
- padding=0, ))
+ padding=0,
+ )
+ )
def forward(self, x, context=None, **kwargs):
assert x.dim() == 5, f"x shape = {x.shape}"
@@ -441,13 +415,14 @@ def forward(self, x, context=None, **kwargs):
class STAttentionBlock(paddle.nn.Layer):
def __init__(
- self,
- channels,
- num_heads=1,
- num_head_channels=-1,
- use_checkpoint=False,
- temporal_length=16,
- use_relative_position=False, ):
+ self,
+ channels,
+ num_heads=1,
+ num_head_channels=-1,
+ use_checkpoint=False,
+ temporal_length=16,
+ use_relative_position=False,
+ ):
super().__init__()
if num_head_channels == -1:
self.num_heads = num_heads
@@ -468,10 +443,12 @@ def __init__(
if use_relative_position:
self.relative_position_k = RelativePosition(
num_units=channels // self.num_heads,
- max_relative_position=temporal_length, )
+ max_relative_position=temporal_length,
+ )
self.relative_position_v = RelativePosition(
num_units=channels // self.num_heads,
- max_relative_position=temporal_length, )
+ max_relative_position=temporal_length,
+ )
self.proj_out_s = zero_module(conv_nd(1, channels, channels, 1))
self.proj_out_t = zero_module(conv_nd(1, channels, channels, 1))
@@ -512,22 +489,21 @@ def forward(self, qkv, rp=None, mask=None):
weight = paddle.einsum(
"bct,bcs->bts",
(q * scale).reshape([bs * self.n_heads, ch, length]),
- (k * scale).reshape([bs * self.n_heads, ch, length]), )
+ (k * scale).reshape([bs * self.n_heads, ch, length]),
+ )
if rp is not None:
k_rp, v_rp = rp
weight2 = paddle.einsum(
"bct,tsc->bst",
(q * scale).reshape([bs * self.n_heads, ch, length]),
- k_rp, )
+ k_rp,
+ )
weight += weight2
if mask is not None:
INF = -100000000.0
- weight = paddle.where(
- mask == 0, weight.astype(dtype="float32"), INF)
- weight = paddle.nn.functional.softmax(
- x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype)
- a = paddle.einsum("bts,bcs->bct", weight,
- v.reshape([bs * self.n_heads, ch, length]))
+ weight = paddle.where(mask == 0, weight.astype(dtype="float32"), INF)
+ weight = paddle.nn.functional.softmax(x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype)
+ a = paddle.einsum("bts,bcs->bct", weight, v.reshape([bs * self.n_heads, ch, length]))
if rp is not None:
x = paddle.einsum("bts,tsc->btc", weight, v_rp)
perm_3 = list(range(x.ndim))
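
Note (not part of the diff): `RelativePosition.forward` above builds pairwise key-minus-query offsets, clips them to +/- max_relative_position, and shifts them into non-negative indices of a [2*max+1, num_units] embedding table. A NumPy restatement of just the index computation:

import numpy as np

# Same index arithmetic as RelativePosition.forward, without the table lookup.
def relative_position_indices(length_q, length_k, max_relative_position):
    range_q = np.arange(length_q)
    range_k = np.arange(length_k)
    distance = range_k[None, :] - range_q[:, None]            # [Lq, Lk]
    clipped = np.clip(distance, -max_relative_position, max_relative_position)
    return clipped + max_relative_position                    # values in [0, 2*max]

idx = relative_position_indices(4, 4, max_relative_position=2)
# idx[i, j] picks row idx[i, j] of the learned embeddings_table above
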
diff --git a/ppdiffusers/ppdiffusers/models/lvdm_distributions.py b/ppdiffusers/ppdiffusers/models/lvdm_distributions.py
index e2b9a88f4c4e5..a66cf086f85d7 100644
--- a/ppdiffusers/ppdiffusers/models/lvdm_distributions.py
+++ b/ppdiffusers/ppdiffusers/models/lvdm_distributions.py
@@ -58,21 +58,26 @@ def kl(self, other=None):
elif other is None:
return 0.5 * paddle.sum(
x=paddle.pow(x=self.mean, y=2) + self.var - 1.0 - self.logvar,
- axis=[1, 2, 3], )
+ axis=[1, 2, 3],
+ )
else:
return 0.5 * paddle.sum(
- x=paddle.pow(x=self.mean - other.mean, y=2) / other.var +
- self.var / other.var - 1.0 - self.logvar + other.logvar,
- axis=[1, 2, 3], )
+ x=paddle.pow(x=self.mean - other.mean, y=2) / other.var
+ + self.var / other.var
+ - 1.0
+ - self.logvar
+ + other.logvar,
+ axis=[1, 2, 3],
+ )
def nll(self, sample, dims=[1, 2, 3]):
if self.deterministic:
return paddle.to_tensor(data=[0.0], dtype="float32")
logtwopi = np.log(2.0 * np.pi)
return 0.5 * paddle.sum(
- x=logtwopi + self.logvar + paddle.pow(x=sample - self.mean, y=2) /
- self.var,
- axis=dims, )
+ x=logtwopi + self.logvar + paddle.pow(x=sample - self.mean, y=2) / self.var,
+ axis=dims,
+ )
def mode(self):
return self.mean
@@ -91,11 +96,11 @@ def normal_kl(mean1, logvar1, mean2, logvar2):
tensor = obj
break
assert tensor is not None, "at least one argument must be a Tensor"
- logvar1, logvar2 = [
- (x if isinstance(x, paddle.Tensor) else paddle.to_tensor(data=x))
- for x in (logvar1, logvar2)
- ]
+ logvar1, logvar2 = [(x if isinstance(x, paddle.Tensor) else paddle.to_tensor(data=x)) for x in (logvar1, logvar2)]
return 0.5 * (
- -1.0 + logvar2 - logvar1 + paddle.exp(x=(logvar1 - logvar2
- ).astype("float32")) +
- (mean1 - mean2)**2 * paddle.exp(x=(-logvar2).astype("float32")))
+ -1.0
+ + logvar2
+ - logvar1
+ + paddle.exp(x=(logvar1 - logvar2).astype("float32"))
+ + (mean1 - mean2) ** 2 * paddle.exp(x=(-logvar2).astype("float32"))
+ )
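
Note (not part of the diff): the reflowed `normal_kl` above is the closed-form KL divergence between two diagonal Gaussians parameterized by mean and log-variance. A NumPy version with a quick self-consistency check:

import numpy as np

# KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))), elementwise,
# matching the expression rebuilt in lvdm_distributions.normal_kl above.
def normal_kl_np(mean1, logvar1, mean2, logvar2):
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + np.exp(logvar1 - logvar2)
        + (mean1 - mean2) ** 2 * np.exp(-logvar2)
    )

# KL of a distribution with itself is zero
assert np.allclose(normal_kl_np(0.3, -1.2, 0.3, -1.2), 0.0)
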
diff --git a/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py b/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py
index a48a260f655dd..512431be11300 100644
--- a/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py
+++ b/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py
@@ -21,10 +21,16 @@
from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import BaseOutput
-from .lvdm_attention_temporal import (SpatialTemporalTransformer,
- STAttentionBlock)
-from .lvdm_util import (avg_pool_nd, conv_nd, linear, nonlinearity,
- normalization, timestep_embedding, zero_module)
+from .lvdm_attention_temporal import SpatialTemporalTransformer, STAttentionBlock
+from .lvdm_util import (
+ avg_pool_nd,
+ conv_nd,
+ linear,
+ nonlinearity,
+ normalization,
+ timestep_embedding,
+ zero_module,
+)
from .modeling_utils import ModelMixin
@@ -87,13 +93,14 @@ class Upsample(paddle.nn.Layer):
"""
def __init__(
- self,
- channels,
- use_conv,
- dims=2,
- out_channels=None,
- kernel_size_t=3,
- padding_t=1, ):
+ self,
+ channels,
+ use_conv,
+ dims=2,
+ out_channels=None,
+ kernel_size_t=3,
+ padding_t=1,
+ ):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
@@ -105,7 +112,8 @@ def __init__(
self.channels,
self.out_channels,
(kernel_size_t, 3, 3),
- padding=(padding_t, 1, 1), )
+ padding=(padding_t, 1, 1),
+ )
def forward(self, x):
assert x.shape[1] == self.channels
@@ -114,10 +122,10 @@ def forward(self, x):
x=x,
size=(x.shape[2], x.shape[3] * 2, x.shape[4] * 2),
mode="nearest",
- data_format="NCDHW", )
+ data_format="NCDHW",
+ )
else:
- x = paddle.nn.functional.interpolate(
- x=x, scale_factor=2, mode="nearest")
+ x = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="nearest")
if self.use_conv:
x = self.conv(x)
return x
@@ -133,13 +141,14 @@ class Downsample(paddle.nn.Layer):
"""
def __init__(
- self,
- channels,
- use_conv,
- dims=2,
- out_channels=None,
- kernel_size_t=3,
- padding_t=1, ):
+ self,
+ channels,
+ use_conv,
+ dims=2,
+ out_channels=None,
+ kernel_size_t=3,
+ padding_t=1,
+ ):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
@@ -153,7 +162,8 @@ def __init__(
self.out_channels,
(kernel_size_t, 3, 3),
stride=stride,
- padding=(padding_t, 1, 1), )
+ padding=(padding_t, 1, 1),
+ )
else:
assert self.channels == self.out_channels
self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
@@ -179,21 +189,23 @@ class ResBlock(TimestepBlock):
:param down: if True, use this block for downsampling.
"""
- def __init__(self,
- channels,
- emb_channels,
- dropout,
- out_channels=None,
- use_conv=False,
- use_scale_shift_norm=False,
- dims=2,
- use_checkpoint=False,
- up=False,
- down=False,
- kernel_size_t=3,
- padding_t=1,
- nonlinearity_type="silu",
- **kwargs):
+ def __init__(
+ self,
+ channels,
+ emb_channels,
+ dropout,
+ out_channels=None,
+ use_conv=False,
+ use_scale_shift_norm=False,
+ dims=2,
+ use_checkpoint=False,
+ up=False,
+ down=False,
+ kernel_size_t=3,
+ padding_t=1,
+ nonlinearity_type="silu",
+ **kwargs
+ ):
super().__init__()
self.channels = channels
self.emb_channels = emb_channels
@@ -211,42 +223,25 @@ def __init__(self,
channels,
self.out_channels,
(kernel_size_t, 3, 3),
- padding=(padding_t, 1, 1), ), )
+ padding=(padding_t, 1, 1),
+ ),
+ )
self.updown = up or down
if up:
- self.h_upd = Upsample(
- channels,
- False,
- dims,
- kernel_size_t=kernel_size_t,
- padding_t=padding_t)
- self.x_upd = Upsample(
- channels,
- False,
- dims,
- kernel_size_t=kernel_size_t,
- padding_t=padding_t)
+ self.h_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
+ self.x_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
elif down:
- self.h_upd = Downsample(
- channels,
- False,
- dims,
- kernel_size_t=kernel_size_t,
- padding_t=padding_t)
- self.x_upd = Downsample(
- channels,
- False,
- dims,
- kernel_size_t=kernel_size_t,
- padding_t=padding_t)
+ self.h_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
+ self.x_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
else:
self.h_upd = self.x_upd = paddle.nn.Identity()
self.emb_layers = paddle.nn.Sequential(
nonlinearity(nonlinearity_type),
linear(
emb_channels,
- 2 * self.out_channels
- if use_scale_shift_norm else self.out_channels, ), )
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+ ),
+ )
self.out_layers = paddle.nn.Sequential(
normalization(self.out_channels),
nonlinearity(nonlinearity_type),
@@ -257,7 +252,10 @@ def __init__(self,
self.out_channels,
self.out_channels,
(kernel_size_t, 3, 3),
- padding=(padding_t, 1, 1), )), )
+ padding=(padding_t, 1, 1),
+ )
+ ),
+ )
if self.out_channels == channels:
self.skip_connection = paddle.nn.Identity()
elif use_conv:
@@ -266,7 +264,8 @@ def __init__(self,
channels,
self.out_channels,
(kernel_size_t, 3, 3),
- padding=(padding_t, 1, 1), )
+ padding=(padding_t, 1, 1),
+ )
else:
self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
@@ -317,11 +316,9 @@ def _forward(self, x, emb):
# return STTransformerClass
-def make_spatialtemporal_transformer(module_name="attention_temporal",
- class_name="SpatialTemporalTransformer"):
+def make_spatialtemporal_transformer(module_name="attention_temporal", class_name="SpatialTemporalTransformer"):
# Todo: Support loading more types of transformers
- assert (module_name == "attention_temporal" and
- class_name == "SpatialTemporalTransformer")
+ assert module_name == "attention_temporal" and class_name == "SpatialTemporalTransformer"
return SpatialTemporalTransformer
@@ -354,37 +351,39 @@ class LVDMUNet3DModel(ModelMixin, ConfigMixin):
"""
@register_to_config
- def __init__(self,
- image_size,
- in_channels,
- model_channels,
- out_channels,
- num_res_blocks,
- attention_resolutions,
- dropout=0,
- channel_mult=(1, 2, 4, 8),
- conv_resample=True,
- dims=3,
- num_classes=None,
- use_checkpoint=False,
- use_fp16=False,
- num_heads=-1,
- num_head_channels=-1,
- num_heads_upsample=-1,
- use_scale_shift_norm=False,
- resblock_updown=False,
- transformer_depth=1,
- context_dim=None,
- legacy=True,
- kernel_size_t=1,
- padding_t=1,
- use_temporal_transformer=False,
- temporal_length=None,
- use_relative_position=False,
- nonlinearity_type="silu",
- ST_transformer_module="attention_temporal",
- ST_transformer_class="SpatialTemporalTransformer",
- **kwargs):
+ def __init__(
+ self,
+ image_size,
+ in_channels,
+ model_channels,
+ out_channels,
+ num_res_blocks,
+ attention_resolutions,
+ dropout=0,
+ channel_mult=(1, 2, 4, 8),
+ conv_resample=True,
+ dims=3,
+ num_classes=None,
+ use_checkpoint=False,
+ use_fp16=False,
+ num_heads=-1,
+ num_head_channels=-1,
+ num_heads_upsample=-1,
+ use_scale_shift_norm=False,
+ resblock_updown=False,
+ transformer_depth=1,
+ context_dim=None,
+ legacy=True,
+ kernel_size_t=1,
+ padding_t=1,
+ use_temporal_transformer=False,
+ temporal_length=None,
+ use_relative_position=False,
+ nonlinearity_type="silu",
+ ST_transformer_module="attention_temporal",
+ ST_transformer_class="SpatialTemporalTransformer",
+ **kwargs
+ ):
super().__init__()
if use_temporal_transformer:
assert (
@@ -401,11 +400,9 @@ def __init__(self,
if num_heads_upsample == -1:
num_heads_upsample = num_heads
if num_heads == -1:
- assert (num_head_channels != -1
- ), "Either num_heads or num_head_channels has to be set"
+ assert num_head_channels != -1, "Either num_heads or num_head_channels has to be set"
if num_head_channels == -1:
- assert (num_heads != -1
- ), "Either num_heads or num_head_channels has to be set"
+ assert num_heads != -1, "Either num_heads or num_head_channels has to be set"
self.image_size = image_size
self.in_channels = in_channels
self.model_channels = model_channels
@@ -430,20 +427,26 @@ def __init__(self,
self.time_embed = paddle.nn.Sequential(
linear(model_channels, time_embed_dim),
nonlinearity(nonlinearity_type),
- linear(time_embed_dim, time_embed_dim), )
+ linear(time_embed_dim, time_embed_dim),
+ )
if self.num_classes is not None:
self.label_emb = paddle.nn.Embedding(num_classes, time_embed_dim)
STTransformerClass = make_spatialtemporal_transformer(
- module_name=ST_transformer_module, class_name=ST_transformer_class)
- self.input_blocks = paddle.nn.LayerList(sublayers=[
- TimestepEmbedSequential(
- conv_nd(
- dims,
- in_channels,
- model_channels,
- (kernel_size_t, 3, 3),
- padding=(padding_t, 1, 1), ))
- ])
+ module_name=ST_transformer_module, class_name=ST_transformer_class
+ )
+ self.input_blocks = paddle.nn.LayerList(
+ sublayers=[
+ TimestepEmbedSequential(
+ conv_nd(
+ dims,
+ in_channels,
+ model_channels,
+ (kernel_size_t, 3, 3),
+ padding=(padding_t, 1, 1),
+ )
+ )
+ ]
+ )
self._feature_size = model_channels
input_block_chans = [model_channels]
ch = model_channels
@@ -462,7 +465,8 @@ def __init__(self,
kernel_size_t=kernel_size_t,
padding_t=padding_t,
nonlinearity_type=nonlinearity_type,
- **kwargs)
+ **kwargs,
+ )
]
ch = mult * model_channels
if ds in attention_resolutions:
@@ -472,8 +476,7 @@ def __init__(self,
num_heads = ch // num_head_channels
dim_head = num_head_channels
if legacy:
- dim_head = (ch // num_heads if use_temporal_transformer
- else num_head_channels)
+ dim_head = ch // num_heads if use_temporal_transformer else num_head_channels
layers.append(
STAttentionBlock(
ch,
@@ -481,8 +484,10 @@ def __init__(self,
num_heads=num_heads,
num_head_channels=dim_head,
temporal_length=temporal_length,
- use_relative_position=use_relative_position, )
- if not use_temporal_transformer else STTransformerClass(
+ use_relative_position=use_relative_position,
+ )
+ if not use_temporal_transformer
+ else STTransformerClass(
ch,
num_heads,
dim_head,
@@ -490,7 +495,9 @@ def __init__(self,
context_dim=context_dim,
temporal_length=temporal_length,
use_relative_position=use_relative_position,
- **kwargs))
+ **kwargs,
+ )
+ )
self.input_blocks.append(TimestepEmbedSequential(*layers))
self._feature_size += ch
input_block_chans.append(ch)
@@ -510,13 +517,19 @@ def __init__(self,
kernel_size_t=kernel_size_t,
padding_t=padding_t,
nonlinearity_type=nonlinearity_type,
- **kwargs) if resblock_updown else Downsample(
- ch,
- conv_resample,
- dims=dims,
- out_channels=out_ch,
- kernel_size_t=kernel_size_t,
- padding_t=padding_t, )))
+ **kwargs,
+ )
+ if resblock_updown
+ else Downsample(
+ ch,
+ conv_resample,
+ dims=dims,
+ out_channels=out_ch,
+ kernel_size_t=kernel_size_t,
+ padding_t=padding_t,
+ )
+ )
+ )
ch = out_ch
input_block_chans.append(ch)
ds *= 2
@@ -527,8 +540,7 @@ def __init__(self,
num_heads = ch // num_head_channels
dim_head = num_head_channels
if legacy:
- dim_head = (ch // num_heads
- if use_temporal_transformer else num_head_channels)
+ dim_head = ch // num_heads if use_temporal_transformer else num_head_channels
self.middle_block = TimestepEmbedSequential(
ResBlock(
ch,
@@ -540,15 +552,18 @@ def __init__(self,
kernel_size_t=kernel_size_t,
padding_t=padding_t,
nonlinearity_type=nonlinearity_type,
- **kwargs),
+ **kwargs,
+ ),
STAttentionBlock(
ch,
use_checkpoint=use_checkpoint,
num_heads=num_heads,
num_head_channels=dim_head,
temporal_length=temporal_length,
- use_relative_position=use_relative_position, )
- if not use_temporal_transformer else STTransformerClass(
+ use_relative_position=use_relative_position,
+ )
+ if not use_temporal_transformer
+ else STTransformerClass(
ch,
num_heads,
dim_head,
@@ -556,7 +571,8 @@ def __init__(self,
context_dim=context_dim,
temporal_length=temporal_length,
use_relative_position=use_relative_position,
- **kwargs),
+ **kwargs,
+ ),
ResBlock(
ch,
time_embed_dim,
@@ -567,7 +583,9 @@ def __init__(self,
kernel_size_t=kernel_size_t,
padding_t=padding_t,
nonlinearity_type=nonlinearity_type,
- **kwargs), )
+ **kwargs,
+ ),
+ )
self._feature_size += ch
self.output_blocks = paddle.nn.LayerList(sublayers=[])
for level, mult in list(enumerate(channel_mult))[::-1]:
@@ -585,7 +603,8 @@ def __init__(self,
kernel_size_t=kernel_size_t,
padding_t=padding_t,
nonlinearity_type=nonlinearity_type,
- **kwargs)
+ **kwargs,
+ )
]
ch = model_channels * mult
if ds in attention_resolutions:
@@ -595,8 +614,7 @@ def __init__(self,
num_heads = ch // num_head_channels
dim_head = num_head_channels
if legacy:
- dim_head = (ch // num_heads if use_temporal_transformer
- else num_head_channels)
+ dim_head = ch // num_heads if use_temporal_transformer else num_head_channels
layers.append(
STAttentionBlock(
ch,
@@ -604,8 +622,10 @@ def __init__(self,
num_heads=num_heads,
num_head_channels=dim_head,
temporal_length=temporal_length,
- use_relative_position=use_relative_position, )
- if not use_temporal_transformer else STTransformerClass(
+ use_relative_position=use_relative_position,
+ )
+ if not use_temporal_transformer
+ else STTransformerClass(
ch,
num_heads,
dim_head,
@@ -613,7 +633,9 @@ def __init__(self,
context_dim=context_dim,
temporal_length=temporal_length,
use_relative_position=use_relative_position,
- **kwargs))
+ **kwargs,
+ )
+ )
if level and i == num_res_blocks:
out_ch = ch
layers.append(
@@ -629,13 +651,18 @@ def __init__(self,
kernel_size_t=kernel_size_t,
padding_t=padding_t,
nonlinearity_type=nonlinearity_type,
- **kwargs) if resblock_updown else Upsample(
- ch,
- conv_resample,
- dims=dims,
- out_channels=out_ch,
- kernel_size_t=kernel_size_t,
- padding_t=padding_t, ))
+ **kwargs,
+ )
+ if resblock_updown
+ else Upsample(
+ ch,
+ conv_resample,
+ dims=dims,
+ out_channels=out_ch,
+ kernel_size_t=kernel_size_t,
+ padding_t=padding_t,
+ )
+ )
ds //= 2
self.output_blocks.append(TimestepEmbedSequential(*layers))
self._feature_size += ch
@@ -648,7 +675,10 @@ def __init__(self,
model_channels,
out_channels,
(kernel_size_t, 3, 3),
- padding=(padding_t, 1, 1), )), )
+ padding=(padding_t, 1, 1),
+ )
+ ),
+ )
def convert_to_fp16(self):
"""
@@ -666,13 +696,7 @@ def convert_to_fp32(self):
self.middle_block.apply(fn=convert_module_to_f32)
self.output_blocks.apply(fn=convert_module_to_f32)
- def forward(self,
- x,
- timesteps=None,
- time_emb_replace=None,
- context=None,
- y=None,
- **kwargs):
+ def forward(self, x, timesteps=None, time_emb_replace=None, context=None, y=None, **kwargs):
"""
Apply the model to an input batch.
:param x: an [N x C x ...] Tensor of inputs.
@@ -683,13 +707,12 @@ def forward(self,
"""
hs = []
if time_emb_replace is None:
- t_emb = timestep_embedding(
- timesteps, self.model_channels, repeat_only=False)
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
emb = self.time_embed(t_emb)
else:
emb = time_emb_replace
if y is not None:
- assert y.shape == (x.shape[0], )
+ assert y.shape == (x.shape[0],)
emb = emb + self.label_emb(y)
h = x.astype(self.dtype)
for module in self.input_blocks:
@@ -711,42 +734,30 @@ class FrameInterpPredUNet(LVDMUNet3DModel):
may need to input `mask` to indicate condition, as well as noise level `s` for condition augmentation.
"""
- def __init__(self,
- image_size,
- in_channels,
- cond_aug_mode=None,
- *args,
- **kwargs):
+ def __init__(self, image_size, in_channels, cond_aug_mode=None, *args, **kwargs):
super().__init__(image_size, in_channels, *args, **kwargs)
if cond_aug_mode == "time_embed":
self.time_embed_cond = paddle.nn.Sequential(
linear(self.model_channels, self.time_embed_dim),
nonlinearity(self.nonlinearity_type),
- linear(self.time_embed_dim, self.time_embed_dim), )
+ linear(self.time_embed_dim, self.time_embed_dim),
+ )
elif cond_aug_mode == "learned_embed":
pass
- def forward(self,
- x,
- timesteps,
- context=None,
- y=None,
- s=None,
- mask=None,
- **kwargs):
+ def forward(self, x, timesteps, context=None, y=None, s=None, mask=None, **kwargs):
if s is not None:
- s_emb = timestep_embedding(
- s, self.model_channels, repeat_only=False)
+ s_emb = timestep_embedding(s, self.model_channels, repeat_only=False)
s_emb = self.time_embed_cond(s_emb)
- t_emb = timestep_embedding(
- timesteps, self.model_channels, repeat_only=False)
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
emb = self.time_embed(t_emb)
assert emb.dim() == 2
mask_ = mask[:, :, :, (0), (0)]
t = mask.shape[2]
- emb_mix = (emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) *
- (1 - mask_) + s_emb.unsqueeze(axis=2).tile(
- repeat_times=[1, 1, t]) * mask_)
+ emb_mix = (
+ emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * (1 - mask_)
+ + s_emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * mask_
+ )
assert emb_mix.dim() == 3
emb_mix = rearrange(emb_mix, "b c t -> b t c")
time_emb_replace = emb_mix
@@ -754,10 +765,4 @@ def forward(self,
else:
time_emb_replace = None
timesteps = timesteps
- return super().forward(
- x,
- timesteps,
- time_emb_replace=time_emb_replace,
- context=context,
- y=y,
- **kwargs)
+ return super().forward(x, timesteps, time_emb_replace=time_emb_replace, context=context, y=y, **kwargs)
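
Note (not part of the diff): the `emb_mix` expression re-wrapped above blends two time embeddings per frame: frames marked as conditioning (mask == 1) receive the noise-level embedding `s_emb`, all other frames receive the ordinary timestep embedding `emb`. A NumPy sketch with assumed shapes [B, C] for the embeddings and [B, T] for the per-frame mask:

import numpy as np

# Per-frame blend of timestep embedding and condition-augmentation embedding,
# mirroring FrameInterpPredUNet.forward above (before the "b c t -> b t c" rearrange).
def mix_time_embeddings(emb, s_emb, mask_):
    t = mask_.shape[1]
    emb_t = np.repeat(emb[:, :, None], t, axis=2)      # [B, C, T]
    s_emb_t = np.repeat(s_emb[:, :, None], t, axis=2)  # [B, C, T]
    return emb_t * (1 - mask_[:, None, :]) + s_emb_t * mask_[:, None, :]
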
diff --git a/ppdiffusers/ppdiffusers/models/lvdm_util.py b/ppdiffusers/ppdiffusers/models/lvdm_util.py
index 18551f6900d0f..a3c8faa7fb7fe 100644
--- a/ppdiffusers/ppdiffusers/models/lvdm_util.py
+++ b/ppdiffusers/ppdiffusers/models/lvdm_util.py
@@ -27,7 +27,7 @@ def make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2):
"""
mask = paddle.zeros(shape=[t])
mask[:n_interp1] = 1
- mask[t - n_interp2:] = 1
+ mask[t - n_interp2 :] = 1
return mask
@@ -42,14 +42,15 @@ def make_interp_mask_with_framestride(t, device, frame_stride):
def random_temporal_masking(
- input_shape,
- p_interp,
- p_pred,
- device,
- n_interp1=1,
- n_interp2=1,
- n_prevs=[1],
- interp_frame_stride=None, ):
+ input_shape,
+ p_interp,
+ p_pred,
+ device,
+ n_interp1=1,
+ n_interp2=1,
+ n_prevs=[1],
+ interp_frame_stride=None,
+):
"""return mask for masking input, where 1 indicates given real image as condition,
0 indicates noisy samples.
"""
@@ -61,11 +62,9 @@ def random_temporal_masking(
r = random.random()
if r < p_interp:
if interp_frame_stride is not None:
- mask[i] = make_interp_mask_with_framestride(t, device,
- interp_frame_stride)
+ mask[i] = make_interp_mask_with_framestride(t, device, interp_frame_stride)
else:
- mask[i] = make_interp_mask_with_bothsidescond(
- t, device, n_interp1, n_interp2)
+ mask[i] = make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2)
elif p_interp <= r < p_interp + p_pred:
n_pred = random.choice(n_prevs)
mask[(i), :n_pred] = 1
@@ -76,51 +75,35 @@ def random_temporal_masking(
return mask
-def make_beta_schedule(schedule,
- n_timestep,
- linear_start=0.0001,
- linear_end=0.02,
- cosine_s=0.008):
+def make_beta_schedule(schedule, n_timestep, linear_start=0.0001, linear_end=0.02, cosine_s=0.008):
if schedule == "linear":
- betas = (paddle.linspace(
- start=linear_start**0.5, stop=linear_end**0.5,
- num=n_timestep).astype("float64")**2)
+ betas = (
+ paddle.linspace(start=linear_start**0.5, stop=linear_end**0.5, num=n_timestep).astype("float64") ** 2
+ )
elif schedule == "cosine":
- timesteps = (paddle.arange(end=n_timestep + 1).astype("float64") /
- n_timestep + cosine_s)
+ timesteps = paddle.arange(end=n_timestep + 1).astype("float64") / n_timestep + cosine_s
alphas = timesteps / (1 + cosine_s) * np.pi / 2
alphas = paddle.cos(x=alphas).pow(y=2)
alphas = alphas / alphas[0]
betas = 1 - alphas[1:] / alphas[:-1]
betas = np.clip(betas, a_min=0, a_max=0.999)
elif schedule == "sqrt_linear":
- betas = paddle.linspace(
- start=linear_start, stop=linear_end,
- num=n_timestep).astype("float64")
+ betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64")
elif schedule == "sqrt":
- betas = (paddle.linspace(
- start=linear_start, stop=linear_end,
- num=n_timestep).astype("float64")**0.5)
+ betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64") ** 0.5
else:
raise ValueError(f"schedule '{schedule}' unknown.")
return betas.numpy()
-def make_ddim_timesteps(ddim_discr_method,
- num_ddim_timesteps,
- num_ddpm_timesteps,
- verbose=True):
+def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
if ddim_discr_method == "uniform":
c = num_ddpm_timesteps // num_ddim_timesteps
ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
elif ddim_discr_method == "quad":
- ddim_timesteps = (np.linspace(0,
- np.sqrt(num_ddpm_timesteps * 0.8),
- num_ddim_timesteps)**2).astype(int)
+ ddim_timesteps = (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps) ** 2).astype(int)
else:
- raise NotImplementedError(
- f'There is no ddim discretization method called "{ddim_discr_method}"'
- )
+ raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
steps_out = ddim_timesteps + 1
if verbose:
print(f"Selected timesteps for ddim sampler: {steps_out}")
@@ -129,14 +112,10 @@ def make_ddim_timesteps(ddim_discr_method,
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
alphas = alphacums[ddim_timesteps]
- alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]]
- .tolist())
- sigmas = eta * np.sqrt(
- (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
+ alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
+ sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
if verbose:
- print(
- f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}"
- )
+ print(f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}")
print(
f"For the chosen value of eta, which is {eta}, this results in the following sigma_t schedule for ddim sampler {sigmas}"
)
@@ -165,7 +144,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
def extract_into_tensor(a, t, x_shape):
b, *_ = t.shape
out = a.take_along_axis(axis=-1, indices=t)
- return out.reshape([b, *((1, ) * (len(x_shape) - 1))])
+ return out.reshape([b, *((1,) * (len(x_shape) - 1))])
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
@@ -179,14 +158,13 @@ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
"""
if not repeat_only:
half = dim // 2
- freqs = paddle.exp(x=(-math.log(max_period) * paddle.arange(
- start=0, end=half).astype("float32") / half).astype("float32"))
+ freqs = paddle.exp(
+ x=(-math.log(max_period) * paddle.arange(start=0, end=half).astype("float32") / half).astype("float32")
+ )
args = timesteps[:, (None)].astype(dtype="float32") * freqs[None]
- embedding = paddle.concat(
- x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1)
+ embedding = paddle.concat(x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1)
if dim % 2:
- embedding = paddle.concat(
- x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1)
+ embedding = paddle.concat(x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1)
else:
embedding = repeat(timesteps, "b -> b d", d=dim)
return embedding
@@ -232,7 +210,8 @@ def Normalize(in_channels):
num_channels=in_channels,
epsilon=1e-06,
weight_attr=None,
- bias_attr=None, )
+ bias_attr=None,
+ )
def identity(*args, **kwargs):
@@ -249,8 +228,7 @@ def nonlinearity(type="silu"):
class GEGLU(paddle.nn.Layer):
def __init__(self, dim_in, dim_out):
super().__init__()
- self.proj = paddle.nn.Linear(
- in_features=dim_in, out_features=dim_out * 2)
+ self.proj = paddle.nn.Linear(in_features=dim_in, out_features=dim_out * 2)
def forward(self, x):
x, gate = self.proj(x).chunk(chunks=2, axis=-1)
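
Note (not part of the diff): `timestep_embedding` above (the `repeat_only=False` branch) is the standard sinusoidal embedding: a geometric frequency ladder, cosine features concatenated with sine features, zero-padded when `dim` is odd. A NumPy restatement for a shape sanity check:

import math
import numpy as np

# Same computation as lvdm_util.timestep_embedding with repeat_only=False.
def timestep_embedding_np(timesteps, dim, max_period=10000):
    half = dim // 2
    freqs = np.exp(-math.log(max_period) * np.arange(half, dtype=np.float64) / half)
    args = timesteps[:, None].astype(np.float64) * freqs[None]
    emb = np.concatenate([np.cos(args), np.sin(args)], axis=-1)
    if dim % 2:
        emb = np.concatenate([emb, np.zeros_like(emb[:, :1])], axis=-1)
    return emb

print(timestep_embedding_np(np.array([0, 10, 999]), dim=8).shape)  # (3, 8)
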
diff --git a/ppdiffusers/ppdiffusers/models/lvdm_vae.py b/ppdiffusers/ppdiffusers/models/lvdm_vae.py
index 88c1e8a5ac1f0..089afdf908e94 100644
--- a/ppdiffusers/ppdiffusers/models/lvdm_vae.py
+++ b/ppdiffusers/ppdiffusers/models/lvdm_vae.py
@@ -24,11 +24,7 @@
def conv3d(in_channels, out_channels, kernel_size, conv3d_type="SamePadConv3d"):
if conv3d_type == "SamePadConv3d":
- return SamePadConv3d(
- in_channels,
- out_channels,
- kernel_size=kernel_size,
- padding_type="replicate")
+ return SamePadConv3d(in_channels, out_channels, kernel_size=kernel_size, padding_type="replicate")
else:
raise NotImplementedError
@@ -50,23 +46,24 @@ class AutoencoderKLOutput(BaseOutput):
class LVDMAutoencoderKL(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- n_hiddens=32,
- downsample=[4, 8, 8],
- z_channels=4,
- double_z=True,
- image_channel=3,
- norm_type="group",
- padding_type="replicate",
- upsample=[4, 8, 8],
- embed_dim=4,
- # ckpt_path=None,
- # ignore_keys=[],
- image_key="image",
- monitor=None,
- std=1.0,
- mean=0.0,
- prob=0.2, ):
+ self,
+ n_hiddens=32,
+ downsample=[4, 8, 8],
+ z_channels=4,
+ double_z=True,
+ image_channel=3,
+ norm_type="group",
+ padding_type="replicate",
+ upsample=[4, 8, 8],
+ embed_dim=4,
+ # ckpt_path=None,
+ # ignore_keys=[],
+ image_key="image",
+ monitor=None,
+ std=1.0,
+ mean=0.0,
+ prob=0.2,
+ ):
super().__init__()
self.image_key = image_key
# pass init params to Encoder
@@ -77,7 +74,8 @@ def __init__(
double_z=double_z,
image_channel=image_channel,
norm_type=norm_type,
- padding_type=padding_type, )
+ padding_type=padding_type,
+ )
# pass init params to Decoder
self.decoder = Decoder(
@@ -85,7 +83,8 @@ def __init__(
upsample=upsample,
z_channels=z_channels,
image_channel=image_channel,
- norm_type="group", )
+ norm_type="group",
+ )
self.quant_conv = conv3d(2 * z_channels, 2 * embed_dim, 1)
self.post_quant_conv = conv3d(embed_dim, z_channels, 1)
diff --git a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py
index bf8d26d5beaf5..213b2efdd2ca9 100644
--- a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py
+++ b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py
@@ -20,9 +20,7 @@
#####################
-def convert_pytorch_state_dict_to_paddle(pt_state_dict,
- paddle_model: nn.Layer,
- sub_layer=None):
+def convert_pytorch_state_dict_to_paddle(pt_state_dict, paddle_model: nn.Layer, sub_layer=None):
# Step 1: Find Linear layer which need transpose weight
linear_need_transpose = []
for k, v in paddle_model.named_sublayers(include_self=True):
@@ -51,7 +49,7 @@ def convert_pytorch_state_dict_to_paddle(pt_state_dict,
pt_tensor = pt_tensor.T
# (2) 0d tensor -> 1d tensor
if pt_tensor.ndim == 0:
- pt_tensor = pt_tensor.reshape((1, ))
+ pt_tensor = pt_tensor.reshape((1,))
# (3) name mapping
for old_key, new_key in ptname2pdname.items():
pt_key = pt_key.replace(old_key, new_key)
@@ -61,10 +59,7 @@ def convert_pytorch_state_dict_to_paddle(pt_state_dict,
@classmethod
-def convert_pytorch_state_dict_to_paddle_class_method(cls,
- pt_state_dict,
- paddle_model: nn.Layer,
- sub_layer=None):
+def convert_pytorch_state_dict_to_paddle_class_method(cls, pt_state_dict, paddle_model: nn.Layer, sub_layer=None):
# Step 1: Find Linear layer which need transpose weight
linear_need_transpose = []
for k, v in paddle_model.named_sublayers(include_self=True):
@@ -96,7 +91,7 @@ def convert_pytorch_state_dict_to_paddle_class_method(cls,
pt_tensor = pt_tensor.T
# (2) 0d tensor -> 1d tensor
if pt_tensor.ndim == 0:
- pt_tensor = pt_tensor.reshape((1, ))
+ pt_tensor = pt_tensor.reshape((1,))
# (3) name mapping
for old_key, new_key in ptname2pdname.items():
pt_key = pt_key.replace(old_key, new_key)
@@ -137,9 +132,7 @@ def convert_paddle_state_dict_to_pytorch(pd_state_dict, paddle_model: nn.Layer):
pd_key = pd_key.replace(new_key, old_key)
if hasattr(paddle_model, "paddle_torch_name_mapping"):
pd_key = paddle_model.paddle_torch_name_mapping.get(pd_key, pd_key)
- pytorch_state_dict[pd_key] = (pd_tensor.contiguous()
- if hasattr(pd_tensor, "contiguous") else
- pd_tensor)
+ pytorch_state_dict[pd_key] = pd_tensor.contiguous() if hasattr(pd_tensor, "contiguous") else pd_tensor
return pytorch_state_dict
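
Note (not part of the diff): the converters above transpose Linear weights because PyTorch's `nn.Linear` stores its weight as [out_features, in_features] while Paddle's `nn.Linear` stores it as [in_features, out_features]; 0-d tensors are also promoted to shape (1,) on the Paddle side. A toy NumPy illustration of both conventions:

import numpy as np

# PyTorch-style Linear weight [out, in] vs Paddle-style [in, out]
pt_weight = np.random.randn(8, 4)   # torch convention
pd_weight = pt_weight.T             # paddle convention, as done in the converter
assert pd_weight.shape == (4, 8)

# scalar (0-d) buffers become shape (1,), mirroring pt_tensor.reshape((1,)) above
scalar = np.array(3.14)
assert scalar.reshape((1,)).shape == (1,)
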
diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py
index 27514475bc7c2..bf9ed3663d724 100644
--- a/ppdiffusers/ppdiffusers/models/modeling_utils.py
+++ b/ppdiffusers/ppdiffusers/models/modeling_utils.py
@@ -21,16 +21,33 @@
import paddle
import paddle.nn as nn
-from ..utils import (CONFIG_NAME, DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB,
- HF_HUB_OFFLINE, LOW_CPU_MEM_USAGE_DEFAULT,
- PADDLE_WEIGHTS_NAME, PPDIFFUSERS_CACHE, TO_DIFFUSERS,
- TORCH_SAFETENSORS_WEIGHTS_NAME, TORCH_WEIGHTS_NAME,
- _add_variant, _get_model_file, deprecate,
- is_paddlenlp_available, is_safetensors_available,
- is_torch_available, is_torch_file, logging, smart_load)
+from ..utils import (
+ CONFIG_NAME,
+ DIFFUSERS_CACHE,
+ FROM_DIFFUSERS,
+ FROM_HF_HUB,
+ HF_HUB_OFFLINE,
+ LOW_CPU_MEM_USAGE_DEFAULT,
+ PADDLE_WEIGHTS_NAME,
+ PPDIFFUSERS_CACHE,
+ TO_DIFFUSERS,
+ TORCH_SAFETENSORS_WEIGHTS_NAME,
+ TORCH_WEIGHTS_NAME,
+ _add_variant,
+ _get_model_file,
+ deprecate,
+ is_paddlenlp_available,
+ is_safetensors_available,
+ is_torch_available,
+ is_torch_file,
+ logging,
+ smart_load,
+)
from ..version import VERSION as __version__
from .modeling_pytorch_paddle_utils import (
- convert_paddle_state_dict_to_pytorch, convert_pytorch_state_dict_to_paddle)
+ convert_paddle_state_dict_to_pytorch,
+ convert_pytorch_state_dict_to_paddle,
+)
logger = logging.get_logger(__name__)
@@ -87,11 +104,7 @@ def convert_state_dict(state_dict, framework="torch"):
state_dict = {k: v.cpu().numpy() for k, v in state_dict.items()}
return state_dict
elif framework in ["paddle", "pd"]:
- state_dict = {
- k: paddle.to_tensor(
- v, place="cpu")
- for k, v in state_dict.items()
- }
+ state_dict = {k: paddle.to_tensor(v, place="cpu") for k, v in state_dict.items()}
return state_dict
else:
raise NotImplementedError(f"Not Implemented {framework} framework!")
@@ -129,9 +142,7 @@ class ModelMixin(nn.Layer):
[`~models.ModelMixin.save_pretrained`].
"""
config_name = CONFIG_NAME
- _automatically_saved_args = [
- "_ppdiffusers_version", "_class_name", "_name_or_path"
- ]
+ _automatically_saved_args = ["_ppdiffusers_version", "_class_name", "_name_or_path"]
_supports_gradient_checkpointing = False
def __init__(self):
@@ -144,8 +155,7 @@ def __getattr__(self, name: str) -> Any:
https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
"""
- is_in_config = "_internal_dict" in self.__dict__ and hasattr(
- self.__dict__["_internal_dict"], name)
+ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
is_attribute = name in self.__dict__
if is_in_config and not is_attribute:
@@ -155,7 +165,8 @@ def __getattr__(self, name: str) -> Any:
"1.0.0",
deprecation_message,
standard_warn=False,
- stacklevel=3, )
+ stacklevel=3,
+ )
return self._internal_dict[name]
# call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
@@ -171,7 +182,8 @@ def is_gradient_checkpointing(self) -> bool:
"""
return any(
hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing
- for m in self.sublayers(include_self=True))
+ for m in self.sublayers(include_self=True)
+ )
def enable_gradient_checkpointing(self):
"""
@@ -181,9 +193,7 @@ def enable_gradient_checkpointing(self):
activations".
"""
if not self._supports_gradient_checkpointing:
- raise ValueError(
- f"{self.__class__.__name__} does not support gradient checkpointing."
- )
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
self.apply(partial(self._set_gradient_checkpointing, value=True))
def disable_gradient_checkpointing(self):
@@ -196,15 +206,13 @@ def disable_gradient_checkpointing(self):
if self._supports_gradient_checkpointing:
self.apply(partial(self._set_gradient_checkpointing, value=False))
- def set_use_memory_efficient_attention_xformers(
- self, valid: bool, attention_op: Optional[str]=None) -> None:
+ def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None:
# Recursively walk through all the children.
# Any children which exposes the set_use_memory_efficient_attention_xformers method
# gets the message
def fn_recursive_set_mem_eff(module: nn.Layer):
if hasattr(module, "set_use_memory_efficient_attention_xformers"):
- module.set_use_memory_efficient_attention_xformers(valid,
- attention_op)
+ module.set_use_memory_efficient_attention_xformers(valid, attention_op)
for child in module.children():
fn_recursive_set_mem_eff(child)
@@ -213,8 +221,7 @@ def fn_recursive_set_mem_eff(module: nn.Layer):
if isinstance(module, nn.Layer):
fn_recursive_set_mem_eff(module)
- def enable_xformers_memory_efficient_attention(
- self, attention_op: Optional[str]=None):
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None):
r"""
Enable memory efficient attention as implemented in xformers.
@@ -249,13 +256,14 @@ def disable_xformers_memory_efficient_attention(self):
self.set_use_memory_efficient_attention_xformers(False)
def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- is_main_process: bool=True,
- save_function: Callable=None,
- safe_serialization: bool=False,
- variant: Optional[str]=None,
- to_diffusers: Optional[bool]=None, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ is_main_process: bool = True,
+ save_function: Callable = None,
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ to_diffusers: Optional[bool] = None,
+ ):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
`[`~models.ModelMixin.from_pretrained`]` class method.
@@ -280,16 +288,11 @@ def save_pretrained(
"""
if to_diffusers is None:
to_diffusers = TO_DIFFUSERS
- if to_diffusers and safe_serialization and not is_safetensors_available(
- ):
- raise ImportError(
- "`safe_serialization` requires the `safetensors library: `pip install safetensors`."
- )
+ if to_diffusers and safe_serialization and not is_safetensors_available():
+            raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.")
if os.path.isfile(save_directory):
- logger.error(
- f"Provided path ({save_directory}) should be a directory, not a file"
- )
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
@@ -314,14 +317,11 @@ def save_pretrained(
if safe_serialization:
if is_torch_available():
save_function = safetensors_torch_save_file
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
else:
save_function = safetensors_numpy_save_file
- state_dict = convert_state_dict(
- state_dict, framework="numpy")
- weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME,
- variant)
+ state_dict = convert_state_dict(state_dict, framework="numpy")
+ weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant)
else:
if not is_torch_available():
raise ImportError(
@@ -329,11 +329,9 @@ def save_pretrained(
)
save_function = torch.save
weights_name = _add_variant(TORCH_WEIGHTS_NAME, variant)
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
- state_dict = convert_paddle_state_dict_to_pytorch(state_dict,
- model_to_save)
+ state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save)
else:
save_function = paddle.save
weights_name = _add_variant(PADDLE_WEIGHTS_NAME, variant)
@@ -341,15 +339,10 @@ def save_pretrained(
# Save the model
save_function(state_dict, os.path.join(save_directory, weights_name))
- logger.info(
- f"Model weights saved in {os.path.join(save_directory, weights_name)}"
- )
+ logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
@classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
- **kwargs):
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
r"""
Instantiate a pretrained pytorch model from a pre-trained model configuration.
@@ -425,8 +418,9 @@ def from_pretrained(
"""
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
force_download = kwargs.pop("force_download", False)
from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
@@ -439,13 +433,11 @@ def from_pretrained(
paddle_dtype = kwargs.pop("paddle_dtype", None)
subfolder = kwargs.pop("subfolder", None)
ignore_keys = kwargs.pop("ignore_keys", None)
- low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage",
- LOW_CPU_MEM_USAGE_DEFAULT)
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT)
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop("use_safetensors", None)
- if from_diffusers and use_safetensors and not is_safetensors_available(
- ):
+ if from_diffusers and use_safetensors and not is_safetensors_available():
            raise ValueError(
                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors`"
            )
@@ -476,7 +468,8 @@ def from_pretrained(
subfolder=subfolder,
user_agent=user_agent,
from_hf_hub=from_hf_hub, # whether or not from_hf_hub
- **kwargs, )
+ **kwargs,
+ )
# This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the
# Load model
@@ -486,8 +479,7 @@ def from_pretrained(
try:
model_file = _get_model_file(
pretrained_model_name_or_path,
- weights_name=_add_variant(
- TORCH_SAFETENSORS_WEIGHTS_NAME, variant),
+ weights_name=_add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant),
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -498,7 +490,8 @@ def from_pretrained(
subfolder=subfolder,
user_agent=user_agent,
commit_hash=commit_hash,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
# try load model_file with paddle / torch / safetensor
state_dict = smart_load(model_file)
except Exception:
@@ -518,7 +511,8 @@ def from_pretrained(
subfolder=subfolder,
user_agent=user_agent,
commit_hash=commit_hash,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
# try load model_file with paddle / torch / safetensor
state_dict = smart_load(model_file)
else:
@@ -535,18 +529,19 @@ def from_pretrained(
subfolder=subfolder,
user_agent=user_agent,
commit_hash=commit_hash,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
# try load model_file with paddle / torch / safetensor
state_dict = smart_load(model_file)
init_contexts = []
- dtype = set(v.dtype for v in state_dict.values()
- if paddle.is_tensor(v) and paddle.is_floating_point(v))
+ dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v))
if len(dtype) > 1 and paddle.float32 not in dtype:
raise ValueError(
f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please"
- f" make sure that {model_file} weights have only one dtype.")
+ f" make sure that {model_file} weights have only one dtype."
+ )
elif len(dtype) > 1 and paddle.float32 in dtype:
dtype = paddle.float32
elif len(dtype) == 0:
@@ -580,21 +575,16 @@ def from_pretrained(
for k in keys:
for ik in ignore_keys:
if k.startswith(ik):
- logger.warning(
- "Deleting key {} from state_dict.".format(k))
+ logger.warning("Deleting key {} from state_dict.".format(k))
del state_dict[k]
- (
+ (model, missing_keys, unexpected_keys, mismatched_keys, error_msgs,) = cls._load_pretrained_model(
model,
- missing_keys,
- unexpected_keys,
- mismatched_keys,
- error_msgs, ) = cls._load_pretrained_model(
- model,
- state_dict,
- model_file,
- pretrained_model_name_or_path,
- ignore_mismatched_sizes=ignore_mismatched_sizes, )
+ state_dict,
+ model_file,
+ pretrained_model_name_or_path,
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
+ )
loading_info = {
"missing_keys": missing_keys,
@@ -621,12 +611,13 @@ def from_pretrained(
@classmethod
def _load_pretrained_model(
- cls,
- model,
- state_dict,
- resolved_archive_file,
- pretrained_model_name_or_path,
- ignore_mismatched_sizes=False, ):
+ cls,
+ model,
+ state_dict,
+ resolved_archive_file,
+ pretrained_model_name_or_path,
+ ignore_mismatched_sizes=False,
+ ):
# Retrieve missing & unexpected_keys
model_state_dict = model.state_dict()
loaded_keys = list(state_dict.keys())
@@ -642,21 +633,25 @@ def _load_pretrained_model(
model_to_load = model
def _find_mismatched_keys(
- state_dict,
- model_state_dict,
- loaded_keys,
- ignore_mismatched_sizes, ):
+ state_dict,
+ model_state_dict,
+ loaded_keys,
+ ignore_mismatched_sizes,
+ ):
mismatched_keys = []
for checkpoint_key in loaded_keys:
model_key = checkpoint_key
- if model_key in model_state_dict and list(state_dict[
- checkpoint_key].shape) != list(model_state_dict[
- model_key].shape):
- mismatched_keys.append((
- checkpoint_key,
- state_dict[checkpoint_key].shape,
- model_state_dict[model_key].shape, ))
+ if model_key in model_state_dict and list(state_dict[checkpoint_key].shape) != list(
+ model_state_dict[model_key].shape
+ ):
+ mismatched_keys.append(
+ (
+ checkpoint_key,
+ state_dict[checkpoint_key].shape,
+ model_state_dict[model_key].shape,
+ )
+ )
del state_dict[checkpoint_key]
if ignore_mismatched_sizes:
mismatched_keys = []
@@ -668,7 +663,8 @@ def _find_mismatched_keys(
state_dict,
model_state_dict,
original_loaded_keys,
- ignore_mismatched_sizes, )
+ ignore_mismatched_sizes,
+ )
error_msgs = []
for key_name, loaded_shape, model_shape in mismatched_keys:
error_msgs.append(
@@ -679,10 +675,10 @@ def _find_mismatched_keys(
if len(error_msgs) > 0:
error_msg = "\n\t".join(error_msgs)
if "size mismatch" in error_msg:
- error_msg += "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
- raise RuntimeError(
- f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}"
- )
+ error_msg += (
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
+ )
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
if len(unexpected_keys) > 0:
logger.warning(
@@ -693,11 +689,10 @@ def _find_mismatched_keys(
" BertForPreTraining model).\n- This IS NOT expected if you are initializing"
f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
" identical (initializing a BertForSequenceClassification model from a"
- " BertForSequenceClassification model).")
- else:
- logger.info(
- f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n"
+ " BertForSequenceClassification model)."
)
+ else:
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
@@ -709,17 +704,21 @@ def _find_mismatched_keys(
f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
- " without further training.")
+ " without further training."
+ )
if len(mismatched_keys) > 0:
- mismatched_warning = "\n".join([
- f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
- for key, shape1, shape2 in mismatched_keys
- ])
+ mismatched_warning = "\n".join(
+ [
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
+ for key, shape1, shape2 in mismatched_keys
+ ]
+ )
logger.warning(
f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
- " able to use it for predictions and inference.")
+ " able to use it for predictions and inference."
+ )
return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
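
The mismatched-key bookkeeping above boils down to a shape comparison between the checkpoint and the freshly initialized model. A toy sketch of that check (illustrative names only, not part of the patch):

import paddle

# Keys whose tensor shapes differ between checkpoint and model are recorded,
# then dropped from the state dict before loading.
model_state = {"linear.weight": paddle.zeros([4, 8])}
ckpt_state = {"linear.weight": paddle.zeros([4, 16]), "extra.bias": paddle.zeros([4])}
mismatched = [
    (key, ckpt_state[key].shape, model_state[key].shape)
    for key in ckpt_state
    if key in model_state and list(ckpt_state[key].shape) != list(model_state[key].shape)
]
print(mismatched)  # [('linear.weight', [4, 16], [4, 8])]
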
@@ -738,9 +737,7 @@ def dtype(self) -> paddle.dtype:
"""
return get_parameter_dtype(self)
- def num_parameters(self,
- only_trainable: bool=False,
- exclude_embeddings: bool=False) -> int:
+ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
"""
Get number of (optionally, trainable or non-embeddings) parameters in the module.
@@ -762,14 +759,11 @@ def num_parameters(self,
if isinstance(module_type, nn.Embedding)
]
non_embedding_parameters = [
- parameter for name, parameter in self.named_parameters()
- if name not in embedding_param_names
+ parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
]
- return sum(p.numel() for p in non_embedding_parameters
- if not p.stop_gradient or not only_trainable)
+ return sum(p.numel() for p in non_embedding_parameters if not p.stop_gradient or not only_trainable)
else:
- return sum(p.numel() for p in self.parameters()
- if not p.stop_gradient or not only_trainable)
+ return sum(p.numel() for p in self.parameters() if not p.stop_gradient or not only_trainable)
def unfreeze_params(params):
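
For orientation, a minimal usage sketch of the loader reformatted above (not part of the patch; the checkpoint id is a placeholder and the kwargs are assumptions read off the signature):

from ppdiffusers import UNet2DConditionModel

# ignore_mismatched_sizes=True skips shape-mismatched keys instead of raising,
# as the error message in _load_pretrained_model suggests.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # placeholder checkpoint id
    subfolder="unet",
    ignore_mismatched_sizes=True,
)
# num_parameters counts weights with stop_gradient=False when only_trainable=True.
print(unet.num_parameters(only_trainable=True, exclude_embeddings=True))
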
diff --git a/ppdiffusers/ppdiffusers/models/prior_transformer.py b/ppdiffusers/ppdiffusers/models/prior_transformer.py
index 8d1b6af0782a0..90c1da6ee3232 100644
--- a/ppdiffusers/ppdiffusers/models/prior_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/prior_transformer.py
@@ -65,14 +65,15 @@ class PriorTransformer(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_attention_heads: int=32,
- attention_head_dim: int=64,
- num_layers: int=20,
- embedding_dim: int=768,
- num_embeddings=77,
- additional_embeddings=4,
- dropout: float=0.0, ):
+ self,
+ num_attention_heads: int = 32,
+ attention_head_dim: int = 64,
+ num_layers: int = 20,
+ embedding_dim: int = 768,
+ num_embeddings=77,
+ additional_embeddings=4,
+ dropout: float = 0.0,
+ ):
super().__init__()
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
@@ -90,20 +91,26 @@ def __init__(
self.positional_embedding = self.create_parameter(
(1, num_embeddings + additional_embeddings, inner_dim),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(0.0), )
+ default_initializer=nn.initializer.Constant(0.0),
+ )
self.prd_embedding = self.create_parameter(
(1, 1, inner_dim),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(0.0), )
- self.transformer_blocks = nn.LayerList([
- BasicTransformerBlock(
- inner_dim,
- num_attention_heads,
- attention_head_dim,
- dropout=dropout,
- activation_fn="gelu",
- attention_bias=True, ) for d in range(num_layers)
- ])
+ default_initializer=nn.initializer.Constant(0.0),
+ )
+ self.transformer_blocks = nn.LayerList(
+ [
+ BasicTransformerBlock(
+ inner_dim,
+ num_attention_heads,
+ attention_head_dim,
+ dropout=dropout,
+ activation_fn="gelu",
+ attention_bias=True,
+ )
+ for d in range(num_layers)
+ ]
+ )
self.norm_out = nn.LayerNorm(inner_dim)
self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim)
@@ -114,29 +121,33 @@ def __init__(
num_embeddings + additional_embeddings,
num_embeddings + additional_embeddings,
],
- NEG_INF, ),
- 1, )
+ NEG_INF,
+ ),
+ 1,
+ )
causal_attention_mask = causal_attention_mask.unsqueeze(0)
- self.register_buffer(
- "causal_attention_mask", causal_attention_mask, persistable=False)
+ self.register_buffer("causal_attention_mask", causal_attention_mask, persistable=False)
self.clip_mean = self.create_parameter(
(1, embedding_dim),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(0.0), )
+ default_initializer=nn.initializer.Constant(0.0),
+ )
self.clip_std = self.create_parameter(
(1, embedding_dim),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(0.0), )
+ default_initializer=nn.initializer.Constant(0.0),
+ )
def forward(
- self,
- hidden_states,
- timestep: Union[paddle.Tensor, float, int],
- proj_embedding: paddle.Tensor,
- encoder_hidden_states: paddle.Tensor,
- attention_mask: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ):
+ self,
+ hidden_states,
+ timestep: Union[paddle.Tensor, float, int],
+ proj_embedding: paddle.Tensor,
+ encoder_hidden_states: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ):
"""
Args:
hidden_states (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
@@ -168,8 +179,7 @@ def forward(
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps * paddle.ones(
- (batch_size, ), dtype=timesteps.dtype)
+ timesteps = timesteps * paddle.ones((batch_size,), dtype=timesteps.dtype)
timesteps_projected = self.time_proj(timesteps)
@@ -179,13 +189,10 @@ def forward(
time_embeddings = self.time_embedding(timesteps_projected)
proj_embeddings = self.embedding_proj(proj_embedding)
- encoder_hidden_states = self.encoder_hidden_states_proj(
- encoder_hidden_states)
+ encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
hidden_states = self.proj_in(hidden_states)
- prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand(
- [batch_size, -1, -1])
- positional_embeddings = self.positional_embedding.cast(
- hidden_states.dtype)
+ prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand([batch_size, -1, -1])
+ positional_embeddings = self.positional_embedding.cast(hidden_states.dtype)
hidden_states = paddle.concat(
[
@@ -195,23 +202,21 @@ def forward(
hidden_states[:, None, :],
prd_embedding,
],
- axis=1, )
+ axis=1,
+ )
hidden_states = hidden_states + positional_embeddings
if attention_mask is not None:
- attention_mask = (
- 1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF
+ attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF
attention_mask = F.pad(
attention_mask.unsqueeze(0),
(0, self.additional_embeddings),
value=0.0,
- data_format="NCL", ).squeeze(0)
- attention_mask = (
- attention_mask[:, None, :] + self.causal_attention_mask
- ).cast(hidden_states.dtype)
- attention_mask = attention_mask.repeat_interleave(
- self.config.num_attention_heads, axis=0)
+ data_format="NCL",
+ ).squeeze(0)
+ attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).cast(hidden_states.dtype)
+ attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, axis=0)
for block in self.transformer_blocks:
hidden_states = block(hidden_states, attention_mask=attention_mask)
@@ -221,10 +226,9 @@ def forward(
predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
if not return_dict:
- return (predicted_image_embedding, )
+ return (predicted_image_embedding,)
- return PriorTransformerOutput(
- predicted_image_embedding=predicted_image_embedding)
+ return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
def post_process_latents(self, prior_latents):
prior_latents = (prior_latents * self.clip_std) + self.clip_mean
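
A hedged shape check for the PriorTransformer touched above; the sizes follow the forward docstring, while the random inputs and the reduced num_layers are illustrative only:

import paddle
from ppdiffusers.models.prior_transformer import PriorTransformer

prior = PriorTransformer(
    num_attention_heads=32, attention_head_dim=64, num_layers=2,
    embedding_dim=768, num_embeddings=77, additional_embeddings=4,
)
batch = 2
latents = paddle.randn([batch, 768])               # hidden_states: noisy image embedding
proj = paddle.randn([batch, 768])                  # proj_embedding: CLIP text embedding
enc = paddle.randn([batch, 77, 768])               # encoder_hidden_states: text token states
t = paddle.to_tensor([10] * batch, dtype="int64")  # timestep, broadcast inside forward
out = prior(latents, t, proj_embedding=proj, encoder_hidden_states=enc)
print(out.predicted_image_embedding.shape)         # expected: [2, 768]
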
diff --git a/ppdiffusers/ppdiffusers/models/resnet.py b/ppdiffusers/ppdiffusers/models/resnet.py
index 60998dc3fc1b7..39bf23c59264d 100644
--- a/ppdiffusers/ppdiffusers/models/resnet.py
+++ b/ppdiffusers/ppdiffusers/models/resnet.py
@@ -37,12 +37,13 @@ class Upsample1D(nn.Layer):
"""
def __init__(
- self,
- channels,
- use_conv=False,
- use_conv_transpose=False,
- out_channels=None,
- name="conv", ):
+ self,
+ channels,
+ use_conv=False,
+ use_conv_transpose=False,
+ out_channels=None,
+ name="conv",
+ ):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
@@ -54,8 +55,7 @@ def __init__(
if use_conv_transpose:
self.conv = nn.Conv1DTranspose(channels, self.out_channels, 4, 2, 1)
elif use_conv:
- self.conv = nn.Conv1D(
- self.channels, self.out_channels, 3, padding=1)
+ self.conv = nn.Conv1D(self.channels, self.out_channels, 3, padding=1)
def forward(self, x):
assert x.shape[1] == self.channels
@@ -81,12 +81,7 @@ class Downsample1D(nn.Layer):
padding:
"""
- def __init__(self,
- channels,
- use_conv=False,
- out_channels=None,
- padding=1,
- name="conv"):
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
@@ -96,12 +91,7 @@ def __init__(self,
self.name = name
if use_conv:
- self.conv = nn.Conv1D(
- self.channels,
- self.out_channels,
- 3,
- stride=stride,
- padding=padding)
+ self.conv = nn.Conv1D(self.channels, self.out_channels, 3, stride=stride, padding=padding)
else:
assert self.channels == self.out_channels
self.conv = nn.AvgPool1D(kernel_size=stride, stride=stride)
@@ -123,12 +113,13 @@ class Upsample2D(nn.Layer):
"""
def __init__(
- self,
- channels,
- use_conv=False,
- use_conv_transpose=False,
- out_channels=None,
- name="conv", ):
+ self,
+ channels,
+ use_conv=False,
+ use_conv_transpose=False,
+ out_channels=None,
+ name="conv",
+ ):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
@@ -164,11 +155,9 @@ def forward(self, hidden_states, output_size=None):
# if `output_size` is passed we force the interpolation output
# size and do not make use of `scale_factor=2`
if output_size is None:
- hidden_states = F.interpolate(
- hidden_states, scale_factor=2.0, mode="nearest")
+ hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
else:
- hidden_states = F.interpolate(
- hidden_states, size=output_size, mode="nearest")
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
# If the input is bfloat16, we cast back to bfloat16
if dtype == paddle.bfloat16:
@@ -195,12 +184,7 @@ class Downsample2D(nn.Layer):
padding:
"""
- def __init__(self,
- channels,
- use_conv=False,
- out_channels=None,
- padding=1,
- name="conv"):
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
@@ -210,12 +194,7 @@ def __init__(self,
self.name = name
if use_conv:
- conv = nn.Conv2D(
- self.channels,
- self.out_channels,
- 3,
- stride=stride,
- padding=padding)
+ conv = nn.Conv2D(self.channels, self.out_channels, 3, stride=stride, padding=padding)
else:
assert self.channels == self.out_channels
conv = nn.AvgPool2D(kernel_size=stride, stride=stride)
@@ -242,26 +221,16 @@ def forward(self, hidden_states):
class FirUpsample2D(nn.Layer):
- def __init__(self,
- channels=None,
- out_channels=None,
- use_conv=False,
- fir_kernel=(1, 3, 3, 1)):
+ def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
super().__init__()
out_channels = out_channels if out_channels else channels
if use_conv:
- self.Conv2d_0 = nn.Conv2D(
- channels, out_channels, kernel_size=3, stride=1, padding=1)
+ self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1)
self.use_conv = use_conv
self.fir_kernel = fir_kernel
self.out_channels = out_channels
- def _upsample_2d(self,
- hidden_states,
- weight=None,
- kernel=None,
- factor=2,
- gain=1):
+ def _upsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1):
"""Fused `upsample_2d()` followed by `Conv2d()`.
Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
@@ -307,12 +276,12 @@ def _upsample_2d(self,
# Determine data dimensions.
output_shape = (
(hidden_states.shape[2] - 1) * factor + convH,
- (hidden_states.shape[3] - 1) * factor + convW, )
+ (hidden_states.shape[3] - 1) * factor + convW,
+ )
output_padding = (
- output_shape[0] -
- (hidden_states.shape[2] - 1) * stride[0] - convH,
- output_shape[1] -
- (hidden_states.shape[3] - 1) * stride[1] - convW, )
+ output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH,
+ output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW,
+ )
assert output_padding[0] >= 0 and output_padding[1] >= 0
num_groups = hidden_states.shape[1] // inC
@@ -326,55 +295,46 @@ def _upsample_2d(self,
weight,
stride=stride,
output_padding=output_padding,
- padding=0, )
+ padding=0,
+ )
output = upfirdn2d_native(
inverse_conv,
paddle.to_tensor(kernel),
- pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), )
+ pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1),
+ )
else:
pad_value = kernel.shape[0] - factor
output = upfirdn2d_native(
hidden_states,
paddle.to_tensor(kernel),
up=factor,
- pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), )
+ pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
+ )
return output
def forward(self, hidden_states):
if self.use_conv:
- height = self._upsample_2d(
- hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel)
+ height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel)
height = height + self.Conv2d_0.bias.reshape([1, -1, 1, 1])
else:
- height = self._upsample_2d(
- hidden_states, kernel=self.fir_kernel, factor=2)
+ height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
return height
class FirDownsample2D(nn.Layer):
- def __init__(self,
- channels=None,
- out_channels=None,
- use_conv=False,
- fir_kernel=(1, 3, 3, 1)):
+ def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
super().__init__()
out_channels = out_channels if out_channels else channels
if use_conv:
- self.Conv2d_0 = nn.Conv2D(
- channels, out_channels, kernel_size=3, stride=1, padding=1)
+ self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1)
self.fir_kernel = fir_kernel
self.use_conv = use_conv
self.out_channels = out_channels
- def _downsample_2d(self,
- hidden_states,
- weight=None,
- kernel=None,
- factor=2,
- gain=1):
+ def _downsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1):
"""Fused `Conv2d()` followed by `downsample_2d()`.
Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
@@ -414,30 +374,26 @@ def _downsample_2d(self,
upfirdn_input = upfirdn2d_native(
hidden_states,
paddle.to_tensor(kernel),
- pad=((pad_value + 1) // 2, pad_value // 2), )
- output = F.conv2d(
- upfirdn_input, weight, stride=stride_value, padding=0)
+ pad=((pad_value + 1) // 2, pad_value // 2),
+ )
+ output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0)
else:
pad_value = kernel.shape[0] - factor
output = upfirdn2d_native(
hidden_states,
paddle.to_tensor(kernel),
down=factor,
- pad=((pad_value + 1) // 2, pad_value // 2), )
+ pad=((pad_value + 1) // 2, pad_value // 2),
+ )
return output
def forward(self, hidden_states):
if self.use_conv:
- downsample_input = self._downsample_2d(
- hidden_states,
- weight=self.Conv2d_0.weight,
- kernel=self.fir_kernel)
- hidden_states = downsample_input + self.Conv2d_0.bias.reshape(
- [1, -1, 1, 1])
+ downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
+ hidden_states = downsample_input + self.Conv2d_0.bias.reshape([1, -1, 1, 1])
else:
- hidden_states = self._downsample_2d(
- hidden_states, kernel=self.fir_kernel, factor=2)
+ hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
return hidden_states
@@ -451,18 +407,16 @@ def __init__(self, pad_mode="reflect"):
self.pad = kernel_1d.shape[1] // 2 - 1
self.register_buffer(
"kernel",
- paddle.matmul(
- kernel_1d, kernel_1d, transpose_x=True),
- persistable=False, )
+ paddle.matmul(kernel_1d, kernel_1d, transpose_x=True),
+ persistable=False,
+ )
def forward(self, x):
- x = F.pad(x, (self.pad, ) * 4, self.pad_mode)
+ x = F.pad(x, (self.pad,) * 4, self.pad_mode)
weight = paddle.zeros(
- [
- x.shape[1], x.shape[1], self.kernel.shape[0],
- self.kernel.shape[1]
- ],
- dtype=x.dtype, )
+ [x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]],
+ dtype=x.dtype,
+ )
indices = paddle.arange(x.shape[1])
# TODO verify this method
weight[indices, indices] = self.kernel.cast(weight.dtype)
@@ -477,18 +431,16 @@ def __init__(self, pad_mode="reflect"):
self.pad = kernel_1d.shape[1] // 2 - 1
self.register_buffer(
"kernel",
- paddle.matmul(
- kernel_1d, kernel_1d, transpose_x=True),
- persistable=False, )
+ paddle.matmul(kernel_1d, kernel_1d, transpose_x=True),
+ persistable=False,
+ )
def forward(self, x):
- x = F.pad(x, ((self.pad + 1) // 2, ) * 4, self.pad_mode)
+ x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode)
weight = paddle.zeros(
- [
- x.shape[1], x.shape[1], self.kernel.shape[0],
- self.kernel.shape[1]
- ],
- dtype=x.dtype, )
+ [x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]],
+ dtype=x.dtype,
+ )
indices = paddle.arange(x.shape[1])
# TODO verify this method
weight[indices, indices] = self.kernel.cast(weight.dtype)
@@ -527,28 +479,28 @@ class ResnetBlock2D(nn.Layer):
"""
def __init__(
- self,
- *,
- in_channels,
- out_channels=None,
- conv_shortcut=False,
- dropout=0.0,
- temb_channels=512,
- groups=32,
- groups_out=None,
- pre_norm=True,
- eps=1e-6,
- non_linearity="swish",
- skip_time_act: bool=False, # skip_time_act is the same as pre_temb_non_linearity
- time_embedding_norm="default", # default, scale_shift, ada_group
- kernel=None,
- output_scale_factor=1.0,
- use_in_shortcut=None,
- up=False,
- down=False,
- conv_shortcut_bias: bool=True,
- conv_2d_out_channels: Optional[int]=None,
- pre_temb_non_linearity: bool=False, # skip_time_act is the same as pre_temb_non_linearity
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ dropout=0.0,
+ temb_channels=512,
+ groups=32,
+ groups_out=None,
+ pre_norm=True,
+ eps=1e-6,
+ non_linearity="swish",
+ skip_time_act: bool = False, # skip_time_act is the same as pre_temb_non_linearity
+ time_embedding_norm="default", # default, scale_shift, ada_group
+ kernel=None,
+ output_scale_factor=1.0,
+ use_in_shortcut=None,
+ up=False,
+ down=False,
+ conv_shortcut_bias: bool = True,
+ conv_2d_out_channels: Optional[int] = None,
+ pre_temb_non_linearity: bool = False, # skip_time_act is the same as pre_temb_non_linearity
):
super().__init__()
self.pre_temb_non_linearity = pre_temb_non_linearity
@@ -568,14 +520,11 @@ def __init__(
groups_out = groups
if self.time_embedding_norm == "ada_group":
- self.norm1 = AdaGroupNorm(
- temb_channels, in_channels, groups, eps=eps)
+ self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
else:
- self.norm1 = nn.GroupNorm(
- num_groups=groups, num_channels=in_channels, epsilon=eps)
+ self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps)
- self.conv1 = nn.Conv2D(
- in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+ self.conv1 = nn.Conv2D(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
if temb_channels is not None:
if self.time_embedding_norm == "default":
@@ -585,26 +534,18 @@ def __init__(
elif self.time_embedding_norm == "ada_group":
self.time_emb_proj = None
else:
- raise ValueError(
- f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
else:
self.time_emb_proj = None
if self.time_embedding_norm == "ada_group":
- self.norm2 = AdaGroupNorm(
- temb_channels, out_channels, groups_out, eps=eps)
+ self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
else:
- self.norm2 = nn.GroupNorm(
- num_groups=groups_out, num_channels=out_channels, epsilon=eps)
+ self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps)
self.dropout = nn.Dropout(dropout)
conv_2d_out_channels = conv_2d_out_channels or out_channels
- self.conv2 = nn.Conv2D(
- out_channels,
- conv_2d_out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
+ self.conv2 = nn.Conv2D(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1)
if non_linearity == "swish":
self.nonlinearity = lambda x: F.silu(x)
@@ -621,8 +562,7 @@ def __init__(
fir_kernel = (1, 3, 3, 1)
self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
elif kernel == "sde_vp":
- self.upsample = partial(
- F.interpolate, scale_factor=2.0, mode="nearest")
+ self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
else:
self.upsample = Upsample2D(in_channels, use_conv=False)
elif self.down:
@@ -632,11 +572,9 @@ def __init__(
elif kernel == "sde_vp":
self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2)
else:
- self.downsample = Downsample2D(
- in_channels, use_conv=False, padding=1, name="op")
+ self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
- self.use_in_shortcut = (self.in_channels != conv_2d_out_channels
- if use_in_shortcut is None else use_in_shortcut)
+ self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
self.conv_shortcut = None
if self.use_in_shortcut:
@@ -646,7 +584,8 @@ def __init__(
kernel_size=1,
stride=1,
padding=0,
- bias_attr=conv_shortcut_bias, )
+ bias_attr=conv_shortcut_bias,
+ )
def forward(self, input_tensor, temb):
hidden_states = input_tensor
@@ -693,8 +632,7 @@ def forward(self, input_tensor, temb):
input_tensor = self.conv_shortcut(input_tensor)
    # TODO: this may produce -inf; input_tensor's min value is -57644, hidden_states's min value is -10000
- output_tensor = (
- input_tensor + hidden_states) / self.output_scale_factor
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
return output_tensor
@@ -724,8 +662,7 @@ class Conv1dBlock(nn.Layer):
def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
super().__init__()
- self.conv1d = nn.Conv1D(
- inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
+ self.conv1d = nn.Conv1D(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
self.group_norm = nn.GroupNorm(n_groups, out_channels)
self.mish = nn.Mish()
@@ -748,8 +685,9 @@ def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5):
self.time_emb_act = nn.Mish()
self.time_emb = nn.Linear(embed_dim, out_channels)
- self.residual_conv = (nn.Conv1D(inp_channels, out_channels, 1) if
- inp_channels != out_channels else nn.Identity())
+ self.residual_conv = (
+ nn.Conv1D(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity()
+ )
def forward(self, x, t):
"""
@@ -799,7 +737,8 @@ def upsample_2d(hidden_states, kernel=None, factor=2, gain=1):
hidden_states,
kernel,
up=factor,
- pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), )
+ pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
+ )
return output
@@ -832,11 +771,7 @@ def downsample_2d(hidden_states, kernel=None, factor=2, gain=1):
kernel = kernel * gain
pad_value = kernel.shape[0] - factor
- output = upfirdn2d_native(
- hidden_states,
- kernel,
- down=factor,
- pad=((pad_value + 1) // 2, pad_value // 2))
+ output = upfirdn2d_native(hidden_states, kernel, down=factor, pad=((pad_value + 1) // 2, pad_value // 2))
return output
@@ -854,9 +789,11 @@ def dummy_pad(tensor, up_x=0, up_y=0):
up_x,
tensor.shape[5],
],
- dtype=tensor.dtype, ),
+ dtype=tensor.dtype,
+ ),
],
- axis=4, )
+ axis=4,
+ )
if up_y > 0:
tensor = paddle.concat(
[
@@ -870,9 +807,11 @@ def dummy_pad(tensor, up_x=0, up_y=0):
tensor.shape[4],
tensor.shape[5],
],
- dtype=tensor.dtype, ),
+ dtype=tensor.dtype,
+ ),
],
- axis=2, )
+ axis=2,
+ )
return tensor
@@ -900,23 +839,29 @@ def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)):
out = F.pad(
out,
[max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0), 0, 0],
- data_format="NDHWC", )
+ data_format="NDHWC",
+ )
out = out.squeeze(0)
- out = out[:, max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), max(
- -pad_x0, 0):out.shape[2] - max(-pad_x1, 0), :, ]
+ out = out[
+ :,
+ max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
+ max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
+ :,
+ ]
out = out.transpose([0, 3, 1, 2])
- out = out.reshape(
- [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
+ out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
w = paddle.flip(kernel, [0, 1]).reshape([1, 1, kernel_h, kernel_w])
out = F.conv2d(out, w)
- out = out.reshape([
- -1,
- minor,
- in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
- in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
- ])
+ out = out.reshape(
+ [
+ -1,
+ minor,
+ in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
+ in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
+ ]
+ )
out = out.transpose([0, 2, 3, 1])
out = out[:, ::down_y, ::down_x, :]
@@ -938,44 +883,48 @@ def __init__(self, in_dim, out_dim=None, dropout=0.0):
self.in_dim = in_dim
self.out_dim = out_dim
self.conv1 = nn.Sequential(
- nn.GroupNorm(
- num_groups=32, num_channels=in_dim),
+ nn.GroupNorm(num_groups=32, num_channels=in_dim),
nn.Silu(),
nn.Conv3D(
in_channels=in_dim,
out_channels=out_dim,
kernel_size=(3, 1, 1),
- padding=(1, 0, 0), ), )
+ padding=(1, 0, 0),
+ ),
+ )
self.conv2 = nn.Sequential(
- nn.GroupNorm(
- num_groups=32, num_channels=out_dim),
+ nn.GroupNorm(num_groups=32, num_channels=out_dim),
nn.Silu(),
nn.Dropout(p=dropout),
nn.Conv3D(
in_channels=out_dim,
out_channels=in_dim,
kernel_size=(3, 1, 1),
- padding=(1, 0, 0), ), )
+ padding=(1, 0, 0),
+ ),
+ )
self.conv3 = nn.Sequential(
- nn.GroupNorm(
- num_groups=32, num_channels=out_dim),
+ nn.GroupNorm(num_groups=32, num_channels=out_dim),
nn.Silu(),
nn.Dropout(p=dropout),
nn.Conv3D(
in_channels=out_dim,
out_channels=in_dim,
kernel_size=(3, 1, 1),
- padding=(1, 0, 0), ), )
+ padding=(1, 0, 0),
+ ),
+ )
self.conv4 = nn.Sequential(
- nn.GroupNorm(
- num_groups=32, num_channels=out_dim),
+ nn.GroupNorm(num_groups=32, num_channels=out_dim),
nn.Silu(),
nn.Dropout(p=dropout),
nn.Conv3D(
in_channels=out_dim,
out_channels=in_dim,
kernel_size=(3, 1, 1),
- padding=(1, 0, 0), ), )
+ padding=(1, 0, 0),
+ ),
+ )
zeros_(self.conv4[-1].weight)
zeros_(self.conv4[-1].bias)
@@ -983,14 +932,15 @@ def forward(self, hidden_states, num_frames=1):
hidden_states = (
hidden_states[None, :]
.reshape((-1, num_frames) + tuple(hidden_states.shape[1:]))
- .transpose(perm=[0, 2, 1, 3, 4]))
+ .transpose(perm=[0, 2, 1, 3, 4])
+ )
identity = hidden_states
hidden_states = self.conv1(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = self.conv3(hidden_states)
hidden_states = self.conv4(hidden_states)
hidden_states = identity + hidden_states
- hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape((
- hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(
- hidden_states.shape[3:]))
+ hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape(
+ (hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(hidden_states.shape[3:])
+ )
return hidden_states
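
The Upsample2D/Downsample2D helpers reformatted above keep their 2x semantics; a quick shape check with assumed toy sizes (the FIR variants follow the same pattern via upfirdn2d_native):

import paddle
from ppdiffusers.models.resnet import Downsample2D, Upsample2D

x = paddle.randn([1, 32, 16, 16])
up = Upsample2D(32, use_conv=True)                  # nearest 2x interpolation + 3x3 conv
down = Downsample2D(32, use_conv=True, padding=1)   # stride-2 3x3 conv
print(up(x).shape)                                  # expected: [1, 32, 32, 32]
print(down(x).shape)                                # expected: [1, 32, 8, 8]
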
diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
index fabe9f4eaec86..2d0a45bcc46c9 100644
--- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
@@ -26,31 +26,30 @@
class T5FilmDecoder(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- input_dims: int=128,
- targets_length: int=256,
- max_decoder_noise_time: float=2000.0,
- d_model: int=768,
- num_layers: int=12,
- num_heads: int=12,
- d_kv: int=64,
- d_ff: int=2048,
- dropout_rate: float=0.1, ):
+ self,
+ input_dims: int = 128,
+ targets_length: int = 256,
+ max_decoder_noise_time: float = 2000.0,
+ d_model: int = 768,
+ num_layers: int = 12,
+ num_heads: int = 12,
+ d_kv: int = 64,
+ d_ff: int = 2048,
+ dropout_rate: float = 0.1,
+ ):
super().__init__()
self.conditioning_emb = nn.Sequential(
- nn.Linear(
- d_model, d_model * 4, bias_attr=False),
+ nn.Linear(d_model, d_model * 4, bias_attr=False),
nn.Silu(),
- nn.Linear(
- d_model * 4, d_model * 4, bias_attr=False),
- nn.Silu(), )
+ nn.Linear(d_model * 4, d_model * 4, bias_attr=False),
+ nn.Silu(),
+ )
self.position_encoding = nn.Embedding(targets_length, d_model)
self.position_encoding.weight.stop_gradient = True
- self.continuous_inputs_projection = nn.Linear(
- input_dims, d_model, bias_attr=False)
+ self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias_attr=False)
self.dropout = nn.Dropout(p=dropout_rate)
@@ -62,7 +61,8 @@ def __init__(
d_kv=d_kv,
num_heads=num_heads,
d_ff=d_ff,
- dropout_rate=dropout_rate, )
+ dropout_rate=dropout_rate,
+ )
self.decoders.append(lyr)
self.decoder_norm = T5LayerNorm(d_model)
@@ -71,13 +71,10 @@ def __init__(
self.spec_out = nn.Linear(d_model, input_dims, bias_attr=False)
def encoder_decoder_mask(self, query_input, key_input):
- mask = paddle.multiply(
- query_input.unsqueeze(-1),
- key_input.unsqueeze(-2).cast(query_input.dtype))
+ mask = paddle.multiply(query_input.unsqueeze(-1), key_input.unsqueeze(-2).cast(query_input.dtype))
return mask.unsqueeze(-3)
- def forward(self, encodings_and_masks, decoder_input_tokens,
- decoder_noise_time):
+ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time):
batch, _, _ = decoder_input_tokens.shape
assert decoder_noise_time.shape[0] == batch
@@ -85,7 +82,8 @@ def forward(self, encodings_and_masks, decoder_input_tokens,
time_steps = get_timestep_embedding(
decoder_noise_time * self.config.max_decoder_noise_time,
embedding_dim=self.config.d_model,
- max_period=self.config.max_decoder_noise_time, ).cast(self.dtype)
+ max_period=self.config.max_decoder_noise_time,
+ ).cast(self.dtype)
conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1)
@@ -96,37 +94,34 @@ def forward(self, encodings_and_masks, decoder_input_tokens,
# If we want to use relative positions for audio context, we can just offset
# this sequence by the length of encodings_and_masks.
decoder_positions = paddle.broadcast_to(
- paddle.arange(seq_length, ),
- shape=(batch, seq_length), )
+ paddle.arange(
+ seq_length,
+ ),
+ shape=(batch, seq_length),
+ )
position_encodings = self.position_encoding(decoder_positions)
- inputs = self.continuous_inputs_projection(
- decoder_input_tokens.cast(position_encodings.dtype))
+ inputs = self.continuous_inputs_projection(decoder_input_tokens.cast(position_encodings.dtype))
inputs += position_encodings
y = self.dropout(inputs)
# decoder: No padding present.
- decoder_mask = paddle.ones(
- decoder_input_tokens.shape[:2], dtype=inputs.dtype)
+ decoder_mask = paddle.ones(decoder_input_tokens.shape[:2], dtype=inputs.dtype)
# Translate encoding masks to encoder-decoder masks.
- encodings_and_encdec_masks = [
- (x, self.encoder_decoder_mask(decoder_mask, y))
- for x, y in encodings_and_masks
- ]
+ encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks]
# cross attend style: concat encodings
- encoded = paddle.concat(
- [x[0] for x in encodings_and_encdec_masks], axis=1)
- encoder_decoder_mask = paddle.concat(
- [x[1] for x in encodings_and_encdec_masks], axis=-1)
+ encoded = paddle.concat([x[0] for x in encodings_and_encdec_masks], axis=1)
+ encoder_decoder_mask = paddle.concat([x[1] for x in encodings_and_encdec_masks], axis=-1)
for lyr in self.decoders:
y = lyr(
y,
conditioning_emb=conditioning_emb,
encoder_hidden_states=encoded,
- encoder_attention_mask=encoder_decoder_mask, )[0]
+ encoder_attention_mask=encoder_decoder_mask,
+ )[0]
y = self.decoder_norm(y)
y = self.post_dropout(y)
@@ -136,13 +131,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens,
class DecoderLayer(nn.Layer):
- def __init__(self,
- d_model,
- d_kv,
- num_heads,
- d_ff,
- dropout_rate,
- layer_norm_epsilon=1e-6):
+ def __init__(self, d_model, d_kv, num_heads, d_ff, dropout_rate, layer_norm_epsilon=1e-6):
super().__init__()
self.layer = nn.LayerList()
@@ -152,7 +141,9 @@ def __init__(self,
d_model=d_model,
d_kv=d_kv,
num_heads=num_heads,
- dropout_rate=dropout_rate, ))
+ dropout_rate=dropout_rate,
+ )
+ )
# cross attention: layer 1
self.layer.append(
@@ -161,7 +152,9 @@ def __init__(self,
d_kv=d_kv,
num_heads=num_heads,
dropout_rate=dropout_rate,
- layer_norm_epsilon=layer_norm_epsilon, ))
+ layer_norm_epsilon=layer_norm_epsilon,
+ )
+ )
# Film Cond MLP + dropout: last layer
self.layer.append(
@@ -169,62 +162,67 @@ def __init__(self,
d_model=d_model,
d_ff=d_ff,
dropout_rate=dropout_rate,
- layer_norm_epsilon=layer_norm_epsilon, ))
+ layer_norm_epsilon=layer_norm_epsilon,
+ )
+ )
def forward(
- self,
- hidden_states,
- conditioning_emb=None,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- encoder_decoder_position_bias=None, ):
+ self,
+ hidden_states,
+ conditioning_emb=None,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ encoder_decoder_position_bias=None,
+ ):
hidden_states = self.layer[0](
hidden_states,
conditioning_emb=conditioning_emb,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
if encoder_hidden_states is not None:
- encoder_extended_attention_mask = paddle.where(
- encoder_attention_mask > 0, 0.0,
- -1e10).cast(encoder_hidden_states.dtype)
+ encoder_extended_attention_mask = paddle.where(encoder_attention_mask > 0, 0.0, -1e10).cast(
+ encoder_hidden_states.dtype
+ )
hidden_states = self.layer[1](
hidden_states,
key_value_states=encoder_hidden_states,
- attention_mask=encoder_extended_attention_mask, )
+ attention_mask=encoder_extended_attention_mask,
+ )
# Apply Film Conditional Feed Forward layer
hidden_states = self.layer[-1](hidden_states, conditioning_emb)
- return (hidden_states, )
+ return (hidden_states,)
class T5LayerSelfAttentionCond(nn.Layer):
def __init__(self, d_model, d_kv, num_heads, dropout_rate):
super().__init__()
self.layer_norm = T5LayerNorm(d_model)
- self.FiLMLayer = T5FiLMLayer(
- in_features=d_model * 4, out_features=d_model)
+ self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
self.attention = Attention(
query_dim=d_model,
heads=num_heads,
dim_head=d_kv,
out_bias=False,
- scale_qk=False, )
+ scale_qk=False,
+ )
self.dropout = nn.Dropout(dropout_rate)
def forward(
- self,
- hidden_states,
- conditioning_emb=None,
- attention_mask=None, ):
+ self,
+ hidden_states,
+ conditioning_emb=None,
+ attention_mask=None,
+ ):
# pre_self_attention_layer_norm
normed_hidden_states = self.layer_norm(hidden_states)
if conditioning_emb is not None:
- normed_hidden_states = self.FiLMLayer(normed_hidden_states,
- conditioning_emb)
+ normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb)
# Self-attention block
attention_output = self.attention(normed_hidden_states)
@@ -235,28 +233,30 @@ def forward(
class T5LayerCrossAttention(nn.Layer):
- def __init__(self, d_model, d_kv, num_heads, dropout_rate,
- layer_norm_epsilon):
+ def __init__(self, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon):
super().__init__()
self.attention = Attention(
query_dim=d_model,
heads=num_heads,
dim_head=d_kv,
out_bias=False,
- scale_qk=False, )
+ scale_qk=False,
+ )
self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
self.dropout = nn.Dropout(dropout_rate)
def forward(
- self,
- hidden_states,
- key_value_states=None,
- attention_mask=None, ):
+ self,
+ hidden_states,
+ key_value_states=None,
+ attention_mask=None,
+ ):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.attention(
normed_hidden_states,
encoder_hidden_states=key_value_states,
- attention_mask=attention_mask.squeeze(1), )
+ attention_mask=attention_mask.squeeze(1),
+ )
layer_output = hidden_states + self.dropout(attention_output)
return layer_output
@@ -264,8 +264,7 @@ def forward(
class T5LayerFFCond(nn.Layer):
def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon):
super().__init__()
- self.DenseReluDense = T5DenseGatedActDense(
- d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate)
+ self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate)
self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
self.dropout = nn.Dropout(dropout_rate)
@@ -306,9 +305,7 @@ class T5LayerNorm(nn.Layer):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
- self.weight = self.create_parameter(
- shape=[hidden_size],
- default_initializer=nn.initializer.Constant(1.0))
+ self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0))
self.variance_epsilon = eps
def forward(self, hidden_states):
@@ -317,10 +314,8 @@ def forward(self, hidden_states):
# w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
# half-precision inputs is done in fp32
- variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean(
- axis=-1, keepdim=True)
- hidden_states = hidden_states * paddle.rsqrt(variance +
- self.variance_epsilon)
+ variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean(axis=-1, keepdim=True)
+ hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon)
# convert into half-precision if necessary
if self.weight.dtype == paddle.float16:
@@ -335,9 +330,9 @@ class NewGELUActivation(nn.Layer):
"""
def forward(self, input: paddle.Tensor) -> paddle.Tensor:
- return (0.5 * input * (1.0 + paddle.tanh(
- math.sqrt(2.0 / math.pi) *
- (input + 0.044715 * paddle.pow(input, 3.0)))))
+ return (
+ 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0))))
+ )
class T5FiLMLayer(nn.Layer):
@@ -347,8 +342,7 @@ class T5FiLMLayer(nn.Layer):
def __init__(self, in_features, out_features):
super().__init__()
- self.scale_bias = nn.Linear(
- in_features, out_features * 2, bias_attr=False)
+ self.scale_bias = nn.Linear(in_features, out_features * 2, bias_attr=False)
def forward(self, x, conditioning_emb):
emb = self.scale_bias(conditioning_emb)
diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py
index e9f47cbee3f7b..2207b8b46974e 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_2d.py
@@ -79,26 +79,27 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_attention_heads: int=16,
- attention_head_dim: int=88,
- in_channels: Optional[int]=None,
- out_channels: Optional[int]=None,
- num_layers: int=1,
- dropout: float=0.0,
- norm_num_groups: int=32,
- cross_attention_dim: Optional[int]=None,
- attention_bias: bool=False,
- sample_size: Optional[int]=None,
- num_vector_embeds: Optional[int]=None,
- patch_size: Optional[int]=None,
- activation_fn: str="geglu",
- num_embeds_ada_norm: Optional[int]=None,
- use_linear_projection: bool=False,
- only_cross_attention: bool=False,
- upcast_attention: bool=False,
- norm_type: str="layer_norm",
- norm_elementwise_affine: bool=True, ):
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ patch_size: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_type: str = "layer_norm",
+ norm_elementwise_affine: bool = True,
+ ):
super().__init__()
self.use_linear_projection = use_linear_projection
self.num_attention_heads = num_attention_heads
@@ -107,8 +108,7 @@ def __init__(
# 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
# Define whether input is continuous or discrete depending on configuration
- self.is_input_continuous = (in_channels is not None) and (
- patch_size is None)
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
self.is_input_vectorized = num_vector_embeds is not None
self.is_input_patches = in_channels is not None and patch_size is not None
@@ -124,7 +124,8 @@ def __init__(
"norm_type!=num_embeds_ada_norm",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
norm_type = "ada_norm"
if self.is_input_continuous and self.is_input_vectorized:
@@ -137,8 +138,7 @@ def __init__(
f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
" sure that either `num_vector_embeds` or `num_patches` is None."
)
- elif (not self.is_input_continuous and not self.is_input_vectorized and
- not self.is_input_patches):
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
raise ValueError(
f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
@@ -148,22 +148,14 @@ def __init__(
if self.is_input_continuous:
self.in_channels = in_channels
- self.norm = nn.GroupNorm(
- num_groups=norm_num_groups,
- num_channels=in_channels,
- epsilon=1e-6)
+ self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-6)
if use_linear_projection:
self.proj_in = nn.Linear(in_channels, inner_dim)
else:
- self.proj_in = nn.Conv2D(
- in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+ self.proj_in = nn.Conv2D(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
elif self.is_input_vectorized:
- assert (
- sample_size is not None
- ), "Transformer2DModel over discrete input must provide sample_size"
- assert (
- num_vector_embeds is not None
- ), "Transformer2DModel over discrete input must provide num_embed"
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
self.height = sample_size
self.width = sample_size
@@ -174,11 +166,10 @@ def __init__(
num_embed=num_vector_embeds,
embed_dim=inner_dim,
height=self.height,
- width=self.width, )
+ width=self.width,
+ )
elif self.is_input_patches:
- assert (
- sample_size is not None
- ), "Transformer2DModel over patched input must provide sample_size"
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
self.height = sample_size
self.width = sample_size
@@ -189,25 +180,29 @@ def __init__(
width=sample_size,
patch_size=patch_size,
in_channels=in_channels,
- embed_dim=inner_dim, )
+ embed_dim=inner_dim,
+ )
# 3. Define transformers blocks
- self.transformer_blocks = nn.LayerList([
- BasicTransformerBlock(
- inner_dim,
- num_attention_heads,
- attention_head_dim,
- dropout=dropout,
- cross_attention_dim=cross_attention_dim,
- activation_fn=activation_fn,
- num_embeds_ada_norm=num_embeds_ada_norm,
- attention_bias=attention_bias,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- norm_type=norm_type,
- norm_elementwise_affine=norm_elementwise_affine, )
- for d in range(num_layers)
- ])
+ self.transformer_blocks = nn.LayerList(
+ [
+ BasicTransformerBlock(
+ inner_dim,
+ num_attention_heads,
+ attention_head_dim,
+ dropout=dropout,
+ cross_attention_dim=cross_attention_dim,
+ activation_fn=activation_fn,
+ num_embeds_ada_norm=num_embeds_ada_norm,
+ attention_bias=attention_bias,
+ only_cross_attention=only_cross_attention,
+ upcast_attention=upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=norm_elementwise_affine,
+ )
+ for d in range(num_layers)
+ ]
+ )
# 4. Define output layers
self.out_channels = in_channels if out_channels is None else out_channels
@@ -216,8 +211,7 @@ def __init__(
if use_linear_projection:
self.proj_out = nn.Linear(inner_dim, in_channels)
else:
- self.proj_out = nn.Conv2D(
- inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+ self.proj_out = nn.Conv2D(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
elif self.is_input_vectorized:
self.norm_out = nn.LayerNorm(inner_dim)
self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
@@ -226,17 +220,17 @@ def __init__(
norm_kwargs = {"weight_attr": False, "bias_attr": False}
self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs)
self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
- self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size *
- self.out_channels)
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
def forward(
- self,
- hidden_states,
- encoder_hidden_states=None,
- timestep=None,
- class_labels=None,
- cross_attention_kwargs=None,
- return_dict: bool=True, ):
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ timestep=None,
+ class_labels=None,
+ cross_attention_kwargs=None,
+ return_dict: bool = True,
+ ):
"""
Args:
hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`.
@@ -270,8 +264,7 @@ def forward(
if self.use_linear_projection:
hidden_states = self.proj_in(hidden_states)
elif self.is_input_vectorized:
- hidden_states = self.latent_image_embedding(
- hidden_states.cast("int64"))
+ hidden_states = self.latent_image_embedding(hidden_states.cast("int64"))
elif self.is_input_patches:
hidden_states = self.pos_embed(hidden_states)
@@ -282,14 +275,14 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
timestep=timestep,
cross_attention_kwargs=cross_attention_kwargs,
- class_labels=class_labels, )
+ class_labels=class_labels,
+ )
# 3. Output
if self.is_input_continuous:
if self.use_linear_projection:
hidden_states = self.proj_out(hidden_states)
- hidden_states = hidden_states.reshape(
- [-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2])
+ hidden_states = hidden_states.reshape([-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2])
if not self.use_linear_projection:
hidden_states = self.proj_out(hidden_states)
output = hidden_states + residual
@@ -300,31 +293,32 @@ def forward(
logits = logits.transpose([0, 2, 1])
# log(p(x_0))
- output = F.log_softmax(
- logits.cast("float64"), axis=1).cast("float32")
+ output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32")
elif self.is_input_patches:
# TODO: cleanup!
conditioning = self.transformer_blocks[0].norm1.emb(
- timestep, class_labels, hidden_dtype=hidden_states.dtype)
- shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(
- 2, axis=1)
- hidden_states = (self.norm_out(hidden_states) *
- (1 + scale[:, None]) + shift[:, None])
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1)
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
hidden_states = self.proj_out_2(hidden_states)
# unpatchify
- height = width = int(hidden_states.shape[1]**0.5)
+ height = width = int(hidden_states.shape[1] ** 0.5)
hidden_states = hidden_states.reshape(
- (-1, height, width, self.patch_size, self.patch_size,
- self.out_channels))
+ (-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+ )
hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states)
- output = hidden_states.reshape((
- -1,
- self.out_channels,
- height * self.patch_size,
- width * self.patch_size, ))
+ output = hidden_states.reshape(
+ (
+ -1,
+ self.out_channels,
+ height * self.patch_size,
+ width * self.patch_size,
+ )
+ )
if not return_dict:
- return (output, )
+ return (output,)
return Transformer2DModelOutput(sample=output)
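
The patched-input branch ends with the "unpatchify" einsum restructured above; a self-contained sketch with made-up sizes shows the token-to-image layout:

import paddle

batch, out_ch, patch = 2, 4, 2
height = width = 8                         # number of patches per side
tokens = paddle.randn([batch, height * width, patch * patch * out_ch])
x = tokens.reshape((-1, height, width, patch, patch, out_ch))
x = paddle.einsum("nhwpqc->nchpwq", x)     # interleave patch pixels back into the grid
image = x.reshape((-1, out_ch, height * patch, width * patch))
print(image.shape)                         # [2, 4, 16, 16]
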
diff --git a/ppdiffusers/ppdiffusers/models/transformer_temporal.py b/ppdiffusers/ppdiffusers/models/transformer_temporal.py
index 0052335c043f4..bfd1985eb99a7 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_temporal.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_temporal.py
@@ -60,52 +60,56 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_attention_heads: int=16,
- attention_head_dim: int=88,
- in_channels: Optional[int]=None,
- out_channels: Optional[int]=None,
- num_layers: int=1,
- dropout: float=0.0,
- norm_num_groups: int=32,
- cross_attention_dim: Optional[int]=None,
- attention_bias: bool=False,
- sample_size: Optional[int]=None,
- activation_fn: str="geglu",
- norm_elementwise_affine: bool=True,
- double_self_attention: bool=True, ):
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ activation_fn: str = "geglu",
+ norm_elementwise_affine: bool = True,
+ double_self_attention: bool = True,
+ ):
super().__init__()
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
inner_dim = num_attention_heads * attention_head_dim
self.in_channels = in_channels
- self.norm = nn.GroupNorm(
- num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06)
+ self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06)
self.proj_in = nn.Linear(in_channels, inner_dim)
- self.transformer_blocks = nn.LayerList([
- BasicTransformerBlock(
- inner_dim,
- num_attention_heads,
- attention_head_dim,
- dropout=dropout,
- cross_attention_dim=cross_attention_dim,
- activation_fn=activation_fn,
- attention_bias=attention_bias,
- double_self_attention=double_self_attention,
- norm_elementwise_affine=norm_elementwise_affine, )
- for d in range(num_layers)
- ])
+ self.transformer_blocks = nn.LayerList(
+ [
+ BasicTransformerBlock(
+ inner_dim,
+ num_attention_heads,
+ attention_head_dim,
+ dropout=dropout,
+ cross_attention_dim=cross_attention_dim,
+ activation_fn=activation_fn,
+ attention_bias=attention_bias,
+ double_self_attention=double_self_attention,
+ norm_elementwise_affine=norm_elementwise_affine,
+ )
+ for d in range(num_layers)
+ ]
+ )
self.proj_out = nn.Linear(inner_dim, in_channels)
def forward(
- self,
- hidden_states,
- encoder_hidden_states=None,
- timestep=None,
- class_labels=None,
- num_frames=1,
- cross_attention_kwargs=None,
- return_dict: bool=True, ):
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ timestep=None,
+ class_labels=None,
+ num_frames=1,
+ cross_attention_kwargs=None,
+ return_dict: bool = True,
+ ):
"""
Args:
hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`.
@@ -131,12 +135,12 @@ def forward(
batch_frames, channel, height, width = hidden_states.shape
batch_size = batch_frames // num_frames
residual = hidden_states
- hidden_states = hidden_states[None, :].reshape(
- (batch_size, num_frames, channel, height, width))
+ hidden_states = hidden_states[None, :].reshape((batch_size, num_frames, channel, height, width))
hidden_states = hidden_states.transpose([0, 2, 1, 3, 4])
hidden_states = self.norm(hidden_states)
hidden_states = hidden_states.transpose([0, 3, 4, 2, 1]).reshape(
- (batch_size * height * width, num_frames, channel))
+ (batch_size * height * width, num_frames, channel)
+ )
hidden_states = self.proj_in(hidden_states)
# 2. Blocks
for block in self.transformer_blocks:
@@ -145,15 +149,17 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
timestep=timestep,
cross_attention_kwargs=cross_attention_kwargs,
- class_labels=class_labels, )
+ class_labels=class_labels,
+ )
# 3. Output
hidden_states = self.proj_out(hidden_states)
- hidden_states = (hidden_states[None, None, :].reshape(
- (batch_size, height, width, channel, num_frames))
- .transpose([0, 3, 4, 1, 2]))
- hidden_states = hidden_states.reshape(
- (batch_frames, channel, height, width))
+ hidden_states = (
+ hidden_states[None, None, :]
+ .reshape((batch_size, height, width, channel, num_frames))
+ .transpose([0, 3, 4, 1, 2])
+ )
+ hidden_states = hidden_states.reshape((batch_frames, channel, height, width))
output = hidden_states + residual
if not return_dict:
- return (output, )
+ return (output,)
return TransformerTemporalModelOutput(sample=output)
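
TransformerTemporalModel.forward mostly shuffles axes so attention runs over the frame dimension; the reshaping chain reformatted above collapses to this standalone sketch (normalization omitted, toy sizes):

import paddle

num_frames, batch, C, H, W = 8, 2, 16, 4, 4
x = paddle.randn([batch * num_frames, C, H, W])   # frames arrive flattened into the batch
b = x.shape[0] // num_frames
seq = (
    x[None, :]
    .reshape((b, num_frames, C, H, W))
    .transpose([0, 2, 1, 3, 4])            # (b, C, frames, H, W), the layout fed to GroupNorm
    .transpose([0, 3, 4, 2, 1])            # (b, H, W, frames, C)
    .reshape((b * H * W, num_frames, C))   # one token sequence per spatial position
)
print(seq.shape)                           # [32, 8, 16]
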
diff --git a/ppdiffusers/ppdiffusers/models/unet_1d.py b/ppdiffusers/ppdiffusers/models/unet_1d.py
index 70ecea668c88f..df62f8477b0bb 100644
--- a/ppdiffusers/ppdiffusers/models/unet_1d.py
+++ b/ppdiffusers/ppdiffusers/models/unet_1d.py
@@ -23,8 +23,7 @@
from ..utils import BaseOutput
from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
from .modeling_utils import ModelMixin
-from .unet_1d_blocks import (get_down_block, get_mid_block, get_out_block,
- get_up_block)
+from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block
@dataclass
@@ -73,29 +72,30 @@ class UNet1DModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- sample_size: int=65536,
- sample_rate: Optional[int]=None,
- in_channels: int=2,
- out_channels: int=2,
- extra_in_channels: int=0,
- time_embedding_type: str="fourier",
- flip_sin_to_cos: bool=True,
- use_timestep_embedding: bool=False,
- freq_shift: float=0.0,
- down_block_types: Tuple[str]=(
- "DownBlock1DNoSkip",
- "DownBlock1D",
- "AttnDownBlock1D", ),
- up_block_types: Tuple[str]=("AttnUpBlock1D", "UpBlock1D",
- "UpBlock1DNoSkip"),
- mid_block_type: Tuple[str]="UNetMidBlock1D",
- out_block_type: str=None,
- block_out_channels: Tuple[int]=(32, 32, 64),
- act_fn: str=None,
- norm_num_groups: int=8,
- layers_per_block: int=1,
- downsample_each_block: bool=False, ):
+ self,
+ sample_size: int = 65536,
+ sample_rate: Optional[int] = None,
+ in_channels: int = 2,
+ out_channels: int = 2,
+ extra_in_channels: int = 0,
+ time_embedding_type: str = "fourier",
+ flip_sin_to_cos: bool = True,
+ use_timestep_embedding: bool = False,
+ freq_shift: float = 0.0,
+ down_block_types: Tuple[str] = (
+ "DownBlock1DNoSkip",
+ "DownBlock1D",
+ "AttnDownBlock1D",
+ ),
+ up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+ mid_block_type: Tuple[str] = "UNetMidBlock1D",
+ out_block_type: str = None,
+ block_out_channels: Tuple[int] = (32, 32, 64),
+ act_fn: str = None,
+ norm_num_groups: int = 8,
+ layers_per_block: int = 1,
+ downsample_each_block: bool = False,
+ ):
super().__init__()
self.sample_size = sample_size
@@ -105,13 +105,15 @@ def __init__(
embedding_size=8,
set_W_to_weight=False,
log=False,
- flip_sin_to_cos=flip_sin_to_cos, )
+ flip_sin_to_cos=flip_sin_to_cos,
+ )
timestep_input_dim = 2 * block_out_channels[0]
elif time_embedding_type == "positional":
self.time_proj = Timesteps(
block_out_channels[0],
flip_sin_to_cos=flip_sin_to_cos,
- downscale_freq_shift=freq_shift, )
+ downscale_freq_shift=freq_shift,
+ )
timestep_input_dim = block_out_channels[0]
if use_timestep_embedding:
@@ -120,7 +122,8 @@ def __init__(
in_channels=timestep_input_dim,
time_embed_dim=time_embed_dim,
act_fn=act_fn,
- out_dim=block_out_channels[0], )
+ out_dim=block_out_channels[0],
+ )
self.down_blocks = nn.LayerList([])
self.mid_block = None
@@ -144,7 +147,8 @@ def __init__(
in_channels=input_channel,
out_channels=output_channel,
temb_channels=block_out_channels[0],
- add_downsample=not is_final_block or downsample_each_block, )
+ add_downsample=not is_final_block or downsample_each_block,
+ )
self.down_blocks.append(down_block)
# mid
@@ -155,7 +159,8 @@ def __init__(
out_channels=block_out_channels[-1],
embed_dim=block_out_channels[0],
num_layers=layers_per_block,
- add_downsample=downsample_each_block, )
+ add_downsample=downsample_each_block,
+ )
# up
reversed_block_out_channels = list(reversed(block_out_channels))
@@ -167,9 +172,9 @@ def __init__(
for i, up_block_type in enumerate(up_block_types):
prev_output_channel = output_channel
- output_channel = (reversed_block_out_channels[i + 1]
- if i < len(up_block_types) - 1 else
- final_upsample_channels)
+ output_channel = (
+ reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels
+ )
is_final_block = i == len(block_out_channels) - 1
@@ -179,26 +184,28 @@ def __init__(
in_channels=prev_output_channel,
out_channels=output_channel,
temb_channels=block_out_channels[0],
- add_upsample=not is_final_block, )
+ add_upsample=not is_final_block,
+ )
self.up_blocks.append(up_block)
prev_output_channel = output_channel
# out
- num_groups_out = (norm_num_groups if norm_num_groups is not None else
- min(block_out_channels[0] // 4, 32))
+ num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
self.out_block = get_out_block(
out_block_type=out_block_type,
num_groups_out=num_groups_out,
embed_dim=block_out_channels[0],
out_channels=out_channels,
act_fn=act_fn,
- fc_dim=block_out_channels[-1] // 4, )
+ fc_dim=block_out_channels[-1] // 4,
+ )
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- return_dict: bool=True, ) -> Union[UNet1DOutput, Tuple]:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ return_dict: bool = True,
+ ) -> Union[UNet1DOutput, Tuple]:
r"""
Args:
sample (`paddle.Tensor`): `(batch_size, num_channels, sample_size)` noisy inputs tensor
@@ -223,16 +230,13 @@ def forward(
timestep_embed = self.time_mlp(timestep_embed)
else:
timestep_embed = timestep_embed[..., None]
- timestep_embed = timestep_embed.tile(
- [1, 1, sample.shape[2]]).cast(sample.dtype)
- timestep_embed = timestep_embed.broadcast_to(
- (sample.shape[:1] + timestep_embed.shape[1:]))
+ timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype)
+ timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:]))
# 2. down
down_block_res_samples = ()
for downsample_block in self.down_blocks:
- sample, res_samples = downsample_block(
- hidden_states=sample, temb=timestep_embed)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed)
down_block_res_samples += res_samples
# 3. mid
@@ -243,16 +247,13 @@ def forward(
for i, upsample_block in enumerate(self.up_blocks):
res_samples = down_block_res_samples[-1:]
down_block_res_samples = down_block_res_samples[:-1]
- sample = upsample_block(
- sample,
- res_hidden_states_tuple=res_samples,
- temb=timestep_embed)
+ sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed)
# 5. post-process
if self.out_block:
sample = self.out_block(sample, timestep_embed)
if not return_dict:
- return (sample, )
+ return (sample,)
return UNet1DOutput(sample=sample)
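
The non-embedding timestep path in UNet1DModel.forward above (taken when use_timestep_embedding is False) simply tiles the time projection along the sample length and broadcasts it over the batch. A self-contained sketch of those three reformatted lines, with toy shapes as assumptions:

import paddle

# Toy shapes (assumptions): a batch of 2 samples with 16 channels and length 64,
# and an 8-dim time projection standing in for self.time_proj(timestep).
sample = paddle.randn([2, 16, 64])
timestep_embed = paddle.randn([1, 8])

# Mirror the reformatted lines: (1, 8) -> (1, 8, 1) -> (1, 8, length) -> (batch, 8, length)
timestep_embed = timestep_embed[..., None]
timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype)
timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:]))

print(timestep_embed.shape)  # [2, 8, 64]
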
diff --git a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
index 7b3cf833bfba8..41a1810408693 100644
--- a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
@@ -20,24 +20,24 @@
from paddle import nn
from ..utils import is_ppxformers_available
-from .resnet import (Downsample1D, ResidualTemporalBlock1D, Upsample1D,
- rearrange_dims)
+from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims
class DownResnetBlock1D(nn.Layer):
def __init__(
- self,
- in_channels,
- out_channels=None,
- num_layers=1,
- conv_shortcut=False,
- temb_channels=32,
- groups=32,
- groups_out=None,
- non_linearity=None,
- time_embedding_norm="default",
- output_scale_factor=1.0,
- add_downsample=True, ):
+ self,
+ in_channels,
+ out_channels=None,
+ num_layers=1,
+ conv_shortcut=False,
+ temb_channels=32,
+ groups=32,
+ groups_out=None,
+ non_linearity=None,
+ time_embedding_norm="default",
+ output_scale_factor=1.0,
+ add_downsample=True,
+ ):
super().__init__()
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
@@ -51,15 +51,10 @@ def __init__(
groups_out = groups
# there will always be at least one resnet
- resnets = [
- ResidualTemporalBlock1D(
- in_channels, out_channels, embed_dim=temb_channels)
- ]
+ resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)]
for _ in range(num_layers):
- resnets.append(
- ResidualTemporalBlock1D(
- out_channels, out_channels, embed_dim=temb_channels))
+ resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
self.resnets = nn.LayerList(resnets)
@@ -74,8 +69,7 @@ def __init__(
self.downsample = None
if add_downsample:
- self.downsample = Downsample1D(
- out_channels, use_conv=True, padding=1)
+ self.downsample = Downsample1D(out_channels, use_conv=True, padding=1)
def forward(self, hidden_states, temb=None):
output_states = ()
@@ -84,7 +78,7 @@ def forward(self, hidden_states, temb=None):
for resnet in self.resnets[1:]:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.nonlinearity is not None:
hidden_states = self.nonlinearity(hidden_states)
@@ -97,17 +91,18 @@ def forward(self, hidden_states, temb=None):
class UpResnetBlock1D(nn.Layer):
def __init__(
- self,
- in_channels,
- out_channels=None,
- num_layers=1,
- temb_channels=32,
- groups=32,
- groups_out=None,
- non_linearity=None,
- time_embedding_norm="default",
- output_scale_factor=1.0,
- add_upsample=True, ):
+ self,
+ in_channels,
+ out_channels=None,
+ num_layers=1,
+ temb_channels=32,
+ groups=32,
+ groups_out=None,
+ non_linearity=None,
+ time_embedding_norm="default",
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
super().__init__()
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
@@ -120,15 +115,10 @@ def __init__(
groups_out = groups
# there will always be at least one resnet
- resnets = [
- ResidualTemporalBlock1D(
- 2 * in_channels, out_channels, embed_dim=temb_channels)
- ]
+ resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)]
for _ in range(num_layers):
- resnets.append(
- ResidualTemporalBlock1D(
- out_channels, out_channels, embed_dim=temb_channels))
+ resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
self.resnets = nn.LayerList(resnets)
@@ -148,8 +138,7 @@ def __init__(
def forward(self, hidden_states, res_hidden_states_tuple=None, temb=None):
if res_hidden_states_tuple is not None:
res_hidden_states = res_hidden_states_tuple[-1]
- hidden_states = paddle.concat(
- (hidden_states, res_hidden_states), axis=1)
+ hidden_states = paddle.concat((hidden_states, res_hidden_states), axis=1)
hidden_states = self.resnets[0](hidden_states, temb)
for resnet in self.resnets[1:]:
@@ -171,11 +160,9 @@ def __init__(self, in_channels, out_channels, embed_dim):
self.out_channels = out_channels
self.embed_dim = embed_dim
- self.res1 = ResidualTemporalBlock1D(
- in_channels, in_channels // 2, embed_dim=embed_dim)
+ self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim)
self.down1 = Downsample1D(out_channels // 2, use_conv=True)
- self.res2 = ResidualTemporalBlock1D(
- in_channels // 2, in_channels // 4, embed_dim=embed_dim)
+ self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim)
self.down2 = Downsample1D(out_channels // 4, use_conv=True)
def forward(self, x, temb=None):
@@ -188,29 +175,25 @@ def forward(self, x, temb=None):
class MidResTemporalBlock1D(nn.Layer):
def __init__(
- self,
- in_channels,
- out_channels,
- embed_dim,
- num_layers: int=1,
- add_downsample: bool=False,
- add_upsample: bool=False,
- non_linearity=None, ):
+ self,
+ in_channels,
+ out_channels,
+ embed_dim,
+ num_layers: int = 1,
+ add_downsample: bool = False,
+ add_upsample: bool = False,
+ non_linearity=None,
+ ):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.add_downsample = add_downsample
# there will always be at least one resnet
- resnets = [
- ResidualTemporalBlock1D(
- in_channels, out_channels, embed_dim=embed_dim)
- ]
+ resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)]
for _ in range(num_layers):
- resnets.append(
- ResidualTemporalBlock1D(
- out_channels, out_channels, embed_dim=embed_dim))
+ resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim))
self.resnets = nn.LayerList(resnets)
@@ -271,11 +254,13 @@ def forward(self, hidden_states, temb=None):
class OutValueFunctionBlock(nn.Layer):
def __init__(self, fc_dim, embed_dim):
super().__init__()
- self.final_block = nn.LayerList([
- nn.Linear(fc_dim + embed_dim, fc_dim // 2),
- nn.Mish(),
- nn.Linear(fc_dim // 2, 1),
- ])
+ self.final_block = nn.LayerList(
+ [
+ nn.Linear(fc_dim + embed_dim, fc_dim // 2),
+ nn.Mish(),
+ nn.Linear(fc_dim // 2, 1),
+ ]
+ )
def forward(self, hidden_states, temb):
hidden_states = hidden_states.reshape([hidden_states.shape[0], -1])
@@ -324,15 +309,11 @@ def __init__(self, kernel="linear", pad_mode="reflect"):
self.register_buffer("kernel", kernel_1d)
def forward(self, hidden_states):
- hidden_states = F.pad(hidden_states, (self.pad, ) * 2,
- self.pad_mode,
- data_format="NCL")
+ hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode, data_format="NCL")
weight = paddle.zeros(
- [
- hidden_states.shape[1], hidden_states.shape[1],
- self.kernel.shape[0]
- ],
- dtype=hidden_states.dtype, )
+ [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]],
+ dtype=hidden_states.dtype,
+ )
indices = paddle.arange(hidden_states.shape[1])
weight[indices, indices] = self.kernel.cast(weight.dtype)
return F.conv1d(hidden_states, weight, stride=2)
@@ -347,19 +328,14 @@ def __init__(self, kernel="linear", pad_mode="reflect"):
self.register_buffer("kernel", kernel_1d)
def forward(self, hidden_states, temb=None):
- hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2, ) * 2,
- self.pad_mode,
- data_format="NCL")
+ hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode, data_format="NCL")
weight = paddle.zeros(
- [
- hidden_states.shape[1], hidden_states.shape[1],
- self.kernel.shape[0]
- ],
- dtype=hidden_states.dtype, )
+ [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]],
+ dtype=hidden_states.dtype,
+ )
indices = paddle.arange(hidden_states.shape[1])
weight[indices, indices] = self.kernel.cast(weight.dtype)
- return F.conv1d_transpose(
- hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
+ return F.conv1d_transpose(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
class SelfAttention1d(nn.Layer):
@@ -395,9 +371,10 @@ def reshape_batch_dim_to_heads(self, tensor, transpose=True):
return tensor
def set_use_memory_efficient_attention_xformers(
- self,
- use_memory_efficient_attention_xformers: bool,
- attention_op: Optional[str]=None, ):
+ self,
+ use_memory_efficient_attention_xformers: bool,
+ attention_op: Optional[str] = None,
+ ):
# remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045
# if self.head_size > 128 and attention_op == "flash":
# attention_op = "cutlass"
@@ -409,18 +386,15 @@ def set_use_memory_efficient_attention_xformers(
else:
try:
_ = F.scaled_dot_product_attention_(
- paddle.randn(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.randn(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.randn(
- (1, 1, 2, 40), dtype=paddle.float16),
- attention_op=attention_op, )
+ paddle.randn((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.randn((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.randn((1, 1, 2, 40), dtype=paddle.float16),
+ attention_op=attention_op,
+ )
except Exception as e:
raise e
- self._use_memory_efficient_attention_xformers = (
- use_memory_efficient_attention_xformers)
+ self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
self._attention_op = attention_op
def forward(self, hidden_states):
@@ -434,14 +408,14 @@ def forward(self, hidden_states):
value_proj = self.value(hidden_states)
query_proj = self.reshape_heads_to_batch_dim(
- query_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ query_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
key_proj = self.reshape_heads_to_batch_dim(
- key_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ key_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
value_proj = self.reshape_heads_to_batch_dim(
- value_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ value_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
if self._use_memory_efficient_attention_xformers:
hidden_states = F.scaled_dot_product_attention_(
@@ -452,19 +426,17 @@ def forward(self, hidden_states):
scale=self.scale,
dropout_p=0.0,
training=self.training,
- attention_op=self._attention_op, )
+ attention_op=self._attention_op,
+ )
else:
- attention_scores = (paddle.matmul(
- query_proj, key_proj, transpose_y=True) * self.scale)
- attention_probs = F.softmax(
- attention_scores.cast("float32"),
- axis=-1).cast(attention_scores.dtype)
+ attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale
+ attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype)
hidden_states = paddle.matmul(attention_probs, value_proj)
# reshape hidden_states
hidden_states = self.reshape_batch_dim_to_heads(
- hidden_states,
- transpose=not self._use_memory_efficient_attention_xformers)
+ hidden_states, transpose=not self._use_memory_efficient_attention_xformers
+ )
# compute next hidden_states
hidden_states = self.proj_attn(hidden_states)
@@ -483,8 +455,7 @@ def __init__(self, in_channels, mid_channels, out_channels, is_last=False):
self.has_conv_skip = in_channels != out_channels
if self.has_conv_skip:
- self.conv_skip = nn.Conv1D(
- in_channels, out_channels, 1, bias_attr=False)
+ self.conv_skip = nn.Conv1D(in_channels, out_channels, 1, bias_attr=False)
self.conv_1 = nn.Conv1D(in_channels, mid_channels, 5, padding=2)
self.group_norm_1 = nn.GroupNorm(1, mid_channels)
@@ -496,8 +467,7 @@ def __init__(self, in_channels, mid_channels, out_channels, is_last=False):
self.gelu_2 = nn.GELU()
def forward(self, hidden_states):
- residual = (self.conv_skip(hidden_states)
- if self.has_conv_skip else hidden_states)
+ residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
hidden_states = self.conv_1(hidden_states)
hidden_states = self.group_norm_1(hidden_states)
@@ -579,7 +549,7 @@ def forward(self, hidden_states, temb=None):
hidden_states = resnet(hidden_states)
hidden_states = attn(hidden_states)
- return hidden_states, (hidden_states, )
+ return hidden_states, (hidden_states,)
class DownBlock1D(nn.Layer):
@@ -602,7 +572,7 @@ def forward(self, hidden_states, temb=None):
for resnet in self.resnets:
hidden_states = resnet(hidden_states)
- return hidden_states, (hidden_states, )
+ return hidden_states, (hidden_states,)
class DownBlock1DNoSkip(nn.Layer):
@@ -623,7 +593,7 @@ def forward(self, hidden_states, temb=None):
for resnet in self.resnets:
hidden_states = resnet(hidden_states)
- return hidden_states, (hidden_states, )
+ return hidden_states, (hidden_states,)
class AttnUpBlock1D(nn.Layer):
@@ -648,8 +618,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None):
def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
res_hidden_states = res_hidden_states_tuple[-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
for resnet, attn in zip(self.resnets, self.attentions):
hidden_states = resnet(hidden_states)
@@ -676,8 +645,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None):
def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
res_hidden_states = res_hidden_states_tuple[-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
for resnet in self.resnets:
hidden_states = resnet(hidden_states)
@@ -695,16 +663,14 @@ def __init__(self, in_channels, out_channels, mid_channels=None):
resnets = [
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
ResConvBlock(mid_channels, mid_channels, mid_channels),
- ResConvBlock(
- mid_channels, mid_channels, out_channels, is_last=True),
+ ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
]
self.resnets = nn.LayerList(resnets)
def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
res_hidden_states = res_hidden_states_tuple[-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
for resnet in self.resnets:
hidden_states = resnet(hidden_states)
@@ -713,79 +679,77 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
def get_down_block(
- down_block_type,
- num_layers,
- in_channels,
- out_channels,
- temb_channels,
- add_downsample, ):
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+):
if down_block_type == "DownResnetBlock1D":
return DownResnetBlock1D(
in_channels=in_channels,
num_layers=num_layers,
out_channels=out_channels,
temb_channels=temb_channels,
- add_downsample=add_downsample, )
+ add_downsample=add_downsample,
+ )
elif down_block_type == "DownBlock1D":
return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
elif down_block_type == "AttnDownBlock1D":
- return AttnDownBlock1D(
- out_channels=out_channels, in_channels=in_channels)
+ return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
elif down_block_type == "DownBlock1DNoSkip":
- return DownBlock1DNoSkip(
- out_channels=out_channels, in_channels=in_channels)
+ return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
raise ValueError(f"{down_block_type} does not exist.")
-def get_up_block(up_block_type, num_layers, in_channels, out_channels,
- temb_channels, add_upsample):
+def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample):
if up_block_type == "UpResnetBlock1D":
return UpResnetBlock1D(
in_channels=in_channels,
num_layers=num_layers,
out_channels=out_channels,
temb_channels=temb_channels,
- add_upsample=add_upsample, )
+ add_upsample=add_upsample,
+ )
elif up_block_type == "UpBlock1D":
return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
elif up_block_type == "AttnUpBlock1D":
return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
elif up_block_type == "UpBlock1DNoSkip":
- return UpBlock1DNoSkip(
- in_channels=in_channels, out_channels=out_channels)
+ return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
raise ValueError(f"{up_block_type} does not exist.")
def get_mid_block(
- mid_block_type,
- num_layers,
- in_channels,
- mid_channels,
- out_channels,
- embed_dim,
- add_downsample, ):
+ mid_block_type,
+ num_layers,
+ in_channels,
+ mid_channels,
+ out_channels,
+ embed_dim,
+ add_downsample,
+):
if mid_block_type == "MidResTemporalBlock1D":
return MidResTemporalBlock1D(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
embed_dim=embed_dim,
- add_downsample=add_downsample, )
+ add_downsample=add_downsample,
+ )
elif mid_block_type == "ValueFunctionMidBlock1D":
- return ValueFunctionMidBlock1D(
- in_channels=in_channels,
- out_channels=out_channels,
- embed_dim=embed_dim)
+ return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim)
elif mid_block_type == "UNetMidBlock1D":
return UNetMidBlock1D(
in_channels=in_channels,
mid_channels=mid_channels,
- out_channels=out_channels, )
+ out_channels=out_channels,
+ )
raise ValueError(f"{mid_block_type} does not exist.")
-def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels,
- act_fn, fc_dim):
+def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim):
if out_block_type == "OutConv1DBlock":
return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn)
elif out_block_type == "ValueFunction":
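
The kernel-based downsampler reformatted above builds a (channels, channels, kernel) weight whose diagonal carries a fixed 1-D kernel, so each channel is filtered independently before a stride-2 convolution halves the length. A stand-alone sketch of that mechanic, using an assumed 4-tap averaging kernel rather than the "linear"/"cubic" kernels the class registers in __init__:

import paddle
import paddle.nn.functional as F

# Assumed toy input and kernel; only the mechanics mirror the class above.
hidden_states = paddle.randn([1, 4, 16])        # (batch, channels, length)
kernel_1d = paddle.to_tensor([0.25, 0.25, 0.25, 0.25])
pad = kernel_1d.shape[0] // 2 - 1                # same padding rule as self.pad

hidden_states = F.pad(hidden_states, (pad,) * 2, "reflect", data_format="NCL")

# Diagonal weight: channel i is convolved only with itself.
channels = hidden_states.shape[1]
weight = paddle.zeros([channels, channels, kernel_1d.shape[0]], dtype=hidden_states.dtype)
indices = paddle.arange(channels)
weight[indices, indices] = kernel_1d.cast(weight.dtype)

out = F.conv1d(hidden_states, weight, stride=2)
print(out.shape)  # [1, 4, 8] -- the temporal length is halved
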
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d.py b/ppdiffusers/ppdiffusers/models/unet_2d.py
index c3bcf99332789..f66b21a6a9e50 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d.py
@@ -83,37 +83,40 @@ class conditioning with `class_embed_type` equal to `None`.
@register_to_config
def __init__(
- self,
- sample_size: Optional[Union[int, Tuple[int, int]]]=None,
- in_channels: int=3,
- out_channels: int=3,
- center_input_sample: bool=False,
- time_embedding_type: str="positional",
- freq_shift: int=0,
- flip_sin_to_cos: bool=True,
- down_block_types: Tuple[str]=(
- "DownBlock2D",
- "AttnDownBlock2D",
- "AttnDownBlock2D",
- "AttnDownBlock2D", ),
- up_block_types: Tuple[str]=(
- "AttnUpBlock2D",
- "AttnUpBlock2D",
- "AttnUpBlock2D",
- "UpBlock2D", ),
- block_out_channels: Tuple[int]=(224, 448, 672, 896),
- layers_per_block: int=2,
- mid_block_scale_factor: float=1,
- downsample_padding: int=1,
- act_fn: str="silu",
- attention_head_dim: Optional[int]=8,
- norm_num_groups: int=32,
- norm_eps: float=1e-5,
- resnet_time_scale_shift: str="default",
- add_attention: bool=True,
- class_embed_type: Optional[str]=None,
- num_class_embeds: Optional[int]=None,
- resnet_pre_temb_non_linearity: Optional[bool]=False, ):
+ self,
+ sample_size: Optional[Union[int, Tuple[int, int]]] = None,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ center_input_sample: bool = False,
+ time_embedding_type: str = "positional",
+ freq_shift: int = 0,
+ flip_sin_to_cos: bool = True,
+ down_block_types: Tuple[str] = (
+ "DownBlock2D",
+ "AttnDownBlock2D",
+ "AttnDownBlock2D",
+ "AttnDownBlock2D",
+ ),
+ up_block_types: Tuple[str] = (
+ "AttnUpBlock2D",
+ "AttnUpBlock2D",
+ "AttnUpBlock2D",
+ "UpBlock2D",
+ ),
+ block_out_channels: Tuple[int] = (224, 448, 672, 896),
+ layers_per_block: int = 2,
+ mid_block_scale_factor: float = 1,
+ downsample_padding: int = 1,
+ act_fn: str = "silu",
+ attention_head_dim: Optional[int] = 8,
+ norm_num_groups: int = 32,
+ norm_eps: float = 1e-5,
+ resnet_time_scale_shift: str = "default",
+ add_attention: bool = True,
+ class_embed_type: Optional[str] = None,
+ num_class_embeds: Optional[int] = None,
+ resnet_pre_temb_non_linearity: Optional[bool] = False,
+ ):
super().__init__()
self.sample_size = sample_size
@@ -131,29 +134,23 @@ def __init__(
)
# input
- self.conv_in = nn.Conv2D(
- in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+ self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
# time
if time_embedding_type == "fourier":
- self.time_proj = GaussianFourierProjection(
- embedding_size=block_out_channels[0], scale=16)
+ self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16)
timestep_input_dim = 2 * block_out_channels[0]
elif time_embedding_type == "positional":
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos,
- freq_shift)
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
timestep_input_dim = block_out_channels[0]
- self.time_embedding = TimestepEmbedding(timestep_input_dim,
- time_embed_dim)
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
# class embedding
if class_embed_type is None and num_class_embeds is not None:
- self.class_embedding = nn.Embedding(num_class_embeds,
- time_embed_dim)
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
elif class_embed_type == "timestep":
- self.class_embedding = TimestepEmbedding(timestep_input_dim,
- time_embed_dim)
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
elif class_embed_type == "identity":
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
else:
@@ -195,7 +192,8 @@ def __init__(
attn_num_head_channels=attention_head_dim,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
self.down_blocks.append(down_block)
# mid
@@ -209,7 +207,8 @@ def __init__(
attn_num_head_channels=attention_head_dim,
resnet_groups=norm_num_groups,
add_attention=add_attention,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
# up
reversed_block_out_channels = list(reversed(block_out_channels))
@@ -217,8 +216,7 @@ def __init__(
for i, up_block_type in enumerate(up_block_types):
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
- input_channel = reversed_block_out_channels[min(
- i + 1, len(block_out_channels) - 1)]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
is_final_block = i == len(block_out_channels) - 1
@@ -235,27 +233,28 @@ def __init__(
resnet_groups=norm_num_groups,
attn_num_head_channels=attention_head_dim,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
self.up_blocks.append(up_block)
prev_output_channel = output_channel
# out
- num_groups_out = (norm_num_groups if norm_num_groups is not None else
- min(block_out_channels[0] // 4, 32))
+ num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
self.conv_norm_out = nn.GroupNorm(
num_channels=block_out_channels[0],
num_groups=num_groups_out,
- epsilon=norm_eps, )
+ epsilon=norm_eps,
+ )
self.conv_act = nn.Silu()
- self.conv_out = nn.Conv2D(
- block_out_channels[0], out_channels, kernel_size=3, padding=1)
+ self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, kernel_size=3, padding=1)
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- class_labels: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ) -> Union[UNet2DOutput, Tuple]:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ class_labels: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[UNet2DOutput, Tuple]:
r"""
Args:
sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor
@@ -284,7 +283,11 @@ def forward(
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps.expand([sample.shape[0], ])
+ timesteps = timesteps.expand(
+ [
+ sample.shape[0],
+ ]
+ )
t_emb = self.time_proj(timesteps)
@@ -296,9 +299,7 @@ def forward(
if self.class_embedding is not None:
if class_labels is None:
- raise ValueError(
- "class_labels should be provided when doing class conditioning"
- )
+ raise ValueError("class_labels should be provided when doing class conditioning")
class_labels = class_labels.cast(self.dtype)
@@ -315,7 +316,7 @@ def forward(
sample = self.conv_in(sample)
# 3. down
- down_block_res_samples = (sample, )
+ down_block_res_samples = (sample,)
if self.resnet_pre_temb_non_linearity:
emb = self.down_resnet_temb_nonlinearity(emb)
@@ -323,10 +324,10 @@ def forward(
for downsample_block in self.down_blocks:
if hasattr(downsample_block, "skip_conv"):
sample, res_samples, skip_sample = downsample_block(
- hidden_states=sample, temb=emb, skip_sample=skip_sample)
+ hidden_states=sample, temb=emb, skip_sample=skip_sample
+ )
else:
- sample, res_samples = downsample_block(
- hidden_states=sample, temb=emb)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
down_block_res_samples += res_samples
@@ -336,13 +337,11 @@ def forward(
# 5. up
skip_sample = None
for upsample_block in self.up_blocks:
- res_samples = down_block_res_samples[-len(upsample_block.resnets):]
- down_block_res_samples = down_block_res_samples[:-len(
- upsample_block.resnets)]
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
if hasattr(upsample_block, "skip_conv"):
- sample, skip_sample = upsample_block(sample, res_samples, emb,
- skip_sample)
+ sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
else:
sample = upsample_block(sample, res_samples, emb)
@@ -355,11 +354,10 @@ def forward(
sample += skip_sample
if self.config.time_embedding_type == "fourier":
- timesteps = timesteps.reshape(
- [sample.shape[0], *([1] * len(sample.shape[1:]))])
+ timesteps = timesteps.reshape([sample.shape[0], *([1] * len(sample.shape[1:]))])
sample = sample / timesteps
if not return_dict:
- return (sample, )
+ return (sample,)
return UNet2DOutput(sample=sample)
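
As a rough smoke-test sketch of the constructor and forward signatures shown above (not something prescribed by this diff): the module path is taken from the diff header, and every numeric value is an assumption chosen so that channel counts stay divisible by the default norm_num_groups=32. It presumes the Paddle port behaves like the upstream diffusers UNet2DModel.

import paddle
from ppdiffusers.models.unet_2d import UNet2DModel  # module path from the diff header

# Small, assumed config; the defaults in the signature above are much larger.
model = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    block_out_channels=(32, 64, 64, 64),  # each divisible by norm_num_groups=32
    layers_per_block=1,
)

sample = paddle.randn([1, 3, 32, 32])
out = model(sample, timestep=10).sample  # UNet2DOutput.sample
# With return_dict=False the call instead yields the (sample,) tuple seen at the end of forward.
print(out.shape)  # [1, 3, 32, 32]
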
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
index b49e5263c2077..5bfa7a33dcbff 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
@@ -22,36 +22,42 @@
from .attention import AdaGroupNorm, AttentionBlock
from .attention_processor import Attention, AttnAddedKVProcessor
from .dual_transformer_2d import DualTransformer2DModel
-from .resnet import (Downsample2D, FirDownsample2D, FirUpsample2D,
- KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D)
+from .resnet import (
+ Downsample2D,
+ FirDownsample2D,
+ FirUpsample2D,
+ KDownsample2D,
+ KUpsample2D,
+ ResnetBlock2D,
+ Upsample2D,
+)
from .transformer_2d import Transformer2DModel
def get_down_block(
- down_block_type,
- num_layers,
- in_channels,
- out_channels,
- temb_channels,
- add_downsample,
- resnet_eps,
- resnet_act_fn,
- attn_num_head_channels,
- resnet_groups=None,
- cross_attention_dim=None,
- downsample_padding=None,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False,
- resnet_time_scale_shift="default",
- resnet_skip_time_act=False,
- resnet_out_scale_factor=1.0,
- cross_attention_norm=None,
- resnet_pre_temb_non_linearity=False, ):
- down_block_type = (down_block_type[7:]
- if down_block_type.startswith("UNetRes") else
- down_block_type)
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ downsample_padding=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
+ resnet_pre_temb_non_linearity=False,
+):
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
if down_block_type == "DownBlock2D":
return DownBlock2D(
num_layers=num_layers,
@@ -64,7 +70,8 @@ def get_down_block(
resnet_groups=resnet_groups,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "ResnetDownsampleBlock2D":
return ResnetDownsampleBlock2D(
num_layers=num_layers,
@@ -78,7 +85,8 @@ def get_down_block(
resnet_time_scale_shift=resnet_time_scale_shift,
skip_time_act=resnet_skip_time_act,
output_scale_factor=resnet_out_scale_factor,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "AttnDownBlock2D":
return AttnDownBlock2D(
num_layers=num_layers,
@@ -92,11 +100,11 @@ def get_down_block(
downsample_padding=downsample_padding,
attn_num_head_channels=attn_num_head_channels,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "CrossAttnDownBlock2D":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for CrossAttnDownBlock2D")
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
return CrossAttnDownBlock2D(
num_layers=num_layers,
in_channels=in_channels,
@@ -114,12 +122,11 @@ def get_down_block(
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "SimpleCrossAttnDownBlock2D":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D"
- )
+ raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D")
return SimpleCrossAttnDownBlock2D(
num_layers=num_layers,
in_channels=in_channels,
@@ -136,7 +143,8 @@ def get_down_block(
output_scale_factor=resnet_out_scale_factor,
only_cross_attention=only_cross_attention,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "SkipDownBlock2D":
return SkipDownBlock2D(
num_layers=num_layers,
@@ -148,7 +156,8 @@ def get_down_block(
resnet_act_fn=resnet_act_fn,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "AttnSkipDownBlock2D":
return AttnSkipDownBlock2D(
num_layers=num_layers,
@@ -161,7 +170,8 @@ def get_down_block(
downsample_padding=downsample_padding,
attn_num_head_channels=attn_num_head_channels,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "DownEncoderBlock2D":
return DownEncoderBlock2D(
num_layers=num_layers,
@@ -173,7 +183,8 @@ def get_down_block(
resnet_groups=resnet_groups,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "AttnDownEncoderBlock2D":
return AttnDownEncoderBlock2D(
num_layers=num_layers,
@@ -186,7 +197,8 @@ def get_down_block(
downsample_padding=downsample_padding,
attn_num_head_channels=attn_num_head_channels,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "KDownBlock2D":
return KDownBlock2D(
num_layers=num_layers,
@@ -196,7 +208,8 @@ def get_down_block(
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "KCrossAttnDownBlock2D":
return KCrossAttnDownBlock2D(
num_layers=num_layers,
@@ -209,34 +222,35 @@ def get_down_block(
cross_attention_dim=cross_attention_dim,
attn_num_head_channels=attn_num_head_channels,
add_self_attention=True if not add_downsample else False,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
raise ValueError(f"{down_block_type} does not exist.")
def get_up_block(
- up_block_type,
- num_layers,
- in_channels,
- out_channels,
- prev_output_channel,
- temb_channels,
- add_upsample,
- resnet_eps,
- resnet_act_fn,
- attn_num_head_channels,
- resnet_groups=None,
- cross_attention_dim=None,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False,
- resnet_time_scale_shift="default",
- resnet_skip_time_act=False,
- resnet_out_scale_factor=1.0,
- cross_attention_norm=None,
- resnet_pre_temb_non_linearity=False, ):
- up_block_type = (up_block_type[7:]
- if up_block_type.startswith("UNetRes") else up_block_type)
+ up_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ prev_output_channel,
+ temb_channels,
+ add_upsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
+ resnet_pre_temb_non_linearity=False,
+):
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
if up_block_type == "UpBlock2D":
return UpBlock2D(
num_layers=num_layers,
@@ -249,7 +263,8 @@ def get_up_block(
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "ResnetUpsampleBlock2D":
return ResnetUpsampleBlock2D(
num_layers=num_layers,
@@ -264,11 +279,11 @@ def get_up_block(
resnet_time_scale_shift=resnet_time_scale_shift,
skip_time_act=resnet_skip_time_act,
output_scale_factor=resnet_out_scale_factor,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "CrossAttnUpBlock2D":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for CrossAttnUpBlock2D")
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
return CrossAttnUpBlock2D(
num_layers=num_layers,
in_channels=in_channels,
@@ -286,12 +301,11 @@ def get_up_block(
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "SimpleCrossAttnUpBlock2D":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D"
- )
+ raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D")
return SimpleCrossAttnUpBlock2D(
num_layers=num_layers,
in_channels=in_channels,
@@ -309,7 +323,8 @@ def get_up_block(
output_scale_factor=resnet_out_scale_factor,
only_cross_attention=only_cross_attention,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "AttnUpBlock2D":
return AttnUpBlock2D(
num_layers=num_layers,
@@ -323,7 +338,8 @@ def get_up_block(
resnet_groups=resnet_groups,
attn_num_head_channels=attn_num_head_channels,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "SkipUpBlock2D":
return SkipUpBlock2D(
num_layers=num_layers,
@@ -335,7 +351,8 @@ def get_up_block(
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "AttnSkipUpBlock2D":
return AttnSkipUpBlock2D(
num_layers=num_layers,
@@ -348,7 +365,8 @@ def get_up_block(
resnet_act_fn=resnet_act_fn,
attn_num_head_channels=attn_num_head_channels,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "UpDecoderBlock2D":
return UpDecoderBlock2D(
num_layers=num_layers,
@@ -359,7 +377,8 @@ def get_up_block(
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "AttnUpDecoderBlock2D":
return AttnUpDecoderBlock2D(
num_layers=num_layers,
@@ -371,7 +390,8 @@ def get_up_block(
resnet_groups=resnet_groups,
attn_num_head_channels=attn_num_head_channels,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "KUpBlock2D":
return KUpBlock2D(
num_layers=num_layers,
@@ -381,7 +401,8 @@ def get_up_block(
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "KCrossAttnUpBlock2D":
return KCrossAttnUpBlock2D(
num_layers=num_layers,
@@ -393,30 +414,31 @@ def get_up_block(
resnet_act_fn=resnet_act_fn,
cross_attention_dim=cross_attention_dim,
attn_num_head_channels=attn_num_head_channels,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
raise ValueError(f"{up_block_type} does not exist.")
class UNetMidBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- add_attention: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ add_attention: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
- resnet_groups = (resnet_groups if resnet_groups is not None else
- min(in_channels // 4, 32))
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
self.add_attention = add_attention
# there is always at least one resnet
@@ -432,7 +454,8 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
]
attentions = []
@@ -444,7 +467,9 @@ def __init__(
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
else:
attentions.append(None)
@@ -460,7 +485,9 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
@@ -477,29 +504,29 @@ def forward(self, hidden_states, temb=None):
class UNetMidBlock2DCrossAttn(nn.Layer):
def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- cross_attention_dim: int=1280,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ cross_attention_dim: int = 1280,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.has_cross_attention = True
self.attn_num_head_channels = attn_num_head_channels
- resnet_groups = (resnet_groups if resnet_groups is not None else
- min(in_channels // 4, 32))
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
# there is always at least one resnet
resnets = [
@@ -514,7 +541,8 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
]
attentions = []
@@ -529,7 +557,9 @@ def __init__(
cross_attention_dim=cross_attention_dim,
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
else:
attentions.append(
DualTransformer2DModel(
@@ -538,7 +568,9 @@ def __init__(
in_channels=in_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
resnets.append(
ResnetBlock2D(
in_channels=in_channels,
@@ -551,24 +583,28 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
hidden_states = resnet(hidden_states, temb)
return hidden_states
@@ -576,30 +612,30 @@ def forward(
class UNetMidBlock2DSimpleCrossAttn(nn.Layer):
def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- cross_attention_dim: int=1280,
- skip_time_act=False,
- only_cross_attention=False,
- cross_attention_norm=None,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ cross_attention_dim: int = 1280,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.has_cross_attention = True
self.attn_num_head_channels = attn_num_head_channels
- resnet_groups = (resnet_groups if resnet_groups is not None else
- min(in_channels // 4, 32))
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
self.num_heads = in_channels // self.attn_num_head_channels
@@ -617,7 +653,8 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
]
attentions = []
@@ -639,7 +676,9 @@ def __init__(
upcast_softmax=True,
only_cross_attention=only_cross_attention,
cross_attention_norm=cross_attention_norm,
- processor=processor, ))
+ processor=processor,
+ )
+ )
resnets.append(
ResnetBlock2D(
in_channels=in_channels,
@@ -653,20 +692,22 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
- cross_attention_kwargs = (cross_attention_kwargs if
- cross_attention_kwargs is not None else {})
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
# attn
@@ -674,7 +715,8 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
# resnet
hidden_states = resnet(hidden_states, temb)
@@ -684,22 +726,23 @@ def forward(
class AttnDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- downsample_padding: int=1,
- add_downsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ downsample_padding: int = 1,
+ add_downsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -718,27 +761,34 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
attentions.append(
AttentionBlock(
out_channels,
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
@@ -748,40 +798,41 @@ def forward(self, hidden_states, temb=None):
for resnet, attn in zip(self.resnets, self.attentions):
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
class CrossAttnDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- cross_attention_dim: int=1280,
- output_scale_factor: float=1.0,
- downsample_padding: int=1,
- add_downsample: bool=True,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- only_cross_attention: bool=False,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ cross_attention_dim: int = 1280,
+ output_scale_factor: float = 1.0,
+ downsample_padding: int = 1,
+ add_downsample: bool = True,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -803,7 +854,9 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
if not dual_cross_attention:
attentions.append(
Transformer2DModel(
@@ -815,7 +868,9 @@ def __init__(
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
else:
attentions.append(
DualTransformer2DModel(
@@ -824,99 +879,103 @@ def __init__(
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None,
- additional_residuals=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ additional_residuals=None,
+ ):
# TODO(Patrick, William) - attention mask is not used
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module, return_dict=None):
def custom_forward(*inputs):
if return_dict is not None:
- return module(
- *inputs, return_dict=return_dict)[
- 0] # move [0] when paddlepaddle <= 2.4.1
+ return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1
else:
return module(*inputs)
return custom_forward
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(
- attn, return_dict=False),
+ create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
- cross_attention_kwargs, ) # [0]
+ cross_attention_kwargs,
+ ) # [0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if additional_residuals is not None:
hidden_states += additional_residuals
# westfish: add to align with torch features
- output_states = tuple(output_states[:-1]) + (hidden_states, )
+ output_states = tuple(output_states[:-1]) + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
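
Beyond the reformat, this hunk keeps the additional_residuals hook intact: after the last resnet/attention pair, an externally supplied tensor (typically injected adapter features) is added to hidden_states, and the last entry of output_states is replaced so the stored skip matches what is returned (the "# westfish" comment marks it as a torch-alignment change). A hedged sketch of just that bookkeeping, with made-up shapes:

    import paddle

    hidden_states = paddle.randn([1, 32, 8, 8])
    output_states = (paddle.randn([1, 32, 8, 8]), hidden_states)  # states collected in the loop
    additional_residuals = paddle.randn([1, 32, 8, 8])            # e.g. externally injected features

    hidden_states = hidden_states + additional_residuals
    # Replace the last stored state so the skip tuple matches the tensor we return.
    output_states = tuple(output_states[:-1]) + (hidden_states,)
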
class DownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_downsample: bool=True,
- downsample_padding: int=1,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_downsample: bool = True,
+ downsample_padding: int = 1,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
@@ -934,19 +993,24 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
@@ -956,8 +1020,7 @@ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module):
def custom_forward(*inputs):
@@ -965,38 +1028,38 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
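
The one-line recompute(create_custom_forward(resnet), hidden_states, temb) calls that these hunks produce are Paddle's activation recomputation (the analogue of torch.utils.checkpoint): the resnet's forward is wrapped in a closure, its activations are dropped after the forward pass, and the closure is re-run during backward. A minimal sketch, assuming recompute comes from paddle.distributed.fleet.utils; the toy nn.Linear layer is illustrative, not from this file:

    import paddle
    import paddle.nn as nn
    from paddle.distributed.fleet.utils import recompute  # assumed import path

    layer = nn.Linear(16, 16)
    x = paddle.randn([4, 16])
    x.stop_gradient = False  # checkpointing is skipped when the input needs no gradient

    def create_custom_forward(module):
        # Defer the call so recompute can re-execute it in the backward pass.
        def custom_forward(*inputs):
            return module(*inputs)
        return custom_forward

    y = recompute(create_custom_forward(layer), x)
    y.sum().backward()  # activations for `layer` are recomputed here instead of stored
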
class DownEncoderBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_downsample: bool=True,
- downsample_padding: int=1,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_downsample: bool = True,
+ downsample_padding: int = 1,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
@@ -1014,19 +1077,24 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
@@ -1043,21 +1111,22 @@ def forward(self, hidden_states):
class AttnDownEncoderBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- add_downsample: bool=True,
- downsample_padding: int=1,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ add_downsample: bool = True,
+ downsample_padding: int = 1,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -1076,27 +1145,34 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
attentions.append(
AttentionBlock(
out_channels,
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
@@ -1114,21 +1190,22 @@ def forward(self, hidden_states):
class AttnSkipDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=np.sqrt(2.0),
- downsample_padding: int=1,
- add_downsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = np.sqrt(2.0),
+ downsample_padding: int = 1,
+ add_downsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.attentions = nn.LayerList([])
self.resnets = nn.LayerList([])
@@ -1148,13 +1225,17 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions.append(
AttentionBlock(
out_channels,
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
- eps=resnet_eps, ))
+ eps=resnet_eps,
+ )
+ )
if add_downsample:
self.resnet_down = ResnetBlock2D(
@@ -1171,12 +1252,10 @@ def __init__(
use_in_shortcut=True,
down=True,
kernel="fir",
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- self.downsamplers = nn.LayerList(
- [FirDownsample2D(
- out_channels, out_channels=out_channels)])
- self.skip_conv = nn.Conv2D(
- 3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)])
+ self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
else:
self.resnet_down = None
self.downsamplers = None
@@ -1188,7 +1267,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None):
for resnet, attn in zip(self.resnets, self.attentions):
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
hidden_states = self.resnet_down(hidden_states, temb)
@@ -1197,27 +1276,28 @@ def forward(self, hidden_states, temb=None, skip_sample=None):
hidden_states = self.skip_conv(skip_sample) + hidden_states
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states, skip_sample
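
AttnSkipDownBlock2D (and SkipDownBlock2D below) also thread a skip_sample through the network: the 3-channel input image is downsampled in step with the features and merged back in through a 1x1 conv. A toy version of that merge, using average pooling as a stand-in for FirDownsample2D and hypothetical shapes:

    import paddle
    import paddle.nn as nn

    skip_conv = nn.Conv2D(3, 64, kernel_size=1)  # 3-channel skip image -> feature channels
    pool = nn.AvgPool2D(kernel_size=2)           # stand-in for FirDownsample2D

    hidden_states = paddle.randn([1, 64, 32, 32])
    skip_sample = paddle.randn([1, 3, 64, 64])   # progressively downsampled input image

    skip_sample = pool(skip_sample)                          # 64x64 -> 32x32, matches the features
    hidden_states = skip_conv(skip_sample) + hidden_states   # fuse the skip image into the features
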
class SkipDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_pre_norm: bool=True,
- output_scale_factor: float=np.sqrt(2.0),
- add_downsample: bool=True,
- downsample_padding: int=1,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = np.sqrt(2.0),
+ add_downsample: bool = True,
+ downsample_padding: int = 1,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.resnets = nn.LayerList([])
@@ -1236,7 +1316,9 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
if add_downsample:
self.resnet_down = ResnetBlock2D(
@@ -1253,12 +1335,10 @@ def __init__(
use_in_shortcut=True,
down=True,
kernel="fir",
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- self.downsamplers = nn.LayerList(
- [FirDownsample2D(
- out_channels, out_channels=out_channels)])
- self.skip_conv = nn.Conv2D(
- 3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)])
+ self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
else:
self.resnet_down = None
self.downsamplers = None
@@ -1269,7 +1349,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None):
for resnet in self.resnets:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
hidden_states = self.resnet_down(hidden_states, temb)
@@ -1278,28 +1358,29 @@ def forward(self, hidden_states, temb=None, skip_sample=None):
hidden_states = self.skip_conv(skip_sample) + hidden_states
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states, skip_sample
class ResnetDownsampleBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_downsample: bool=True,
- skip_time_act: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_downsample: bool = True,
+ skip_time_act: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
@@ -1318,27 +1399,32 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- ResnetBlock2D(
- in_channels=out_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- skip_time_act=skip_time_act,
- down=True,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ skip_time_act=skip_time_act,
+ down=True,
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ ]
+ )
else:
self.downsamplers = None
@@ -1348,8 +1434,7 @@ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module):
def custom_forward(*inputs):
@@ -1357,43 +1442,43 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
class SimpleCrossAttnDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- cross_attention_dim: int=1280,
- output_scale_factor: float=1.0,
- add_downsample: bool=True,
- skip_time_act=False,
- only_cross_attention=False,
- cross_attention_norm=None,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ cross_attention_dim: int = 1280,
+ output_scale_factor: float = 1.0,
+ add_downsample: bool = True,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.has_cross_attention = True
@@ -1419,7 +1504,9 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
# TODO use AttnAddedKVProcessor2_5
# processor = (
# AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor()
@@ -1437,42 +1524,47 @@ def __init__(
upcast_softmax=True,
only_cross_attention=only_cross_attention,
cross_attention_norm=cross_attention_norm,
- processor=processor, ))
+ processor=processor,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- ResnetBlock2D(
- in_channels=out_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- skip_time_act=skip_time_act,
- down=True,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ skip_time_act=skip_time_act,
+ down=True,
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ ]
+ )
else:
self.downsamplers = None
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
output_states = ()
- cross_attention_kwargs = (cross_attention_kwargs if
- cross_attention_kwargs is not None else {})
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
for resnet, attn in zip(self.resnets, self.attentions):
# resnet
@@ -1483,32 +1575,34 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
class KDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=4,
- resnet_eps: float=1e-5,
- resnet_act_fn: str="gelu",
- resnet_group_size: int=32,
- add_downsample: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 4,
+ resnet_eps: float = 1e-5,
+ resnet_act_fn: str = "gelu",
+ resnet_group_size: int = 32,
+ add_downsample: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
@@ -1529,7 +1623,9 @@ def __init__(
non_linearity=resnet_act_fn,
time_embedding_norm="ada_group",
conv_shortcut_bias=False,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
@@ -1545,8 +1641,7 @@ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module):
def custom_forward(*inputs):
@@ -1554,12 +1649,11 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
@@ -1570,20 +1664,21 @@ def custom_forward(*inputs):
class KCrossAttnDownBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- cross_attention_dim: int,
- dropout: float=0.0,
- num_layers: int=4,
- resnet_group_size: int=32,
- add_downsample=True,
- attn_num_head_channels: int=64,
- add_self_attention: bool=False,
- resnet_eps: float=1e-5,
- resnet_act_fn: str="gelu",
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ cross_attention_dim: int,
+ dropout: float = 0.0,
+ num_layers: int = 4,
+ resnet_group_size: int = 32,
+ add_downsample=True,
+ attn_num_head_channels: int = 64,
+ add_self_attention: bool = False,
+ resnet_eps: float = 1e-5,
+ resnet_act_fn: str = "gelu",
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -1607,7 +1702,9 @@ def __init__(
non_linearity=resnet_act_fn,
time_embedding_norm="ada_group",
conv_shortcut_bias=False,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
attentions.append(
KAttentionBlock(
out_channels,
@@ -1618,7 +1715,9 @@ def __init__(
attention_bias=True,
add_self_attention=add_self_attention,
cross_attention_norm="layer_norm",
- group_size=resnet_group_size, ))
+ group_size=resnet_group_size,
+ )
+ )
self.resnets = nn.LayerList(resnets)
self.attentions = nn.LayerList(attentions)
@@ -1631,17 +1730,17 @@ def __init__(
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module, return_dict=None):
def custom_forward(*inputs):
@@ -1652,15 +1751,14 @@ def custom_forward(*inputs):
return custom_forward
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(
- attn, return_dict=False),
+ create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
attention_mask,
- cross_attention_kwargs, )
+ cross_attention_kwargs,
+ )
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -1668,12 +1766,13 @@ def custom_forward(*inputs):
encoder_hidden_states=encoder_hidden_states,
emb=temb,
attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
if self.downsamplers is None:
- output_states += (None, )
+ output_states += (None,)
else:
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
@@ -1684,29 +1783,29 @@ def custom_forward(*inputs):
class AttnUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -1721,23 +1820,24 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
attentions.append(
AttentionBlock(
out_channels,
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
@@ -1746,8 +1846,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(hidden_states)
@@ -1761,27 +1860,28 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
class CrossAttnUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- cross_attention_dim: int=1280,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- only_cross_attention: bool=False,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ prev_output_channel: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ cross_attention_dim: int = 1280,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -1790,8 +1890,7 @@ def __init__(
self.attn_num_head_channels = attn_num_head_channels
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -1806,7 +1905,9 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
if not dual_cross_attention:
attentions.append(
Transformer2DModel(
@@ -1818,7 +1919,9 @@ def __init__(
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
else:
attentions.append(
DualTransformer2DModel(
@@ -1827,64 +1930,61 @@ def __init__(
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None, ):
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ cross_attention_kwargs=None,
+ upsample_size=None,
+ attention_mask=None,
+ ):
# TODO(Patrick, William) - attention mask is not used
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module, return_dict=None):
def custom_forward(*inputs):
if return_dict is not None:
- return module(
- *inputs, return_dict=return_dict)[0] # move [0]
+ return module(*inputs, return_dict=return_dict)[0] # move [0]
else:
return module(*inputs)
return custom_forward
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(
- attn, return_dict=False),
+ create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
- cross_attention_kwargs, ) # [0]
+ cross_attention_kwargs,
+ ) # [0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@@ -1895,27 +1995,27 @@ def custom_forward(*inputs):
class UpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -1930,34 +2030,27 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
self.gradient_checkpointing = False
- def forward(self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- upsample_size=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module):
def custom_forward(*inputs):
@@ -1965,8 +2058,7 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
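
The UpBlock2D hunks keep the mirror image of the down-path bookkeeping: residuals are popped from the end of res_hidden_states_tuple (last in, first out) and concatenated on the channel axis before each resnet, which then maps the widened tensor back to out_channels. A small sketch of the pop-and-concat step with made-up tensors:

    import paddle

    hidden_states = paddle.randn([1, 64, 16, 16])
    res_hidden_states_tuple = (paddle.randn([1, 64, 16, 16]), paddle.randn([1, 64, 16, 16]))

    # Pop the most recent skip and widen the channel dimension (64 + 64 -> 128);
    # in the real block the following resnet reduces it back to out_channels.
    res_hidden_states = res_hidden_states_tuple[-1]
    res_hidden_states_tuple = res_hidden_states_tuple[:-1]
    hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
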
@@ -1979,19 +2071,20 @@ def custom_forward(*inputs):
class UpDecoderBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
@@ -2010,15 +2103,14 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
@@ -2035,20 +2127,21 @@ def forward(self, hidden_states):
class AttnUpDecoderBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -2068,23 +2161,24 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
attentions.append(
AttentionBlock(
out_channels,
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
@@ -2102,29 +2196,29 @@ def forward(self, hidden_states):
class AttnSkipUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=np.sqrt(2.0),
- upsample_padding: int=1,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = np.sqrt(2.0),
+ upsample_padding: int = 1,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.attentions = nn.LayerList([])
self.resnets = nn.LayerList([])
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
self.resnets.append(
@@ -2140,14 +2234,18 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions.append(
AttentionBlock(
out_channels,
num_head_channels=attn_num_head_channels,
rescale_output_factor=output_scale_factor,
- eps=resnet_eps, ))
+ eps=resnet_eps,
+ )
+ )
self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
if add_upsample:
@@ -2166,17 +2264,14 @@ def __init__(
use_in_shortcut=True,
up=True,
kernel="fir",
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- self.skip_conv = nn.Conv2D(
- out_channels,
- 3,
- kernel_size=(3, 3),
- stride=(1, 1),
- padding=(1, 1))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
self.skip_norm = nn.GroupNorm(
num_groups=min(out_channels // 4, 32),
num_channels=out_channels,
- epsilon=resnet_eps, )
+ epsilon=resnet_eps,
+ )
self.act = nn.Silu()
else:
self.resnet_up = None
@@ -2184,17 +2279,12 @@ def __init__(
self.skip_norm = None
self.act = None
- def forward(self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- skip_sample=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
@@ -2219,27 +2309,27 @@ def forward(self,
class SkipUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_pre_norm: bool=True,
- output_scale_factor: float=np.sqrt(2.0),
- add_upsample: bool=True,
- upsample_padding: int=1,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = np.sqrt(2.0),
+ add_upsample: bool = True,
+ upsample_padding: int = 1,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.resnets = nn.LayerList([])
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
self.resnets.append(
@@ -2248,15 +2338,16 @@ def __init__(
out_channels=out_channels,
temb_channels=temb_channels,
eps=resnet_eps,
- groups=min((resnet_in_channels + res_skip_channels) // 4,
- 32),
+ groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
groups_out=min(out_channels // 4, 32),
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
if add_upsample:
@@ -2275,17 +2366,14 @@ def __init__(
use_in_shortcut=True,
up=True,
kernel="fir",
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- self.skip_conv = nn.Conv2D(
- out_channels,
- 3,
- kernel_size=(3, 3),
- stride=(1, 1),
- padding=(1, 1))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
self.skip_norm = nn.GroupNorm(
num_groups=min(out_channels // 4, 32),
num_channels=out_channels,
- epsilon=resnet_eps, )
+ epsilon=resnet_eps,
+ )
self.act = nn.Silu()
else:
self.resnet_up = None
@@ -2293,17 +2381,12 @@ def __init__(
self.skip_norm = None
self.act = None
- def forward(self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- skip_sample=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
@@ -2326,28 +2409,28 @@ def forward(self,
class ResnetUpsampleBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- skip_time_act=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ skip_time_act=False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -2363,46 +2446,45 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- ResnetBlock2D(
- in_channels=out_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- skip_time_act=skip_time_act,
- up=True,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- ])
+ self.upsamplers = nn.LayerList(
+ [
+ ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ skip_time_act=skip_time_act,
+ up=True,
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ ]
+ )
else:
self.upsamplers = None
self.gradient_checkpointing = False
- def forward(self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- upsample_size=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module):
def custom_forward(*inputs):
@@ -2410,8 +2492,7 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
@@ -2424,26 +2505,27 @@ def custom_forward(*inputs):
class SimpleCrossAttnUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- cross_attention_dim: int=1280,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- skip_time_act=False,
- only_cross_attention=False,
- cross_attention_norm=None,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ prev_output_channel: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ cross_attention_dim: int = 1280,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -2454,8 +2536,7 @@ def __init__(
self.num_heads = out_channels // self.attn_num_head_channels
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -2471,7 +2552,9 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
# TODO support AttnAddedKVProcessor2_5
# processor = (
# AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor()
@@ -2489,50 +2572,54 @@ def __init__(
upcast_softmax=True,
only_cross_attention=only_cross_attention,
cross_attention_norm=cross_attention_norm,
- processor=processor, ))
+ processor=processor,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- ResnetBlock2D(
- in_channels=out_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- skip_time_act=skip_time_act,
- up=True,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
- ])
+ self.upsamplers = nn.LayerList(
+ [
+ ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ skip_time_act=skip_time_act,
+ up=True,
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ ]
+ )
else:
self.upsamplers = None
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- upsample_size=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
- cross_attention_kwargs = (cross_attention_kwargs if
- cross_attention_kwargs is not None else {})
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ upsample_size=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
for resnet, attn in zip(self.resnets, self.attentions):
# resnet
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
@@ -2541,7 +2628,8 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@@ -2552,17 +2640,18 @@ def forward(
class KUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=5,
- resnet_eps: float=1e-5,
- resnet_act_fn: str="gelu",
- resnet_group_size: Optional[int]=32,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 5,
+ resnet_eps: float = 1e-5,
+ resnet_act_fn: str = "gelu",
+ resnet_group_size: Optional[int] = 32,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
k_in_channels = 2 * out_channels
@@ -2577,8 +2666,7 @@ def __init__(
resnets.append(
ResnetBlock2D(
in_channels=in_channels,
- out_channels=k_out_channels
- if (i == num_layers - 1) else out_channels,
+ out_channels=k_out_channels if (i == num_layers - 1) else out_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=groups,
@@ -2587,7 +2675,9 @@ def __init__(
non_linearity=resnet_act_fn,
time_embedding_norm="ada_group",
conv_shortcut_bias=False,
- pre_norm=resnet_pre_temb_non_linearity, ))
+ pre_norm=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
@@ -2598,19 +2688,13 @@ def __init__(
self.gradient_checkpointing = False
- def forward(self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- upsample_size=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
res_hidden_states_tuple = res_hidden_states_tuple[-1]
if res_hidden_states_tuple is not None:
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states_tuple], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1)
for resnet in self.resnets:
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:

                def create_custom_forward(module):
def custom_forward(*inputs):
@@ -2618,8 +2702,7 @@ def custom_forward(*inputs):
                    return custom_forward

-                hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
@@ -2632,20 +2715,21 @@ def custom_forward(*inputs):
class KCrossAttnUpBlock2D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=4,
- resnet_eps: float=1e-5,
- resnet_act_fn: str="gelu",
- resnet_group_size: int=32,
- attn_num_head_channels=1, # attention dim_head
- cross_attention_dim: int=768,
- add_upsample: bool=True,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 4,
+ resnet_eps: float = 1e-5,
+ resnet_act_fn: str = "gelu",
+ resnet_group_size: int = 32,
+ attn_num_head_channels=1, # attention dim_head
+ cross_attention_dim: int = 768,
+ add_upsample: bool = True,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -2686,20 +2770,24 @@ def __init__(
non_linearity=resnet_act_fn,
time_embedding_norm="ada_group",
conv_shortcut_bias=False,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
attentions.append(
KAttentionBlock(
k_out_channels if (i == num_layers - 1) else out_channels,
k_out_channels // attn_num_head_channels
- if (i == num_layers - 1) else out_channels //
- attn_num_head_channels,
+ if (i == num_layers - 1)
+ else out_channels // attn_num_head_channels,
attn_num_head_channels,
cross_attention_dim=cross_attention_dim,
temb_channels=temb_channels,
attention_bias=True,
add_self_attention=add_self_attention,
cross_attention_norm="layer_norm",
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
self.resnets = nn.LayerList(resnets)
self.attentions = nn.LayerList(attentions)
@@ -2712,42 +2800,39 @@ def __init__(
        self.gradient_checkpointing = False

    def forward(
- self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None, ):
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ cross_attention_kwargs=None,
+ upsample_size=None,
+ attention_mask=None,
+ ):
res_hidden_states_tuple = res_hidden_states_tuple[-1]
if res_hidden_states_tuple is not None:
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states_tuple], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1)
for resnet, attn in zip(self.resnets, self.attentions):
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:

                def create_custom_forward(module, return_dict=None):
def custom_forward(*inputs):
if return_dict is not None:
- return module(
- *inputs, return_dict=return_dict)[0] # move [0]
+ return module(*inputs, return_dict=return_dict)[0] # move [0]
else:
return module(*inputs)
                    return custom_forward

+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(
- attn, return_dict=False),
+ create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
attention_mask,
- cross_attention_kwargs, ) # [0]
+ cross_attention_kwargs,
+ ) # [0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -2755,7 +2840,8 @@ def custom_forward(*inputs):
encoder_hidden_states=encoder_hidden_states,
emb=temb,
attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@@ -2783,25 +2869,25 @@ class KAttentionBlock(nn.Layer):
"""
def __init__(
- self,
- dim: int,
- num_attention_heads: int,
- attention_head_dim: int,
- dropout: float=0.0,
- cross_attention_dim: Optional[int]=None,
- attention_bias: bool=False,
- upcast_attention: bool=False,
- temb_channels: int=768, # for ada_group_norm
- add_self_attention: bool=False,
- cross_attention_norm: Optional[str]=None,
- group_size: int=32, ):
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout: float = 0.0,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ upcast_attention: bool = False,
+ temb_channels: int = 768, # for ada_group_norm
+ add_self_attention: bool = False,
+ cross_attention_norm: Optional[str] = None,
+ group_size: int = 32,
+ ):
super().__init__()
self.add_self_attention = add_self_attention
# 1. Self-Attn
if add_self_attention:
- self.norm1 = AdaGroupNorm(temb_channels, dim,
- max(1, dim // group_size))
+ self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
@@ -2809,7 +2895,8 @@ def __init__(
dropout=dropout,
bias=attention_bias,
cross_attention_dim=None,
- cross_attention_norm=None, )
+ cross_attention_norm=None,
+ )
# 2. Cross-Attn
self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
@@ -2821,25 +2908,24 @@ def __init__(
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
- cross_attention_norm=cross_attention_norm, )
+ cross_attention_norm=cross_attention_norm,
+        )

    def _to_3d(self, hidden_states, height, weight):
- return hidden_states.transpose([0, 2, 3, 1]).reshape(
- [hidden_states.shape[0], height * weight, -1])
+        return hidden_states.transpose([0, 2, 3, 1]).reshape([hidden_states.shape[0], height * weight, -1])

    def _to_4d(self, hidden_states, height, weight):
- return hidden_states.transpose([0, 2, 1]).reshape(
- [hidden_states.shape[0], -1, height, weight])
+        return hidden_states.transpose([0, 2, 1]).reshape([hidden_states.shape[0], -1, height, weight])

    def forward(
- self,
- hidden_states,
- encoder_hidden_states=None,
- emb=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
- cross_attention_kwargs = (cross_attention_kwargs if
- cross_attention_kwargs is not None else {})
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ emb=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
# 1. Self-Attention
if self.add_self_attention:
@@ -2851,7 +2937,8 @@ def forward(
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=None,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
attn_output = self._to_4d(attn_output, height, weight)
hidden_states = attn_output + hidden_states
@@ -2864,7 +2951,8 @@ def forward(
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
attn_output = self._to_4d(attn_output, height, weight)
hidden_states = attn_output + hidden_states
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
index 173a1185da9a8..606a6f0b91ba5 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
@@ -23,13 +23,23 @@
from ..loaders import UNet2DConditionLoadersMixin
from ..utils import NEG_INF, BaseOutput, logging
from .attention_processor import AttentionProcessor, AttnProcessor
-from .embeddings import (GaussianFourierProjection, TextTimeEmbedding,
- TimestepEmbedding, Timesteps)
+from .embeddings import (
+ GaussianFourierProjection,
+ TextTimeEmbedding,
+ TimestepEmbedding,
+ Timesteps,
+)
from .modeling_utils import ModelMixin
-from .unet_2d_blocks import (CrossAttnDownBlock2D, CrossAttnUpBlock2D,
- DownBlock2D, UNetMidBlock2DCrossAttn,
- UNetMidBlock2DSimpleCrossAttn, UpBlock2D,
- get_down_block, get_up_block)
+from .unet_2d_blocks import (
+ CrossAttnDownBlock2D,
+ CrossAttnUpBlock2D,
+ DownBlock2D,
+ UNetMidBlock2DCrossAttn,
+ UNetMidBlock2DSimpleCrossAttn,
+ UpBlock2D,
+ get_down_block,
+ get_up_block,
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -45,8 +55,7 @@ class UNet2DConditionOutput(BaseOutput):
sample: paddle.Tensor
-class UNet2DConditionModel(ModelMixin, ConfigMixin,
- UNet2DConditionLoadersMixin):
+class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
r"""
UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
and returns sample shaped output.
@@ -126,57 +135,60 @@ class conditioning with `class_embed_type` equal to `None`.
@register_to_config
def __init__(
- self,
- sample_size: Optional[int]=None,
- in_channels: int=4,
- out_channels: int=4,
- center_input_sample: bool=False,
- flip_sin_to_cos: bool=True,
- freq_shift: int=0,
- down_block_types: Tuple[str]=(
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "DownBlock2D", ),
- mid_block_type: Optional[str]="UNetMidBlock2DCrossAttn",
- up_block_types: Tuple[str]=(
- "UpBlock2D",
- "CrossAttnUpBlock2D",
- "CrossAttnUpBlock2D",
- "CrossAttnUpBlock2D", ),
- only_cross_attention: Union[bool, Tuple[bool]]=False,
- block_out_channels: Tuple[int]=(320, 640, 1280, 1280),
- layers_per_block: Union[int, Tuple[int]]=2,
- downsample_padding: int=1,
- mid_block_scale_factor: float=1,
- act_fn: str="silu",
- norm_num_groups: Optional[int]=32,
- norm_eps: float=1e-5,
- cross_attention_dim: Union[int, Tuple[int]]=1280,
- encoder_hid_dim: Optional[int]=None,
- attention_head_dim: Union[int, Tuple[int]]=8,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- class_embed_type: Optional[str]=None,
- addition_embed_type: Optional[str]=None,
- num_class_embeds: Optional[int]=None,
- upcast_attention: bool=False,
- resnet_time_scale_shift: str="default",
- resnet_skip_time_act: bool=False,
- resnet_out_scale_factor: int=1.0,
- time_embedding_type: str="positional", # fourier, positional
- time_embedding_dim: Optional[int]=None,
- time_embedding_act_fn: Optional[str]=None,
- timestep_post_act: Optional[str]=None,
- time_cond_proj_dim: Optional[int]=None,
- conv_in_kernel: int=3,
- conv_out_kernel: int=3,
- projection_class_embeddings_input_dim: Optional[int]=None,
- class_embeddings_concat: bool=False,
- mid_block_only_cross_attention: Optional[bool]=None,
- cross_attention_norm: Optional[str]=None,
- resnet_pre_temb_non_linearity: Optional[bool]=False,
- addition_embed_type_num_heads: int=64, ):
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 4,
+ out_channels: int = 4,
+ center_input_sample: bool = False,
+ flip_sin_to_cos: bool = True,
+ freq_shift: int = 0,
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "DownBlock2D",
+ ),
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+ up_block_types: Tuple[str] = (
+ "UpBlock2D",
+ "CrossAttnUpBlock2D",
+ "CrossAttnUpBlock2D",
+ "CrossAttnUpBlock2D",
+ ),
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: Union[int, Tuple[int]] = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: Optional[int] = 32,
+ norm_eps: float = 1e-5,
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
+ encoder_hid_dim: Optional[int] = None,
+ attention_head_dim: Union[int, Tuple[int]] = 8,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ class_embed_type: Optional[str] = None,
+ addition_embed_type: Optional[str] = None,
+ num_class_embeds: Optional[int] = None,
+ upcast_attention: bool = False,
+ resnet_time_scale_shift: str = "default",
+ resnet_skip_time_act: bool = False,
+ resnet_out_scale_factor: int = 1.0,
+ time_embedding_type: str = "positional", # fourier, positional
+ time_embedding_dim: Optional[int] = None,
+ time_embedding_act_fn: Optional[str] = None,
+ timestep_post_act: Optional[str] = None,
+ time_cond_proj_dim: Optional[int] = None,
+ conv_in_kernel: int = 3,
+ conv_out_kernel: int = 3,
+ projection_class_embeddings_input_dim: Optional[int] = None,
+ class_embeddings_concat: bool = False,
+ mid_block_only_cross_attention: Optional[bool] = None,
+ cross_attention_norm: Optional[str] = None,
+ resnet_pre_temb_non_linearity: Optional[bool] = False,
+ addition_embed_type_num_heads: int = 64,
+ ):
super().__init__()
self.sample_size = sample_size
@@ -192,30 +204,22 @@ def __init__(
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- only_cross_attention,
- bool) and len(only_cross_attention) != len(down_block_types):
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- attention_head_dim,
- int) and len(attention_head_dim) != len(down_block_types):
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
)
- if isinstance(
- cross_attention_dim,
- list) and len(cross_attention_dim) != len(down_block_types):
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- layers_per_block,
- int) and len(layers_per_block) != len(down_block_types):
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
)
@@ -226,26 +230,25 @@ def __init__(
in_channels,
block_out_channels[0],
kernel_size=conv_in_kernel,
- padding=conv_in_padding, )
+ padding=conv_in_padding,
+ )
# time
if time_embedding_type == "fourier":
time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
if time_embed_dim % 2 != 0:
- raise ValueError(
- f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}."
- )
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
self.time_proj = GaussianFourierProjection(
time_embed_dim // 2,
set_W_to_weight=False,
log=False,
- flip_sin_to_cos=flip_sin_to_cos, )
+ flip_sin_to_cos=flip_sin_to_cos,
+ )
timestep_input_dim = time_embed_dim
elif time_embedding_type == "positional":
time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos,
- freq_shift)
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
timestep_input_dim = block_out_channels[0]
else:
raise ValueError(
@@ -257,21 +260,19 @@ def __init__(
time_embed_dim,
act_fn=act_fn,
post_act_fn=timestep_post_act,
- cond_proj_dim=time_cond_proj_dim, )
+ cond_proj_dim=time_cond_proj_dim,
+ )
if encoder_hid_dim is not None:
- self.encoder_hid_proj = nn.Linear(encoder_hid_dim,
- cross_attention_dim)
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
else:
self.encoder_hid_proj = None
# class embedding
if class_embed_type is None and num_class_embeds is not None:
- self.class_embedding = nn.Embedding(num_class_embeds,
- time_embed_dim) # int64
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) # int64
elif class_embed_type == "timestep":
- self.class_embedding = TimestepEmbedding(
- timestep_input_dim, time_embed_dim, act_fn=act_fn) # float
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) # float
elif class_embed_type == "identity":
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
elif class_embed_type == "projection":
@@ -286,15 +287,13 @@ def __init__(
# Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
# When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
# As a result, `TimestepEmbedding` can be passed arbitrary vectors.
- self.class_embedding = TimestepEmbedding(
- projection_class_embeddings_input_dim, time_embed_dim) # float
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) # float
elif class_embed_type == "simple_projection":
if projection_class_embeddings_input_dim is None:
raise ValueError(
"`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
)
- self.class_embedding = nn.Linear(
- projection_class_embeddings_input_dim, time_embed_dim)
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
else:
self.class_embedding = None
@@ -307,11 +306,10 @@ def __init__(
self.add_embedding = TextTimeEmbedding(
text_time_embedding_from_dim,
time_embed_dim,
- num_heads=addition_embed_type_num_heads, )
- elif addition_embed_type is not None:
- raise ValueError(
- f"addition_embed_type: {addition_embed_type} must be None or 'text'."
+ num_heads=addition_embed_type_num_heads,
)
+ elif addition_embed_type is not None:
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.")
if time_embedding_act_fn is None:
self.time_embed_act = None
@@ -324,8 +322,7 @@ def __init__(
elif time_embedding_act_fn == "gelu":
self.time_embed_act = nn.GELU()
else:
- raise ValueError(
- f"Unsupported activation function: {time_embedding_act_fn}")
+ raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}")
self.down_blocks = nn.LayerList([])
self.up_blocks = nn.LayerList([])
@@ -333,18 +330,16 @@ def __init__(
if isinstance(only_cross_attention, bool):
if mid_block_only_cross_attention is None:
mid_block_only_cross_attention = only_cross_attention
- only_cross_attention = [only_cross_attention] * len(
- down_block_types)
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
if mid_block_only_cross_attention is None:
mid_block_only_cross_attention = False
if isinstance(attention_head_dim, int):
- attention_head_dim = (attention_head_dim, ) * len(down_block_types)
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
if isinstance(cross_attention_dim, int):
- cross_attention_dim = (
- cross_attention_dim, ) * len(down_block_types)
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
if isinstance(layers_per_block, int):
layers_per_block = [layers_per_block] * len(down_block_types)
@@ -397,7 +392,8 @@ def __init__(
resnet_skip_time_act=resnet_skip_time_act,
resnet_out_scale_factor=resnet_out_scale_factor,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
self.down_blocks.append(down_block)
# mid
@@ -415,7 +411,8 @@ def __init__(
dual_cross_attention=dual_cross_attention,
use_linear_projection=use_linear_projection,
upcast_attention=upcast_attention,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
self.mid_block = UNetMidBlock2DSimpleCrossAttn(
in_channels=block_out_channels[-1],
@@ -430,7 +427,8 @@ def __init__(
skip_time_act=resnet_skip_time_act,
only_cross_attention=mid_block_only_cross_attention,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif mid_block_type is None:
self.mid_block = None
else:
@@ -452,8 +450,7 @@ def __init__(
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
- input_channel = reversed_block_out_channels[min(
- i + 1, len(block_out_channels) - 1)]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
# add upsample block for all BUT final layer
if not is_final_block:
@@ -483,7 +480,8 @@ def __init__(
resnet_skip_time_act=resnet_skip_time_act,
resnet_out_scale_factor=resnet_out_scale_factor,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
self.up_blocks.append(up_block)
prev_output_channel = output_channel
@@ -492,7 +490,8 @@ def __init__(
self.conv_norm_out = nn.GroupNorm(
num_channels=block_out_channels[0],
num_groups=norm_num_groups,
- epsilon=norm_eps, )
+ epsilon=norm_eps,
+ )
if act_fn == "swish":
self.conv_act = lambda x: F.silu(x)
elif act_fn == "mish":
@@ -512,7 +511,8 @@ def __init__(
block_out_channels[0],
out_channels,
kernel_size=conv_out_kernel,
- padding=conv_out_padding, )
+ padding=conv_out_padding,
+        )

    @property
def attn_processors(self) -> Dict[str, AttentionProcessor]:
@@ -524,16 +524,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
# set recursively
processors = {}
- def fn_recursive_add_processors(
- name: str,
- module: nn.Layer,
- processors: Dict[str, AttentionProcessor]):
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
if hasattr(module, "set_processor"):
processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
- fn_recursive_add_processors(f"{name}.{sub_name}", child,
- processors)
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
@@ -542,9 +538,7 @@ def fn_recursive_add_processors(
return processors
- def set_attn_processor(self,
- processor: Union[AttentionProcessor, Dict[
- str, AttentionProcessor]]):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Parameters:
`processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
@@ -569,8 +563,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
- fn_recursive_attn_processor(f"{name}.{sub_name}", child,
- processor)
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
@@ -618,8 +611,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
# make smallest slice possible
slice_size = num_sliceable_layers * [1]
- slice_size = (num_sliceable_layers * [slice_size]
- if not isinstance(slice_size, list) else slice_size)
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
if len(slice_size) != len(sliceable_head_dims):
raise ValueError(
@@ -631,14 +623,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
size = slice_size[i]
dim = sliceable_head_dims[i]
if size is not None and size > dim:
- raise ValueError(
- f"size {size} has to be smaller or equal to {dim}.")
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
# Recursively walk through all the children.
# Any children which exposes the set_attention_slice method
# gets the message
- def fn_recursive_set_attention_slice(module: nn.Layer,
- slice_size: List[int]):
+ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
if hasattr(module, "set_attention_slice"):
module.set_attention_slice(slice_size.pop())
@@ -650,24 +640,22 @@ def fn_recursive_set_attention_slice(module: nn.Layer,
        fn_recursive_set_attention_slice(module, reversed_slice_size)

    def _set_gradient_checkpointing(self, module, value=False):
- if isinstance(
- module,
- (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
            module.gradient_checkpointing = value

    def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- class_labels: Optional[paddle.Tensor]=None,
- timestep_cond: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- down_block_additional_residuals: Optional[Tuple[
- paddle.Tensor]]=None,
- mid_block_additional_residual: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ) -> Union[UNet2DConditionOutput, Tuple]:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[UNet2DConditionOutput, Tuple]:
r"""
Args:
sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor
@@ -699,8 +687,7 @@ def forward(
upsample_size = None
if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
- logger.info(
- "Forward upsample size to force interpolation output size.")
+ logger.info("Forward upsample size to force interpolation output size.")
forward_upsample_size = True
# prepare attention_mask
@@ -720,7 +707,11 @@ def forward(
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps.expand([sample.shape[0], ])
+ timesteps = timesteps.expand(
+ [
+ sample.shape[0],
+ ]
+ )
t_emb = self.time_proj(timesteps)
# `Timesteps` does not contain any weights and will always return f32 tensors
@@ -732,8 +723,7 @@ def forward(
if self.class_embedding is not None:
if class_labels is None:
- raise ValueError(
- "class_labels should be provided when num_class_embeds > 0")
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
# maybe cast it to float16
class_labels = class_labels.cast(self.dtype)
@@ -771,21 +761,16 @@ def forward(
# 3. down
- is_controlnet = (mid_block_additional_residual is not None and
- down_block_additional_residuals is not None)
- is_adapter = (mid_block_additional_residual is None and
- down_block_additional_residuals is not None)
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+ is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
- down_block_res_samples = (sample, )
+ down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
- if (hasattr(downsample_block, "has_cross_attention") and
- downsample_block.has_cross_attention):
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
additional_kwargs = {}
if is_adapter and len(down_block_additional_residuals) > 0:
- additional_kwargs[
- "additional_residuals"] = down_block_additional_residuals.pop(
- 0)
+ additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0)
sample, res_samples = downsample_block(
hidden_states=sample,
@@ -793,25 +778,25 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
- **additional_kwargs, )
+ **additional_kwargs,
+ )
else:
- sample, res_samples = downsample_block(
- hidden_states=sample, temb=emb)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
if is_adapter and len(down_block_additional_residuals) > 0:
sample += down_block_additional_residuals.pop(0)
# westfish: add to align with torch features
- res_samples = tuple(res_samples[:-1]) + (sample, )
+ res_samples = tuple(res_samples[:-1]) + (sample,)
down_block_res_samples += res_samples
if is_controlnet:
new_down_block_res_samples = ()
for down_block_res_sample, down_block_additional_residual in zip(
- down_block_res_samples, down_block_additional_residuals):
- down_block_res_sample = (
- down_block_res_sample + down_block_additional_residual)
- new_down_block_res_samples += (down_block_res_sample, )
+ down_block_res_samples, down_block_additional_residuals
+ ):
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
+ new_down_block_res_samples += (down_block_res_sample,)
down_block_res_samples = new_down_block_res_samples
# 4. mid
@@ -821,7 +806,8 @@ def forward(
emb,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
if is_controlnet:
sample = sample + mid_block_additional_residual
@@ -830,17 +816,15 @@ def forward(
for i, upsample_block in enumerate(self.up_blocks):
is_final_block = i == len(self.up_blocks) - 1
- res_samples = down_block_res_samples[-len(upsample_block.resnets):]
- down_block_res_samples = down_block_res_samples[:-len(
- upsample_block.resnets)]
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
# if we have not reached the final block and need to forward the
# upsample size, we do it here
if not is_final_block and forward_upsample_size:
upsample_size = down_block_res_samples[-1].shape[2:]
- if (hasattr(upsample_block, "has_cross_attention") and
- upsample_block.has_cross_attention):
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
sample = upsample_block(
hidden_states=sample,
temb=emb,
@@ -848,13 +832,15 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
upsample_size=upsample_size,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
else:
sample = upsample_block(
hidden_states=sample,
temb=emb,
res_hidden_states_tuple=res_samples,
- upsample_size=upsample_size, )
+ upsample_size=upsample_size,
+ )
# 6. post-process
if self.conv_norm_out:
@@ -863,6 +849,6 @@ def forward(
sample = self.conv_out(sample)
if not return_dict:
- return (sample, )
+ return (sample,)
return UNet2DConditionOutput(sample=sample)
diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
index f3feb516342c7..5e55038b49714 100644
--- a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
@@ -22,23 +22,24 @@
def get_down_block(
- down_block_type,
- num_layers,
- in_channels,
- out_channels,
- temb_channels,
- add_downsample,
- resnet_eps,
- resnet_act_fn,
- attn_num_head_channels,
- resnet_groups=None,
- cross_attention_dim=None,
- downsample_padding=None,
- dual_cross_attention=False,
- use_linear_projection=True,
- only_cross_attention=False,
- upcast_attention=False,
- resnet_time_scale_shift="default", ):
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ downsample_padding=None,
+ dual_cross_attention=False,
+ use_linear_projection=True,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+):
if down_block_type == "DownBlock3D":
return DownBlock3D(
num_layers=num_layers,
@@ -50,11 +51,11 @@ def get_down_block(
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
downsample_padding=downsample_padding,
- resnet_time_scale_shift=resnet_time_scale_shift, )
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
elif down_block_type == "CrossAttnDownBlock3D":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for CrossAttnDownBlock3D")
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
return CrossAttnDownBlock3D(
num_layers=num_layers,
in_channels=in_channels,
@@ -71,28 +72,30 @@ def get_down_block(
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
- resnet_time_scale_shift=resnet_time_scale_shift, )
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
raise ValueError(f"{down_block_type} does not exist.")
def get_up_block(
- up_block_type,
- num_layers,
- in_channels,
- out_channels,
- prev_output_channel,
- temb_channels,
- add_upsample,
- resnet_eps,
- resnet_act_fn,
- attn_num_head_channels,
- resnet_groups=None,
- cross_attention_dim=None,
- dual_cross_attention=False,
- use_linear_projection=True,
- only_cross_attention=False,
- upcast_attention=False,
- resnet_time_scale_shift="default", ):
+ up_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ prev_output_channel,
+ temb_channels,
+ add_upsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ dual_cross_attention=False,
+ use_linear_projection=True,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+):
if up_block_type == "UpBlock3D":
return UpBlock3D(
num_layers=num_layers,
@@ -104,11 +107,11 @@ def get_up_block(
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
- resnet_time_scale_shift=resnet_time_scale_shift, )
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
elif up_block_type == "CrossAttnUpBlock3D":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for CrossAttnUpBlock3D")
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
return CrossAttnUpBlock3D(
num_layers=num_layers,
in_channels=in_channels,
@@ -125,33 +128,34 @@ def get_up_block(
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
- resnet_time_scale_shift=resnet_time_scale_shift, )
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
raise ValueError(f"{up_block_type} does not exist.")
class UNetMidBlock3DCrossAttn(nn.Layer):
def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-06,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels=1,
- output_scale_factor=1.0,
- cross_attention_dim=1280,
- dual_cross_attention=False,
- use_linear_projection=True,
- upcast_attention=False, ):
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ output_scale_factor=1.0,
+ cross_attention_dim=1280,
+ dual_cross_attention=False,
+ use_linear_projection=True,
+ upcast_attention=False,
+ ):
super().__init__()
self.has_cross_attention = True
self.attn_num_head_channels = attn_num_head_channels
- resnet_groups = (resnet_groups if resnet_groups is not None else
- min(in_channels // 4, 32))
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
# there is always at least one resnet
resnets = [
ResnetBlock2D(
@@ -164,13 +168,15 @@ def __init__(
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm, )
+ pre_norm=resnet_pre_norm,
+ )
]
temp_convs = [
TemporalConvLayer(
in_channels,
in_channels,
- dropout=0.1, )
+ dropout=0.1,
+ )
]
attentions = []
temp_attentions = []
@@ -184,7 +190,9 @@ def __init__(
cross_attention_dim=cross_attention_dim,
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
temp_attentions.append(
TransformerTemporalModel(
in_channels // attn_num_head_channels,
@@ -192,7 +200,9 @@ def __init__(
in_channels=in_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
resnets.append(
ResnetBlock2D(
in_channels=in_channels,
@@ -204,38 +214,45 @@ def __init__(
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm, ))
+ pre_norm=resnet_pre_norm,
+ )
+ )
temp_convs.append(
TemporalConvLayer(
in_channels,
in_channels,
- dropout=0.1, ))
+ dropout=0.1,
+ )
+ )
self.resnets = nn.LayerList(resnets)
self.temp_convs = nn.LayerList(temp_convs)
self.attentions = nn.LayerList(attentions)
self.temp_attentions = nn.LayerList(temp_attentions)
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- num_frames=1,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ num_frames=1,
+ cross_attention_kwargs=None,
+ ):
hidden_states = self.resnets[0](hidden_states, temb)
hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames)
for attn, temp_attn, resnet, temp_conv in zip(
- self.attentions, self.temp_attentions, self.resnets[1:],
- self.temp_convs[1:]):
+ self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:]
+ ):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
hidden_states = temp_attn(
hidden_states,
num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
return hidden_states
@@ -243,26 +260,27 @@ def forward(
class CrossAttnDownBlock3D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-06,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels=1,
- cross_attention_dim=1280,
- output_scale_factor=1.0,
- downsample_padding=1,
- add_downsample=True,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ cross_attention_dim=1280,
+ output_scale_factor=1.0,
+ downsample_padding=1,
+ add_downsample=True,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -283,12 +301,16 @@ def __init__(
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm, ))
+ pre_norm=resnet_pre_norm,
+ )
+ )
temp_convs.append(
TemporalConvLayer(
out_channels,
out_channels,
- dropout=0.1, ))
+ dropout=0.1,
+ )
+ )
attentions.append(
Transformer2DModel(
out_channels // attn_num_head_channels,
@@ -299,7 +321,9 @@ def __init__(
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
temp_attentions.append(
TransformerTemporalModel(
out_channels // attn_num_head_channels,
@@ -307,70 +331,79 @@ def __init__(
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.resnets = nn.LayerList(resnets)
self.temp_convs = nn.LayerList(temp_convs)
self.attentions = nn.LayerList(attentions)
self.temp_attentions = nn.LayerList(temp_attentions)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
            self.downsamplers = None

        self.gradient_checkpointing = False

    def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- num_frames=1,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ num_frames=1,
+ cross_attention_kwargs=None,
+ ):
output_states = ()
for resnet, temp_conv, attn, temp_attn in zip(
- self.resnets, self.temp_convs, self.attentions,
- self.temp_attentions):
+ self.resnets, self.temp_convs, self.attentions, self.temp_attentions
+ ):
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
hidden_states = temp_attn(
hidden_states,
num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
- output_states += (hidden_states, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
class DownBlock3D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-06,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor=1.0,
- add_downsample=True,
- downsample_padding=1, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_downsample=True,
+ downsample_padding=1,
+ ):
super().__init__()
resnets = []
temp_convs = []
@@ -387,23 +420,30 @@ def __init__(
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm, ))
+ pre_norm=resnet_pre_norm,
+ )
+ )
temp_convs.append(
TemporalConvLayer(
out_channels,
out_channels,
- dropout=0.1, ))
+ dropout=0.1,
+ )
+ )
self.resnets = nn.LayerList(resnets)
self.temp_convs = nn.LayerList(temp_convs)
if add_downsample:
- self.downsamplers = nn.LayerList([
- Downsample2D(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ Downsample2D(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
self.gradient_checkpointing = False
@@ -413,36 +453,37 @@ def forward(self, hidden_states, temb=None, num_frames=1):
for resnet, temp_conv in zip(self.resnets, self.temp_convs):
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
class CrossAttnUpBlock3D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-06,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels=1,
- cross_attention_dim=1280,
- output_scale_factor=1.0,
- add_upsample=True,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ prev_output_channel: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ cross_attention_dim=1280,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ ):
super().__init__()
resnets = []
temp_convs = []
@@ -451,8 +492,7 @@ def __init__(
self.has_cross_attention = True
self.attn_num_head_channels = attn_num_head_channels
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
ResnetBlock2D(
@@ -465,12 +505,16 @@ def __init__(
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm, ))
+ pre_norm=resnet_pre_norm,
+ )
+ )
temp_convs.append(
TemporalConvLayer(
out_channels,
out_channels,
- dropout=0.1, ))
+ dropout=0.1,
+ )
+ )
attentions.append(
Transformer2DModel(
out_channels // attn_num_head_channels,
@@ -481,7 +525,9 @@ def __init__(
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
temp_attentions.append(
TransformerTemporalModel(
out_channels // attn_num_head_channels,
@@ -489,48 +535,51 @@ def __init__(
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.resnets = nn.LayerList(sublayers=resnets)
self.temp_convs = nn.LayerList(sublayers=temp_convs)
self.attentions = nn.LayerList(sublayers=attentions)
self.temp_attentions = nn.LayerList(sublayers=temp_attentions)
if add_upsample:
- self.upsamplers = nn.LayerList(sublayers=[
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList(
+ sublayers=[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+ )
else:
            self.upsamplers = None

        self.gradient_checkpointing = False

    def forward(
- self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- upsample_size=None,
- attention_mask=None,
- num_frames=1,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ upsample_size=None,
+ attention_mask=None,
+ num_frames=1,
+ cross_attention_kwargs=None,
+ ):
for resnet, temp_conv, attn, temp_attn in zip(
- self.resnets, self.temp_convs, self.attentions,
- self.temp_attentions):
+ self.resnets, self.temp_convs, self.attentions, self.temp_attentions
+ ):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
hidden_states = temp_attn(
hidden_states,
num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
if self.upsamplers is not None:
for upsampler in self.upsamplers:
hidden_states = upsampler(hidden_states, upsample_size)
@@ -539,26 +588,26 @@ def forward(
class UpBlock3D(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-06,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor=1.0,
- add_upsample=True, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
super().__init__()
resnets = []
temp_convs = []
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
ResnetBlock2D(
@@ -571,36 +620,37 @@ def __init__(
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm, ))
+ pre_norm=resnet_pre_norm,
+ )
+ )
temp_convs.append(
TemporalConvLayer(
out_channels,
out_channels,
- dropout=0.1, ))
+ dropout=0.1,
+ )
+ )
self.resnets = nn.LayerList(resnets)
self.temp_convs = nn.LayerList(temp_convs)
if add_upsample:
- self.upsamplers = nn.LayerList([
- Upsample2D(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
            self.upsamplers = None

        self.gradient_checkpointing = False

    def forward(
- self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- upsample_size=None,
- num_frames=1, ):
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ upsample_size=None,
+ num_frames=1,
+ ):
for resnet, temp_conv in zip(self.resnets, self.temp_convs):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- x=[hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
if self.upsamplers is not None:
diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
index fb8ae5756d4c3..038e8c6d514a7 100644
--- a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
@@ -26,9 +26,15 @@
from .embeddings import TimestepEmbedding, Timesteps
from .modeling_utils import ModelMixin
from .transformer_temporal import TransformerTemporalModel
-from .unet_3d_blocks import (CrossAttnDownBlock3D, CrossAttnUpBlock3D,
- DownBlock3D, UNetMidBlock3DCrossAttn, UpBlock3D,
- get_down_block, get_up_block)
+from .unet_3d_blocks import (
+ CrossAttnDownBlock3D,
+ CrossAttnUpBlock3D,
+ DownBlock3D,
+ UNetMidBlock3DCrossAttn,
+ UpBlock3D,
+ get_down_block,
+ get_up_block,
+)
logger = logging.get_logger(__name__)
@@ -44,8 +50,7 @@ class UNet3DConditionOutput(BaseOutput):
sample: paddle.Tensor
-class UNet3DConditionModel(ModelMixin, ConfigMixin,
- UNet2DConditionLoadersMixin):
+class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
r"""
UNet3DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
and returns sample shaped output.
@@ -79,29 +84,32 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin,
@register_to_config
def __init__(
- self,
- sample_size: Optional[int]=None,
- in_channels: int=4,
- out_channels: int=4,
- down_block_types: Tuple[str]=(
- "CrossAttnDownBlock3D",
- "CrossAttnDownBlock3D",
- "CrossAttnDownBlock3D",
- "DownBlock3D", ),
- up_block_types: Tuple[str]=(
- "UpBlock3D",
- "CrossAttnUpBlock3D",
- "CrossAttnUpBlock3D",
- "CrossAttnUpBlock3D", ),
- block_out_channels: Tuple[int]=(320, 640, 1280, 1280),
- layers_per_block: int=2,
- downsample_padding: int=1,
- mid_block_scale_factor: float=1,
- act_fn: str="silu",
- norm_num_groups: Optional[int]=32,
- norm_eps: float=1e-05,
- cross_attention_dim: int=1024,
- attention_head_dim: Union[int, Tuple[int]]=64, ):
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 4,
+ out_channels: int = 4,
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlock3D",
+ "CrossAttnDownBlock3D",
+ "CrossAttnDownBlock3D",
+ "DownBlock3D",
+ ),
+ up_block_types: Tuple[str] = (
+ "UpBlock3D",
+ "CrossAttnUpBlock3D",
+ "CrossAttnUpBlock3D",
+ "CrossAttnUpBlock3D",
+ ),
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: int = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: Optional[int] = 32,
+ norm_eps: float = 1e-05,
+ cross_attention_dim: int = 1024,
+ attention_head_dim: Union[int, Tuple[int]] = 64,
+ ):
super().__init__()
self.sample_size = sample_size
# Check inputs
@@ -113,9 +121,7 @@ def __init__(
raise ValueError(
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- attention_head_dim,
- int) and len(attention_head_dim) != len(down_block_types):
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
)
@@ -126,7 +132,8 @@ def __init__(
in_channels=in_channels,
out_channels=block_out_channels[0],
kernel_size=conv_in_kernel,
- padding=conv_in_padding, )
+ padding=conv_in_padding,
+ )
# time
time_embed_dim = block_out_channels[0] * 4
self.time_proj = Timesteps(block_out_channels[0], True, 0)
@@ -134,17 +141,19 @@ def __init__(
self.time_embedding = TimestepEmbedding(
timestep_input_dim,
time_embed_dim,
- act_fn=act_fn, )
+ act_fn=act_fn,
+ )
self.transformer_in = TransformerTemporalModel(
num_attention_heads=8,
attention_head_dim=attention_head_dim,
in_channels=block_out_channels[0],
- num_layers=1, )
+ num_layers=1,
+ )
# class embedding
self.down_blocks = nn.LayerList(sublayers=[])
self.up_blocks = nn.LayerList(sublayers=[])
if isinstance(attention_head_dim, int):
- attention_head_dim = (attention_head_dim, ) * len(down_block_types)
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
# down
output_channel = block_out_channels[0]
@@ -165,7 +174,8 @@ def __init__(
cross_attention_dim=cross_attention_dim,
attn_num_head_channels=attention_head_dim[i],
downsample_padding=downsample_padding,
- dual_cross_attention=False, )
+ dual_cross_attention=False,
+ )
self.down_blocks.append(down_block)
# mid
self.mid_block = UNetMidBlock3DCrossAttn(
@@ -177,7 +187,8 @@ def __init__(
cross_attention_dim=cross_attention_dim,
attn_num_head_channels=attention_head_dim[-1],
resnet_groups=norm_num_groups,
- dual_cross_attention=False, )
+ dual_cross_attention=False,
+ )
# count how many layers upsample the images
self.num_upsamplers = 0
# up
@@ -188,8 +199,7 @@ def __init__(
is_final_block = i == len(block_out_channels) - 1
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
- input_channel = reversed_block_out_channels[min(
- i + 1, len(block_out_channels) - 1)]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
# add upsample block for all BUT final layer
if not is_final_block:
add_upsample = True
@@ -209,14 +219,16 @@ def __init__(
resnet_groups=norm_num_groups,
cross_attention_dim=cross_attention_dim,
attn_num_head_channels=reversed_attention_head_dim[i],
- dual_cross_attention=False, )
+ dual_cross_attention=False,
+ )
self.up_blocks.append(up_block)
prev_output_channel = output_channel
if norm_num_groups is not None:
self.conv_norm_out = nn.GroupNorm(
num_channels=block_out_channels[0],
num_groups=norm_num_groups,
- epsilon=norm_eps, )
+ epsilon=norm_eps,
+ )
self.conv_act = nn.Silu()
else:
self.conv_norm_out = None
@@ -226,7 +238,8 @@ def __init__(
in_channels=block_out_channels[0],
out_channels=out_channels,
kernel_size=conv_out_kernel,
- padding=conv_out_padding, )
+ padding=conv_out_padding,
+ )
@property
# Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
@@ -239,16 +252,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
# set recursively
processors = {}
- def fn_recursive_add_processors(
- name: str,
- module: nn.Layer,
- processors: Dict[str, AttentionProcessor]):
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
if hasattr(module, "set_processor"):
processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
- fn_recursive_add_processors(f"{name}.{sub_name}", child,
- processors)
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
@@ -295,8 +304,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
# make smallest slice possible
slice_size = num_sliceable_layers * [1]
- slice_size = (num_sliceable_layers * [slice_size]
- if not isinstance(slice_size, list) else slice_size)
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
if len(slice_size) != len(sliceable_head_dims):
raise ValueError(
@@ -308,14 +316,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
size = slice_size[i]
dim = sliceable_head_dims[i]
if size is not None and size > dim:
- raise ValueError(
- f"size {size} has to be smaller or equal to {dim}.")
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
# Recursively walk through all the children.
# Any children which exposes the set_attention_slice method
# gets the message
- def fn_recursive_set_attention_slice(module: nn.Layer,
- slice_size: List[int]):
+ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
if hasattr(module, "set_attention_slice"):
module.set_attention_slice(slice_size.pop())
@@ -327,9 +333,7 @@ def fn_recursive_set_attention_slice(module: nn.Layer,
fn_recursive_set_attention_slice(module, reversed_slice_size)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- def set_attn_processor(self,
- processor: Union[AttentionProcessor, Dict[
- str, AttentionProcessor]]):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Parameters:
`processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
@@ -354,8 +358,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
- fn_recursive_attn_processor(f"{name}.{sub_name}", child,
- processor)
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
@@ -368,24 +371,22 @@ def set_default_attn_processor(self):
self.set_attn_processor(AttnProcessor())
def _set_gradient_checkpointing(self, module, value=False):
- if isinstance(
- module,
- (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
+ if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
module.gradient_checkpointing = value
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- class_labels: Optional[paddle.Tensor]=None,
- timestep_cond: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- down_block_additional_residuals: Optional[Tuple[
- paddle.Tensor]]=None,
- mid_block_additional_residual: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ) -> Union[UNet3DConditionOutput, Tuple]:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[UNet3DConditionOutput, Tuple]:
"""
Args:
sample (`paddle.Tensor`): (batch, num_frames, channel, height, width) noisy inputs tensor
@@ -417,8 +418,7 @@ def forward(
upsample_size = None
if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
- logger.info(
- "Forward upsample size to force interpolation output size.")
+ logger.info("Forward upsample size to force interpolation output size.")
forward_upsample_size = True
# prepare attention_mask
if attention_mask is not None:
@@ -436,7 +436,11 @@ def forward(
elif len(timesteps.shape) == 0:
timesteps = timesteps[None]
num_frames = sample.shape[2]
- timesteps = timesteps.expand([sample.shape[0], ])
+ timesteps = timesteps.expand(
+ [
+ sample.shape[0],
+ ]
+ )
t_emb = self.time_proj(timesteps)
# timesteps does not contain any weights and will always return f32 tensors
@@ -445,38 +449,36 @@ def forward(
t_emb = t_emb.cast(dtype=self.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
emb = emb.repeat_interleave(repeats=num_frames, axis=0)
- encoder_hidden_states = encoder_hidden_states.repeat_interleave(
- repeats=num_frames, axis=0)
- sample = sample.transpose([0, 2, 1, 3, 4]).reshape((sample.shape[
- 0] * num_frames, -1) + tuple(sample.shape[3:]))
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0)
+ sample = sample.transpose([0, 2, 1, 3, 4]).reshape(
+ (sample.shape[0] * num_frames, -1) + tuple(sample.shape[3:])
+ )
sample = self.conv_in(sample)
sample = self.transformer_in(
- sample,
- num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs).sample
+ sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
+ ).sample
# 3. down
- down_block_res_samples = (sample, )
+ down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
- if (hasattr(downsample_block, "has_cross_attention") and
- downsample_block.has_cross_attention):
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
sample, res_samples = downsample_block(
hidden_states=sample,
temb=emb,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
else:
- sample, res_samples = downsample_block(
- hidden_states=sample, temb=emb, num_frames=num_frames)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
down_block_res_samples += res_samples
if down_block_additional_residuals is not None:
new_down_block_res_samples = ()
for down_block_res_sample, down_block_additional_residual in zip(
- down_block_res_samples, down_block_additional_residuals):
- down_block_res_sample = (
- down_block_res_sample + down_block_additional_residual)
- new_down_block_res_samples += (down_block_res_sample, )
+ down_block_res_samples, down_block_additional_residuals
+ ):
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
+ new_down_block_res_samples += (down_block_res_sample,)
down_block_res_samples = new_down_block_res_samples
# 4. mid
if self.mid_block is not None:
@@ -486,21 +488,20 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
if mid_block_additional_residual is not None:
sample = sample + mid_block_additional_residual
# 5. up
for i, upsample_block in enumerate(self.up_blocks):
is_final_block = i == len(self.up_blocks) - 1
- res_samples = down_block_res_samples[-len(upsample_block.resnets):]
- down_block_res_samples = down_block_res_samples[:-len(
- upsample_block.resnets)]
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
# if we have not reached the final block and need to forward the
# upsample size, we do it here
if not is_final_block and forward_upsample_size:
upsample_size = down_block_res_samples[-1].shape[2:]
- if (hasattr(upsample_block, "has_cross_attention") and
- upsample_block.has_cross_attention):
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
sample = upsample_block(
hidden_states=sample,
temb=emb,
@@ -509,23 +510,23 @@ def forward(
upsample_size=upsample_size,
attention_mask=attention_mask,
num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
else:
sample = upsample_block(
hidden_states=sample,
temb=emb,
res_hidden_states_tuple=res_samples,
upsample_size=upsample_size,
- num_frames=num_frames, )
+ num_frames=num_frames,
+ )
# 6. post-process
if self.conv_norm_out:
sample = self.conv_norm_out(sample)
sample = self.conv_act(sample)
sample = self.conv_out(sample)
# reshape to (batch, channel, framerate, width, height)
- sample = (sample[None, :]
- .reshape((-1, num_frames) + tuple(sample.shape[1:]))
- .transpose([0, 2, 1, 3, 4]))
+ sample = sample[None, :].reshape((-1, num_frames) + tuple(sample.shape[1:])).transpose([0, 2, 1, 3, 4])
if not return_dict:
- return (sample, )
+ return (sample,)
return UNet3DConditionOutput(sample=sample)
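The frame folding performed by the reformatted forward pass above is easy to lose in the diff noise. A minimal standalone sketch of the same shape manipulation (the sizes are illustrative, not taken from any model config):

import paddle

batch, channels, num_frames, height, width = 1, 4, 8, 32, 32
sample = paddle.randn([batch, channels, num_frames, height, width])

# (B, C, F, H, W) -> (B, F, C, H, W) -> (B*F, C, H, W): frames are folded into the
# batch dimension so the 2D spatial blocks can run on every frame independently.
folded = sample.transpose([0, 2, 1, 3, 4]).reshape(
    (batch * num_frames, -1) + tuple(sample.shape[3:])
)
assert tuple(folded.shape) == (batch * num_frames, channels, height, width)

# The post-processing step unfolds the result back to (B, C, F, H, W).
unfolded = (
    folded[None, :]
    .reshape((-1, num_frames) + tuple(folded.shape[1:]))
    .transpose([0, 2, 1, 3, 4])
)
assert tuple(unfolded.shape) == (batch, channels, num_frames, height, width)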
diff --git a/ppdiffusers/ppdiffusers/models/uvit.py b/ppdiffusers/ppdiffusers/models/uvit.py
index f2140122e269f..eb7267d41d2a2 100644
--- a/ppdiffusers/ppdiffusers/models/uvit.py
+++ b/ppdiffusers/ppdiffusers/models/uvit.py
@@ -27,21 +27,15 @@
def unpatchify(x, in_chans):
- patch_size = int((x.shape[2] // in_chans)**0.5)
- h = w = int(x.shape[1]**0.5)
- assert h * w == x.shape[1] and patch_size**2 * in_chans == x.shape[2]
- x = einops.rearrange(
- x,
- "B (h w) (p1 p2 C) -> B C (h p1) (w p2)",
- h=h,
- p1=patch_size,
- p2=patch_size)
+ patch_size = int((x.shape[2] // in_chans) ** 0.5)
+ h = w = int(x.shape[1] ** 0.5)
+ assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2]
+ x = einops.rearrange(x, "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", h=h, p1=patch_size, p2=patch_size)
return x
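A quick self-contained check of the contract unpatchify implements, with made-up sizes (a sketch, not a test from this PR):

import einops
import paddle

B, in_chans, patch_size, grid = 2, 4, 2, 8                       # an 8x8 grid of 2x2 patches
x = paddle.randn([B, grid * grid, patch_size**2 * in_chans])     # (2, 64, 16)

img = einops.rearrange(
    x, "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", h=grid, p1=patch_size, p2=patch_size
)
assert tuple(img.shape) == (B, in_chans, grid * patch_size, grid * patch_size)  # (2, 4, 16, 16)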
def interpolate_pos_emb(pos_emb, old_shape, new_shape):
- pos_emb = einops.rearrange(
- pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1])
+ pos_emb = einops.rearrange(pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1])
pos_emb = F.interpolate(pos_emb, new_shape, mode="bilinear")
pos_emb = einops.rearrange(pos_emb, "B C H W -> B (H W) C")
return pos_emb
@@ -49,13 +43,14 @@ def interpolate_pos_emb(pos_emb, old_shape, new_shape):
class Attention(nn.Layer):
def __init__(
- self,
- dim,
- num_heads=8,
- qkv_bias=False,
- qk_scale=None,
- attn_drop=0.0,
- proj_drop=0.0, ):
+ self,
+ dim,
+ num_heads=8,
+ qkv_bias=False,
+ qk_scale=None,
+ attn_drop=0.0,
+ proj_drop=0.0,
+ ):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
@@ -82,9 +77,10 @@ def reshape_batch_dim_to_heads(self, tensor, transpose=True):
return tensor
def set_use_memory_efficient_attention_xformers(
- self,
- use_memory_efficient_attention_xformers: bool,
- attention_op: Optional[str]=None, ):
+ self,
+ use_memory_efficient_attention_xformers: bool,
+ attention_op: Optional[str] = None,
+ ):
# remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045
# if self.head_size > 128 and attention_op == "flash":
# attention_op = "cutlass"
@@ -96,18 +92,15 @@ def set_use_memory_efficient_attention_xformers(
else:
try:
_ = F.scaled_dot_product_attention_(
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- attention_op=attention_op, )
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ attention_op=attention_op,
+ )
except Exception as e:
raise e
- self._use_memory_efficient_attention_xformers = (
- use_memory_efficient_attention_xformers)
+ self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
self._attention_op = attention_op
def forward(self, x):
@@ -116,14 +109,14 @@ def forward(self, x):
qkv = qkv.cast(paddle.float32)
query_proj, key_proj, value_proj = qkv.chunk(3, axis=-1)
query_proj = self.reshape_heads_to_batch_dim(
- query_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ query_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
key_proj = self.reshape_heads_to_batch_dim(
- key_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ key_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
value_proj = self.reshape_heads_to_batch_dim(
- value_proj,
- transpose=not self._use_memory_efficient_attention_xformers)
+ value_proj, transpose=not self._use_memory_efficient_attention_xformers
+ )
if self._use_memory_efficient_attention_xformers:
hidden_states = F.scaled_dot_product_attention_(
@@ -134,18 +127,17 @@ def forward(self, x):
scale=self.scale,
dropout_p=self.attn_drop,
training=self.training,
- attention_op=self._attention_op, )
+ attention_op=self._attention_op,
+ )
else:
with paddle.amp.auto_cast(enable=False):
- attention_scores = paddle.matmul(
- query_proj * self.scale, key_proj, transpose_y=True)
+ attention_scores = paddle.matmul(query_proj * self.scale, key_proj, transpose_y=True)
attention_probs = F.softmax(attention_scores, axis=-1)
- hidden_states = paddle.matmul(attention_probs,
- value_proj).cast(x.dtype)
+ hidden_states = paddle.matmul(attention_probs, value_proj).cast(x.dtype)
hidden_states = self.reshape_batch_dim_to_heads(
- hidden_states,
- transpose=not self._use_memory_efficient_attention_xformers)
+ hidden_states, transpose=not self._use_memory_efficient_attention_xformers
+ )
hidden_states = self.proj_drop(self.proj(hidden_states))
return hidden_states
@@ -153,18 +145,19 @@ def forward(self, x):
class Block(nn.Layer):
def __init__(
- self,
- dim,
- num_heads,
- mlp_ratio=4.0,
- qkv_bias=False,
- qk_scale=None,
- drop=0.0,
- attn_drop=0.0,
- drop_path=0.0,
- act_layer=nn.GELU,
- norm_layer=nn.LayerNorm,
- skip=False, ):
+ self,
+ dim,
+ num_heads,
+ mlp_ratio=4.0,
+ qkv_bias=False,
+ qk_scale=None,
+ drop=0.0,
+ attn_drop=0.0,
+ drop_path=0.0,
+ act_layer=nn.GELU,
+ norm_layer=nn.LayerNorm,
+ skip=False,
+ ):
super().__init__()
self.norm1 = norm_layer(dim) if skip else None
self.norm2 = norm_layer(dim)
@@ -175,16 +168,17 @@ def __init__(
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
- proj_drop=drop, )
- self.drop_path = DropPath(
- drop_path) if drop_path > 0.0 else nn.Identity()
+ proj_drop=drop,
+ )
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm3 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
- drop=drop, )
+ drop=drop,
+ )
self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
def forward(self, x, skip=None):
@@ -223,44 +217,43 @@ class UViTModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- sample_size=1,
- img_size=64,
- in_channels=4,
- patch_size=2,
- embed_dim=1536,
- depth=30,
- num_heads=24,
- mlp_ratio=4.0,
- qkv_bias=False,
- qk_scale=None,
- pos_drop_rate=0.0,
- drop_rate=0.0,
- attn_drop_rate=0.0,
- norm_type="layer_norm",
- text_dim=64,
- num_text_tokens=77,
- clip_img_dim=512,
- use_checkpoint=False, ):
+ self,
+ sample_size=1,
+ img_size=64,
+ in_channels=4,
+ patch_size=2,
+ embed_dim=1536,
+ depth=30,
+ num_heads=24,
+ mlp_ratio=4.0,
+ qkv_bias=False,
+ qk_scale=None,
+ pos_drop_rate=0.0,
+ drop_rate=0.0,
+ attn_drop_rate=0.0,
+ norm_type="layer_norm",
+ text_dim=64,
+ num_text_tokens=77,
+ clip_img_dim=512,
+ use_checkpoint=False,
+ ):
super().__init__()
self.sample_size = sample_size
self.in_channels = in_channels
self.patch_size = patch_size
self.embed_dim = embed_dim
- self.img_size = (img_size, img_size) if isinstance(img_size,
- int) else img_size
+ self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size
self.patch_embed = PatchEmbed(
height=self.img_size[0],
width=self.img_size[1],
patch_size=patch_size,
in_channels=in_channels,
embed_dim=embed_dim,
- add_pos_embed=False, )
- assert self.img_size[0] % patch_size == 0 and self.img_size[
- 1] % patch_size == 0
- self.num_patches = (self.img_size[0] // patch_size) * (
- self.img_size[1] // patch_size)
+ add_pos_embed=False,
+ )
+ assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0
+ self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size)
self.encode_prefix = nn.Linear(768, text_dim)
@@ -274,22 +267,27 @@ def __init__(
self.pos_embed = self.create_parameter(
shape=(1, self.num_tokens, embed_dim),
- default_initializer=nn.initializer.Constant(0.0), )
+ default_initializer=nn.initializer.Constant(0.0),
+ )
assert norm_type == "layer_norm", "We only support norm_type == layer_norm. "
norm_layer = nn.LayerNorm
self.pos_drop = nn.Dropout(p=pos_drop_rate)
- self.in_blocks = nn.LayerList([
- Block(
- dim=embed_dim,
- num_heads=num_heads,
- mlp_ratio=mlp_ratio,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- drop=drop_rate,
- attn_drop=attn_drop_rate,
- norm_layer=norm_layer, ) for _ in range(depth // 2)
- ])
+ self.in_blocks = nn.LayerList(
+ [
+ Block(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ norm_layer=norm_layer,
+ )
+ for _ in range(depth // 2)
+ ]
+ )
self.mid_block = Block(
dim=embed_dim,
@@ -299,20 +297,25 @@ def __init__(
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
- norm_layer=norm_layer, )
-
- self.out_blocks = nn.LayerList([
- Block(
- dim=embed_dim,
- num_heads=num_heads,
- mlp_ratio=mlp_ratio,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- drop=drop_rate,
- attn_drop=attn_drop_rate,
- norm_layer=norm_layer,
- skip=True, ) for _ in range(depth // 2)
- ])
+ norm_layer=norm_layer,
+ )
+
+ self.out_blocks = nn.LayerList(
+ [
+ Block(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ norm_layer=norm_layer,
+ skip=True,
+ )
+ for _ in range(depth // 2)
+ ]
+ )
self.norm = norm_layer(embed_dim)
self.patch_dim = patch_size**2 * in_channels
@@ -320,18 +323,18 @@ def __init__(
self.token_embedding = nn.Embedding(2, embed_dim)
self.pos_embed_token = self.create_parameter(
- shape=(1, 1, embed_dim),
- default_initializer=nn.initializer.Constant(0.0))
+ shape=(1, 1, embed_dim), default_initializer=nn.initializer.Constant(0.0)
+ )
def forward(
- self,
- img: paddle.Tensor,
- clip_img: paddle.Tensor,
- text: paddle.Tensor,
- t_img: paddle.Tensor,
- t_text: paddle.Tensor,
- data_type: paddle.Tensor,
- return_dict=False, # TODO: nf
+ self,
+ img: paddle.Tensor,
+ clip_img: paddle.Tensor,
+ text: paddle.Tensor,
+ t_img: paddle.Tensor,
+ t_text: paddle.Tensor,
+ data_type: paddle.Tensor,
+ return_dict=False, # TODO: nf
):
_, _, H, W = img.shape
# TODO junnyu, support float16
@@ -343,10 +346,8 @@ def forward(
clip_img = self.clip_img_embed(clip_img)
text = self.text_embed(text)
- t_img_token = get_timestep_embedding(t_img, self.embed_dim, True,
- 0).unsqueeze(axis=1)
- t_text_token = get_timestep_embedding(t_text, self.embed_dim, True,
- 0).unsqueeze(axis=1)
+ t_img_token = get_timestep_embedding(t_img, self.embed_dim, True, 0).unsqueeze(axis=1)
+ t_text_token = get_timestep_embedding(t_text, self.embed_dim, True, 0).unsqueeze(axis=1)
token_embed = self.token_embedding(data_type).unsqueeze(axis=1)
# TODO junnyu, support float16
@@ -354,35 +355,35 @@ def forward(
t_text_token = t_text_token.cast(self.dtype)
token_embed = token_embed.cast(self.dtype)
- x = paddle.concat(
- (t_img_token, t_text_token, token_embed, text, clip_img, img),
- axis=1)
+ x = paddle.concat((t_img_token, t_text_token, token_embed, text, clip_img, img), axis=1)
num_text_tokens, num_img_tokens = text.shape[1], img.shape[1]
pos_embed = paddle.concat(
[
- self.pos_embed[:, :1 + 1, :],
+ self.pos_embed[:, : 1 + 1, :],
self.pos_embed_token,
- self.pos_embed[:, 1 + 1:, :],
+ self.pos_embed[:, 1 + 1 :, :],
],
- axis=1, )
+ axis=1,
+ )
if H == self.img_size[0] and W == self.img_size[1]:
pass
else:
# interpolate the positional embedding when the input image is not of the default shape
pos_embed_others, pos_embed_patches = paddle.split(
- pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches],
- axis=1)
+ pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], axis=1
+ )
pos_embed_patches = interpolate_pos_emb(
pos_embed_patches,
(
self.img_size[0] // self.patch_size,
- self.img_size[1] // self.patch_size, ),
- (H // self.patch_size, W // self.patch_size), )
- pos_embed = paddle.concat(
- (pos_embed_others, pos_embed_patches), axis=1)
+ self.img_size[1] // self.patch_size,
+ ),
+ (H // self.patch_size, W // self.patch_size),
+ )
+ pos_embed = paddle.concat((pos_embed_others, pos_embed_patches), axis=1)
x = x + pos_embed
x = self.pos_drop(x)
@@ -405,8 +406,8 @@ def forward(
token_embed_out,
text_out,
clip_img_out,
- img_out, ) = x.split(
- (1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1)
+ img_out,
+ ) = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1)
img_out = self.decoder_pred(img_out)
sample_img = unpatchify(img_out, self.in_channels)
@@ -419,4 +420,5 @@ def forward(
return UViTModelOutput(
sample_img=sample_img,
sample_clip_img=sample_clip_img,
- sample_text=sample_text, )
+ sample_text=sample_text,
+ )
diff --git a/ppdiffusers/ppdiffusers/models/vae.py b/ppdiffusers/ppdiffusers/models/vae.py
index f3b9a81b43a67..4b1fce10910a6 100644
--- a/ppdiffusers/ppdiffusers/models/vae.py
+++ b/ppdiffusers/ppdiffusers/models/vae.py
@@ -53,24 +53,20 @@ class DecoderOutput(BaseOutput):
class Encoder(nn.Layer):
def __init__(
- self,
- in_channels=3,
- out_channels=3,
- down_block_types=("DownEncoderBlock2D", ),
- block_out_channels=(64, ),
- layers_per_block=2,
- norm_num_groups=32,
- act_fn="silu",
- double_z=True, ):
+ self,
+ in_channels=3,
+ out_channels=3,
+ down_block_types=("DownEncoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ double_z=True,
+ ):
super().__init__()
self.layers_per_block = layers_per_block
- self.conv_in = nn.Conv2D(
- in_channels,
- block_out_channels[0],
- kernel_size=3,
- stride=1,
- padding=1)
+ self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
self.mid_block = None
self.down_blocks = nn.LayerList([])
@@ -93,7 +89,8 @@ def __init__(
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
attn_num_head_channels=None,
- temb_channels=None, )
+ temb_channels=None,
+ )
self.down_blocks.append(down_block)
# mid
@@ -105,18 +102,19 @@ def __init__(
resnet_time_scale_shift="default",
attn_num_head_channels=None,
resnet_groups=norm_num_groups,
- temb_channels=None, )
+ temb_channels=None,
+ )
# out
self.conv_norm_out = nn.GroupNorm(
num_channels=block_out_channels[-1],
num_groups=norm_num_groups,
- epsilon=1e-6, )
+ epsilon=1e-6,
+ )
self.conv_act = nn.Silu()
conv_out_channels = 2 * out_channels if double_z else out_channels
- self.conv_out = nn.Conv2D(
- block_out_channels[-1], conv_out_channels, 3, padding=1)
+ self.conv_out = nn.Conv2D(block_out_channels[-1], conv_out_channels, 3, padding=1)
self.gradient_checkpointing = False
def forward(self, x):
@@ -156,23 +154,19 @@ def custom_forward(*inputs):
class Decoder(nn.Layer):
def __init__(
- self,
- in_channels=3,
- out_channels=3,
- up_block_types=("UpDecoderBlock2D", ),
- block_out_channels=(64, ),
- layers_per_block=2,
- norm_num_groups=32,
- act_fn="silu", ):
+ self,
+ in_channels=3,
+ out_channels=3,
+ up_block_types=("UpDecoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ ):
super().__init__()
self.layers_per_block = layers_per_block
- self.conv_in = nn.Conv2D(
- in_channels,
- block_out_channels[-1],
- kernel_size=3,
- stride=1,
- padding=1)
+ self.conv_in = nn.Conv2D(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)
self.mid_block = None
self.up_blocks = nn.LayerList([])
@@ -186,7 +180,8 @@ def __init__(
resnet_time_scale_shift="default",
attn_num_head_channels=None,
resnet_groups=norm_num_groups,
- temb_channels=None, )
+ temb_channels=None,
+ )
# up
reversed_block_out_channels = list(reversed(block_out_channels))
@@ -208,18 +203,15 @@ def __init__(
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
attn_num_head_channels=None,
- temb_channels=None, )
+ temb_channels=None,
+ )
self.up_blocks.append(up_block)
prev_output_channel = output_channel
# out
- self.conv_norm_out = nn.GroupNorm(
- num_channels=block_out_channels[0],
- num_groups=norm_num_groups,
- epsilon=1e-6)
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=1e-6)
self.conv_act = nn.Silu()
- self.conv_out = nn.Conv2D(
- block_out_channels[0], out_channels, 3, padding=1)
+ self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, 3, padding=1)
self.gradient_checkpointing = False
def forward(self, z):
@@ -255,8 +247,7 @@ def custom_forward(*inputs):
# (TODO, junnyu) check nan
# clamp inf values to enable fp16 training
- if (amp_state() or
- sample.dtype == paddle.float16) and paddle.isinf(sample).any():
+ if (amp_state() or sample.dtype == paddle.float16) and paddle.isinf(sample).any():
clamp_value = finfo(sample.dtype).max - 1000
sample = paddle.clip(sample, min=-clamp_value, max=clamp_value)
@@ -278,14 +269,15 @@ class VectorQuantizer(nn.Layer):
# backwards compatibility we use the buggy version by default, but you can
# specify legacy=False to fix it.
def __init__(
- self,
- n_e,
- vq_embed_dim,
- beta,
- remap=None,
- unknown_index="random",
- sane_index_shape=False,
- legacy=True, ):
+ self,
+ n_e,
+ vq_embed_dim,
+ beta,
+ remap=None,
+ unknown_index="random",
+ sane_index_shape=False,
+ legacy=True,
+ ):
super().__init__()
self.n_e = n_e
self.vq_embed_dim = vq_embed_dim
@@ -306,8 +298,10 @@ def __init__(
if self.unknown_index == "extra":
self.unknown_index = self.re_embed
self.re_embed = self.re_embed + 1
- print(f"Remapping {self.n_e} indices to {self.re_embed} indices. "
- f"Using {self.unknown_index} for unknown indices.")
+ print(
+ f"Remapping {self.n_e} indices to {self.re_embed} indices. "
+ f"Using {self.unknown_index} for unknown indices."
+ )
else:
self.re_embed = n_e
@@ -322,8 +316,7 @@ def remap_to_used(self, inds):
new = match.argmax(-1)
unknown = match.sum(2) < 1
if self.unknown_index == "random":
- new[unknown] = paddle.randint(
- 0, self.re_embed, shape=new[unknown].shape)
+ new[unknown] = paddle.randint(0, self.re_embed, shape=new[unknown].shape)
else:
new[unknown] = self.unknown_index
return new.reshape(ishape)
@@ -335,8 +328,7 @@ def unmap_to_all(self, inds):
used = self.used.cast(inds.dtype)
if self.re_embed > self.used.shape[0]: # extra token
inds[inds >= self.used.shape[0]] = 0 # simply set to zero
- back = paddle.take_along_axis(
- used[None, :][inds.shape[0] * [0], :], inds, axis=1)
+ back = paddle.take_along_axis(used[None, :][inds.shape[0] * [0], :], inds, axis=1)
return back.reshape(ishape)
def forward(self, z):
@@ -345,9 +337,11 @@ def forward(self, z):
z_flattened = z.reshape([-1, self.vq_embed_dim])
# distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
- d = (paddle.sum(z_flattened**2, axis=1, keepdim=True) + paddle.sum(
- self.embedding.weight**2, axis=1) - 2 * paddle.matmul(
- z_flattened, self.embedding.weight, transpose_y=True))
+ d = (
+ paddle.sum(z_flattened**2, axis=1, keepdim=True)
+ + paddle.sum(self.embedding.weight**2, axis=1)
+ - 2 * paddle.matmul(z_flattened, self.embedding.weight, transpose_y=True)
+ )
min_encoding_indices = paddle.argmin(d, axis=1)
z_q = self.embedding(min_encoding_indices).reshape(z.shape)
@@ -356,11 +350,9 @@ def forward(self, z):
# compute loss for embedding
if not self.legacy:
- loss = self.beta * paddle.mean((z_q.detach() - z)**2) + paddle.mean(
- (z_q - z.detach())**2)
+ loss = self.beta * paddle.mean((z_q.detach() - z) ** 2) + paddle.mean((z_q - z.detach()) ** 2)
else:
- loss = paddle.mean((z_q.detach() - z)**2) + self.beta * paddle.mean(
- (z_q - z.detach())**2)
+ loss = paddle.mean((z_q.detach() - z) ** 2) + self.beta * paddle.mean((z_q - z.detach()) ** 2)
# preserve gradients
z_q = z + (z_q - z).detach()
@@ -369,15 +361,12 @@ def forward(self, z):
z_q = z_q.transpose([0, 3, 1, 2])
if self.remap is not None:
- min_encoding_indices = min_encoding_indices.reshape(
- [z.shape[0], -1]) # add batch axis
+ min_encoding_indices = min_encoding_indices.reshape([z.shape[0], -1]) # add batch axis
min_encoding_indices = self.remap_to_used(min_encoding_indices)
- min_encoding_indices = min_encoding_indices.reshape(
- [-1, 1]) # flatten
+ min_encoding_indices = min_encoding_indices.reshape([-1, 1]) # flatten
if self.sane_index_shape:
- min_encoding_indices = min_encoding_indices.reshape(
- [z_q.shape[0], z_q.shape[2], z_q.shape[3]])
+ min_encoding_indices = min_encoding_indices.reshape([z_q.shape[0], z_q.shape[2], z_q.shape[3]])
return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
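The expanded-distance comment above, (z - e)^2 = z^2 + e^2 - 2 e * z, is the only non-obvious step in the nearest-codebook lookup. A toy check that the vectorized form matches the direct pairwise distance (random tensors, not from the PR):

import paddle

z_flat = paddle.randn([6, 8])        # flattened latents, (N, D)
codebook = paddle.randn([16, 8])     # embedding weight, (K, D)

d_expanded = (
    paddle.sum(z_flat**2, axis=1, keepdim=True)
    + paddle.sum(codebook**2, axis=1)
    - 2 * paddle.matmul(z_flat, codebook, transpose_y=True)
)
d_direct = paddle.sum((z_flat.unsqueeze(1) - codebook.unsqueeze(0)) ** 2, axis=-1)
assert paddle.allclose(d_expanded, d_direct, atol=1e-5).item()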
@@ -386,7 +375,11 @@ def get_codebook_entry(self, indices, shape):
if self.remap is not None:
indices = indices.reshape([shape[0], -1]) # add batch axis
indices = self.unmap_to_all(indices)
- indices = indices.reshape([-1, ]) # flatten again
+ indices = indices.reshape(
+ [
+ -1,
+ ]
+ ) # flatten again
# get quantized latent vectors
z_q = self.embedding(indices)
@@ -408,14 +401,11 @@ def __init__(self, parameters, deterministic=False):
self.std = paddle.exp(0.5 * self.logvar)
self.var = paddle.exp(self.logvar)
if self.deterministic:
- self.var = self.std = paddle.zeros_like(
- self.mean, dtype=self.parameters.dtype)
+ self.var = self.std = paddle.zeros_like(self.mean, dtype=self.parameters.dtype)
- def sample(self,
- generator: Optional[paddle.Generator]=None) -> paddle.Tensor:
+ def sample(self, generator: Optional[paddle.Generator] = None) -> paddle.Tensor:
# make sure sample is on the same device as the parameters and has same dtype
- sample = randn_tensor(
- self.mean.shape, generator=generator, dtype=self.parameters.dtype)
+ sample = randn_tensor(self.mean.shape, generator=generator, dtype=self.parameters.dtype)
x = self.mean + self.std * sample
return x
@@ -426,21 +416,26 @@ def kl(self, other=None):
if other is None:
return 0.5 * paddle.sum(
paddle.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
- axis=[1, 2, 3], )
+ axis=[1, 2, 3],
+ )
else:
return 0.5 * paddle.sum(
- paddle.pow(self.mean - other.mean, 2) / other.var + self.var
- / other.var - 1.0 - self.logvar + other.logvar,
- axis=[1, 2, 3], )
+ paddle.pow(self.mean - other.mean, 2) / other.var
+ + self.var / other.var
+ - 1.0
+ - self.logvar
+ + other.logvar,
+ axis=[1, 2, 3],
+ )
def nll(self, sample, axis=[1, 2, 3]):
if self.deterministic:
return paddle.to_tensor([0.0])
logtwopi = np.log(2.0 * np.pi)
return 0.5 * paddle.sum(
- logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) /
- self.var,
- axis=axis, )
+ logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / self.var,
+ axis=axis,
+ )
def mode(self):
return self.mean
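For reference, the sampling and KL code above implement the usual reparameterization trick and the closed-form KL against a standard normal; written out with toy numbers (a sketch, no ppdiffusers import needed):

import paddle

mean = paddle.to_tensor([0.5, -1.0])
logvar = paddle.to_tensor([0.0, -2.0])
std = paddle.exp(0.5 * logvar)

noise = paddle.randn(mean.shape)
latent = mean + std * noise                                          # differentiable w.r.t. mean and logvar
kl = 0.5 * paddle.sum(mean**2 + paddle.exp(logvar) - 1.0 - logvar)   # KL(q || N(0, I))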
diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py
index 87a07653649cd..8104816e90486 100644
--- a/ppdiffusers/ppdiffusers/models/vq_model.py
+++ b/ppdiffusers/ppdiffusers/models/vq_model.py
@@ -69,20 +69,21 @@ class VQModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- in_channels: int=3,
- out_channels: int=3,
- down_block_types: Tuple[str]=("DownEncoderBlock2D", ),
- up_block_types: Tuple[str]=("UpDecoderBlock2D", ),
- block_out_channels: Tuple[int]=(64, ),
- layers_per_block: int=1,
- act_fn: str="silu",
- latent_channels: int=3,
- sample_size: int=32,
- num_vq_embeddings: int=256,
- norm_num_groups: int=32,
- vq_embed_dim: Optional[int]=None,
- scaling_factor: float=0.18215, ):
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+ block_out_channels: Tuple[int] = (64,),
+ layers_per_block: int = 1,
+ act_fn: str = "silu",
+ latent_channels: int = 3,
+ sample_size: int = 32,
+ num_vq_embeddings: int = 256,
+ norm_num_groups: int = 32,
+ vq_embed_dim: Optional[int] = None,
+ scaling_factor: float = 0.18215,
+ ):
super().__init__()
# pass init params to Encoder
@@ -94,7 +95,8 @@ def __init__(
layers_per_block=layers_per_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
- double_z=False, )
+ double_z=False,
+ )
vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
@@ -104,7 +106,8 @@ def __init__(
vq_embed_dim,
beta=0.25,
remap=None,
- sane_index_shape=False, )
+ sane_index_shape=False,
+ )
self.post_quant_conv = nn.Conv2D(vq_embed_dim, latent_channels, 1)
# pass init params to Decoder
@@ -115,22 +118,24 @@ def __init__(
block_out_channels=block_out_channels,
layers_per_block=layers_per_block,
act_fn=act_fn,
- norm_num_groups=norm_num_groups, )
+ norm_num_groups=norm_num_groups,
+ )
- def encode(self, x: paddle.Tensor, return_dict: bool=True):
+ def encode(self, x: paddle.Tensor, return_dict: bool = True):
h = self.encoder(x)
h = self.quant_conv(h)
if not return_dict:
- return (h, )
+ return (h,)
return VQEncoderOutput(latents=h)
def decode(
- self,
- h: paddle.Tensor,
- force_not_quantize: bool=False,
- return_dict: bool=True, ):
+ self,
+ h: paddle.Tensor,
+ force_not_quantize: bool = False,
+ return_dict: bool = True,
+ ):
# cast h to float16 / float32
h = h.cast(self.dtype)
# also go through quantization layer
@@ -142,11 +147,11 @@ def decode(
dec = self.decoder(quant)
if not return_dict:
- return (dec, )
+ return (dec,)
return DecoderOutput(sample=dec)
- def forward(self, sample: paddle.Tensor, return_dict: bool=True):
+ def forward(self, sample: paddle.Tensor, return_dict: bool = True):
r"""
Args:
sample (`paddle.Tensor`): Input sample.
@@ -158,6 +163,6 @@ def forward(self, sample: paddle.Tensor, return_dict: bool=True):
dec = self.decode(h).sample
if not return_dict:
- return (dec, )
+ return (dec,)
return DecoderOutput(sample=dec)
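A hedged round-trip sketch of the VQModel API whose signatures are reformatted above; the default config and the 32x32 input are assumptions chosen for illustration:

import paddle
from ppdiffusers.models.vq_model import VQModel

model = VQModel()                       # default single-block encoder/decoder
x = paddle.randn([1, 3, 32, 32])

latents = model.encode(x).latents       # VQEncoderOutput.latents
rec = model.decode(latents).sample      # DecoderOutput.sample (quantization happens inside decode)
assert rec.shape == x.shape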
diff --git a/ppdiffusers/ppdiffusers/optimization.py b/ppdiffusers/ppdiffusers/optimization.py
index 738ef9f4d113f..d6c5efafaed3f 100644
--- a/ppdiffusers/ppdiffusers/optimization.py
+++ b/ppdiffusers/ppdiffusers/optimization.py
@@ -34,7 +34,7 @@ class SchedulerType(Enum):
CONSTANT_WITH_WARMUP = "constant_with_warmup"
-def get_constant_schedule(learning_rate: float, last_epoch: int=-1):
+def get_constant_schedule(learning_rate: float, last_epoch: int = -1):
"""
Create a schedule with a constant learning rate, using the learning rate set in optimizer.
@@ -50,9 +50,7 @@ def get_constant_schedule(learning_rate: float, last_epoch: int=-1):
return LambdaDecay(learning_rate, lambda _: 1, last_epoch=last_epoch)
-def get_constant_schedule_with_warmup(learning_rate: float,
- num_warmup_steps: int,
- last_epoch: int=-1):
+def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_steps: int, last_epoch: int = -1):
"""
Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
increases linearly between 0 and the initial lr set in the optimizer.
@@ -78,10 +76,11 @@ def lr_lambda(current_step: int):
def get_linear_schedule_with_warmup(
- learning_rate: float,
- num_warmup_steps: int,
- num_training_steps: int,
- last_epoch: int=-1, ):
+ learning_rate: float,
+ num_warmup_steps: int,
+ num_training_steps: int,
+ last_epoch: int = -1,
+):
"""
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
@@ -105,18 +104,19 @@ def lr_lambda(current_step: int):
return float(current_step) / float(max(1, num_warmup_steps))
return max(
0.0,
- float(num_training_steps - current_step) /
- float(max(1, num_training_steps - num_warmup_steps)), )
+ float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)),
+ )
return LambdaDecay(learning_rate, lr_lambda, last_epoch)
def get_cosine_schedule_with_warmup(
- learning_rate: float,
- num_warmup_steps: int,
- num_training_steps: int,
- num_cycles: float=0.5,
- last_epoch: int=-1, ):
+ learning_rate: float,
+ num_warmup_steps: int,
+ num_training_steps: int,
+ num_cycles: float = 0.5,
+ last_epoch: int = -1,
+):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
@@ -142,21 +142,19 @@ def get_cosine_schedule_with_warmup(
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
- progress = float(current_step - num_warmup_steps) / float(
- max(1, num_training_steps - num_warmup_steps))
- return max(
- 0.0, 0.5 *
- (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
return LambdaDecay(learning_rate, lr_lambda, last_epoch)
def get_cosine_with_hard_restarts_schedule_with_warmup(
- learning_rate: float,
- num_warmup_steps: int,
- num_training_steps: int,
- num_cycles: int=1,
- last_epoch: int=-1, ):
+ learning_rate: float,
+ num_warmup_steps: int,
+ num_training_steps: int,
+ num_cycles: int = 1,
+ last_epoch: int = -1,
+):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
@@ -181,25 +179,25 @@ def get_cosine_with_hard_restarts_schedule_with_warmup(
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
- progress = float(current_step - num_warmup_steps) / float(
- max(1, num_training_steps - num_warmup_steps))
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
if progress >= 1.0:
return 0.0
return max(
0.0,
- 0.5 * (1.0 + math.cos(math.pi * (
- (float(num_cycles) * progress) % 1.0))), )
+ 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))),
+ )
return LambdaDecay(learning_rate, lr_lambda, last_epoch)
def get_polynomial_decay_schedule_with_warmup(
- learning_rate: float,
- num_warmup_steps: int,
- num_training_steps: int,
- lr_end: float=1e-7,
- power: float=1.0,
- last_epoch: int=-1, ):
+ learning_rate: float,
+ num_warmup_steps: int,
+ num_training_steps: int,
+ lr_end: float = 1e-7,
+ power: float = 1.0,
+ last_epoch: int = -1,
+):
"""
Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
@@ -230,8 +228,7 @@ def get_polynomial_decay_schedule_with_warmup(
lr_init = learning_rate
if not (lr_init > lr_end):
- raise ValueError(
- f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
+ raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
def lr_lambda(current_step: int):
if current_step < num_warmup_steps:
@@ -251,8 +248,7 @@ def lr_lambda(current_step: int):
TYPE_TO_SCHEDULER_FUNCTION = {
SchedulerType.LINEAR: get_linear_schedule_with_warmup,
SchedulerType.COSINE: get_cosine_schedule_with_warmup,
- SchedulerType.COSINE_WITH_RESTARTS:
- get_cosine_with_hard_restarts_schedule_with_warmup,
+ SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
SchedulerType.CONSTANT: get_constant_schedule,
SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
@@ -260,13 +256,14 @@ def lr_lambda(current_step: int):
def get_scheduler(
- name: Union[str, SchedulerType],
- learning_rate: float=0.1,
- num_warmup_steps: Optional[int]=None,
- num_training_steps: Optional[int]=None,
- num_cycles: int=1,
- power: float=1.0,
- last_epoch: int=-1, ):
+ name: Union[str, SchedulerType],
+ learning_rate: float = 0.1,
+ num_warmup_steps: Optional[int] = None,
+ num_training_steps: Optional[int] = None,
+ num_cycles: int = 1,
+ power: float = 1.0,
+ last_epoch: int = -1,
+):
"""
Unified API to get any scheduler from its name.
@@ -295,20 +292,18 @@ def get_scheduler(
# All other schedulers require `num_warmup_steps`
if num_warmup_steps is None:
- raise ValueError(
- f"{name} requires `num_warmup_steps`, please provide that argument.")
+ raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
if name == SchedulerType.CONSTANT_WITH_WARMUP:
return schedule_func(
learning_rate=learning_rate,
num_warmup_steps=num_warmup_steps,
- last_epoch=last_epoch, )
+ last_epoch=last_epoch,
+ )
# All other schedulers require `num_training_steps`
if num_training_steps is None:
- raise ValueError(
- f"{name} requires `num_training_steps`, please provide that argument."
- )
+ raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
if name == SchedulerType.COSINE_WITH_RESTARTS:
return schedule_func(
@@ -316,7 +311,8 @@ def get_scheduler(
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
- last_epoch=last_epoch, )
+ last_epoch=last_epoch,
+ )
if name == SchedulerType.POLYNOMIAL:
return schedule_func(
@@ -324,10 +320,12 @@ def get_scheduler(
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
power=power,
- last_epoch=last_epoch, )
+ last_epoch=last_epoch,
+ )
return schedule_func(
learning_rate=learning_rate,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
- last_epoch=last_epoch, )
+ last_epoch=last_epoch,
+ )
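All of the schedules above reduce to a per-step multiplier handed to LambdaDecay. A standalone sketch of the warmup-plus-cosine factor (the constants are made up, but the formula mirrors get_cosine_schedule_with_warmup):

import math

num_warmup_steps, num_training_steps, num_cycles = 10, 100, 0.5

def lr_factor(step):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))

assert lr_factor(0) == 0.0                 # warmup starts at zero
assert lr_factor(10) == 1.0                # full learning rate once warmup ends
assert abs(lr_factor(55) - 0.5) < 1e-9     # halfway through the cosine decay
assert abs(lr_factor(100)) < 1e-9          # decayed to (numerically) zero at the end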
diff --git a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py
index 24a72c1aa650d..74f6bdbb6b2b6 100644
--- a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py
+++ b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py
@@ -25,11 +25,25 @@
from typing import Any, Callable, Dict, List, Optional, Tuple
from ..utils import (
- DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, HF_HUB_OFFLINE,
- LOW_CPU_MEM_USAGE_DEFAULT, PPDIFFUSERS_CACHE, TO_DIFFUSERS, _add_variant,
- _get_model_file, get_logger, is_paddle_available, is_paddlenlp_available,
- is_ppxformers_available, is_safetensors_available, is_torch_available,
- is_torch_file, smart_load, str2bool)
+ DIFFUSERS_CACHE,
+ FROM_DIFFUSERS,
+ FROM_HF_HUB,
+ HF_HUB_OFFLINE,
+ LOW_CPU_MEM_USAGE_DEFAULT,
+ PPDIFFUSERS_CACHE,
+ TO_DIFFUSERS,
+ _add_variant,
+ _get_model_file,
+ get_logger,
+ is_paddle_available,
+ is_paddlenlp_available,
+ is_ppxformers_available,
+ is_safetensors_available,
+ is_torch_available,
+ is_torch_file,
+ smart_load,
+ str2bool,
+)
logger = get_logger(__name__)
@@ -60,8 +74,7 @@ def copy_func(f):
"Copy a non-builtin function (NB `copy.copy` does not work for this)"
if not isinstance(f, FunctionType):
return copy.copy(f)
- fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__,
- f.__closure__)
+ fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__)
fn.__kwdefaults__ = f.__kwdefaults__
fn.__dict__.update(f.__dict__)
fn.__annotations__.update(f.__annotations__)
@@ -81,7 +94,7 @@ def __get__(self, _, f_cls):
def patch_to(cls, as_prop=False, cls_method=False):
"Decorator: add `f` to `cls`"
if not isinstance(cls, (tuple, list)):
- cls = (cls, )
+ cls = (cls,)
def _inner(f):
for c_ in cls:
@@ -108,11 +121,11 @@ def _inner(f):
def is_floating_point(x):
if not isinstance(x, (paddle.Tensor, paddle.static.Variable)):
- raise TypeError(
- "Expected Tensor, but received type of x: {}".format(type(x)))
+ raise TypeError("Expected Tensor, but received type of x: {}".format(type(x)))
dtype = x.dtype
- is_fp_dtype = (dtype == paddle.float32 or dtype == paddle.float64 or
- dtype == paddle.float16 or dtype == paddle.bfloat16)
+ is_fp_dtype = (
+ dtype == paddle.float32 or dtype == paddle.float64 or dtype == paddle.float16 or dtype == paddle.bfloat16
+ )
return is_fp_dtype
if not hasattr(paddle, "is_floating_point"):
@@ -219,7 +232,8 @@ def Parameter(data: paddle.Tensor, requires_grad=True):
tensor = paddle.create_parameter(
data.shape,
dtype=data.dtype,
- default_initializer=nn.initializer.Assign(data), )
+ default_initializer=nn.initializer.Assign(data),
+ )
if not requires_grad:
tensor.stop_gradient = True
return tensor
@@ -247,8 +261,7 @@ def get_sublayer(self, target: str):
for item in atoms:
if not hasattr(mod, item):
- raise AttributeError(mod.__class__.__name__ + " has no "
- "attribute `" + item + "`")
+ raise AttributeError(mod.__class__.__name__ + " has no " "attribute `" + item + "`")
mod = getattr(mod, item)
@@ -259,23 +272,21 @@ def get_sublayer(self, target: str):
nn.Layer.get_sublayer = get_sublayer
class _WrappedHook:
- def __init__(self, hook: Callable, module: Optional["nn.Layer"]=None):
+ def __init__(self, hook: Callable, module: Optional["nn.Layer"] = None):
self.hook: Callable = hook
functools.update_wrapper(self, hook)
self.with_module: bool = False
if module is not None:
- self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(
- module)
+ self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(module)
self.with_module = True
def __call__(self, *args: Any, **kwargs: Any) -> Any:
if self.with_module:
module = self.module()
if module is None:
- raise RuntimeError(
- "You are trying to call the hook of a dead Module!")
+ raise RuntimeError("You are trying to call the hook of a dead Module!")
return self.hook(module, *args, **kwargs)
return self.hook(*args, **kwargs)
@@ -292,8 +303,7 @@ def __setstate__(self, state: Dict):
if self.with_module:
if state["module"] is None:
- raise RuntimeError(
- "You are trying to revive the hook of a dead Module!")
+ raise RuntimeError("You are trying to revive the hook of a dead Module!")
self.module = weakref.ref(state["module"])
try:
@@ -305,22 +315,20 @@ def register_load_state_dict_pre_hook(self, hook, with_module=False):
if not hasattr(self, "load_state_dict_pre_hooks"):
self.load_state_dict_pre_hooks = OrderedDict()
handle = HookRemoveHelper(self.load_state_dict_pre_hooks)
- self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(
- hook, self if with_module else None)
+ self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(hook, self if with_module else None)
return handle
nn.Layer.register_load_state_dict_pre_hook = register_load_state_dict_pre_hook
raw_set_state_dict = nn.Layer.set_state_dict
- def set_state_dict(self, state_dict, use_structured_name: bool=True):
+ def set_state_dict(self, state_dict, use_structured_name: bool = True):
if hasattr(self, "load_state_dict_pre_hooks"):
for hook in self.load_state_dict_pre_hooks.values():
hook(state_dict)
# POP is_torch_weight
state_dict.pop("is_torch_weight", None)
- return raw_set_state_dict(
- self, state_dict, use_structured_name=use_structured_name)
+ return raw_set_state_dict(self, state_dict, use_structured_name=use_structured_name)
nn.Layer.set_state_dict = set_state_dict
nn.Layer.load_dict = nn.Layer.set_state_dict
@@ -338,12 +346,12 @@ def set_state_dict(self, state_dict, use_structured_name: bool=True):
from ..utils.paddle_utils import no_init_weights
if is_ppxformers_available():
- from paddle.incubate.nn.memory_efficient_attention import \
- memory_efficient_attention
+ from paddle.incubate.nn.memory_efficient_attention import (
+ memory_efficient_attention,
+ )
from paddle.nn.functional.flash_attention import flash_attention
- sdp_kernel = paddle.nn.functional.flash_attention._select_sdp_cuda(128 +
- 64)
+ sdp_kernel = paddle.nn.functional.flash_attention._select_sdp_cuda(128 + 64)
if sdp_kernel == "mem_efficient":
flash_attn_version = 1
else:
@@ -353,33 +361,32 @@ def set_state_dict(self, state_dict, use_structured_name: bool=True):
flash_attn_error = None
try:
_ = flash_attention(
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16),
- paddle.ones(
- (1, 1, 2, 40), dtype=paddle.float16), )
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+ )
except Exception as error:
flash_attn_error = error
is_support_flash_attention = False
def scaled_dot_product_attention_(
- query,
- key,
- value,
- attn_mask=None,
- dropout_p=0.0,
- is_causal=False,
- scale=None,
- training=True,
- attention_op=None, ):
+ query,
+ key,
+ value,
+ attn_mask=None,
+ dropout_p=0.0,
+ is_causal=False,
+ scale=None,
+ training=True,
+ attention_op=None,
+ ):
if attention_op in [None, "auto"]:
head_dim = query.shape[-1]
attention_op = "cutlass"
if is_support_flash_attention and query.dtype in [
- paddle.float16,
- paddle.bfloat16,
+ paddle.float16,
+ paddle.bfloat16,
]:
if flash_attn_version == 1:
if head_dim <= 128:
@@ -403,17 +410,12 @@ def scaled_dot_product_attention_(
else:
if attn_mask is not None:
attn_mask = paddle.transpose(attn_mask, [0, 2, 1, 3])
- if (attn_mask.cast("float32").min() == 0 and
- attn_mask.cast("float32").max() == 1):
+ if attn_mask.cast("float32").min() == 0 and attn_mask.cast("float32").max() == 1:
attn_mask = (attn_mask.cast(s.dtype) - 1) * 10000.0
s = s + attn_mask
p = paddle.nn.functional.softmax(s, axis=-1)
if dropout_p > 0.0:
- p = paddle.nn.functional.dropout(
- p,
- dropout_p,
- training=training,
- mode="upscale_in_train")
+ p = paddle.nn.functional.dropout(p, dropout_p, training=training, mode="upscale_in_train")
o = paddle.matmul(p, vt)
return paddle.transpose(o, [0, 2, 1, 3])
elif attention_op == "cutlass":
@@ -427,7 +429,8 @@ def scaled_dot_product_attention_(
None,
p=dropout_p if training else 0.0,
scale=scale,
- training=True, ) # make sure we use training=True
+ training=True,
+ ) # make sure we use training=True
elif attention_op == "flash":
output = flash_attention(
query,
@@ -435,15 +438,13 @@ def scaled_dot_product_attention_(
value,
dropout=dropout_p,
causal=is_causal,
- return_softmax=False, )[0]
+ return_softmax=False,
+ )[0]
else:
- raise ValueError(
- "ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']"
- )
+ raise ValueError("ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']")
return output
- paddle.nn.functional.scaled_dot_product_attention_ = (
- scaled_dot_product_attention_)
+ paddle.nn.functional.scaled_dot_product_attention_ = scaled_dot_product_attention_
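A minimal usage sketch of the patched entry point registered above. It assumes ppdiffusers has been imported on an install where is_ppxformers_available() is True, and uses the "math" path so no flash/cutlass kernel is required; shapes follow the (batch, seq_len, num_heads, head_dim) convention of the probe tensors above:

import paddle
import paddle.nn.functional as F
import ppdiffusers  # noqa: F401  (importing applies the monkey patch above)

q = paddle.randn([1, 16, 8, 64])
k = paddle.randn([1, 16, 8, 64])
v = paddle.randn([1, 16, 8, 64])

out = F.scaled_dot_product_attention_(q, k, v, attention_op="math")
assert tuple(out.shape) == tuple(q.shape)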
@patch_to(nn.Layer, as_prop=True)
def dtype(parameter: nn.Layer) -> paddle.dtype:
@@ -474,8 +475,10 @@ def device(self):
from shutil import copyfile
import sentencepiece as spm
- from paddlenlp.transformers.tokenizer_utils import (AddedToken,
- PretrainedTokenizer)
+ from paddlenlp.transformers.tokenizer_utils import (
+ AddedToken,
+ PretrainedTokenizer,
+ )
SPIECE_UNDERLINE = "▁"
@@ -495,24 +498,24 @@ class XLMRobertaTokenizer(PretrainedTokenizer):
model_input_names = ["input_ids", "attention_mask"]
def __init__(
- self,
- vocab_file,
- bos_token="",
- eos_token="",
- sep_token="",
- cls_token="",
- unk_token="",
- pad_token="",
- mask_token="",
- sp_model_kwargs: Optional[Dict[str, Any]]=None,
- **kwargs, ) -> None:
+ self,
+ vocab_file,
+ bos_token="",
+ eos_token="",
+ sep_token="",
+ cls_token="",
+ unk_token="",
+ pad_token="",
+ mask_token="",
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ **kwargs,
+ ) -> None:
# Mask token behave like a normal word, i.e. include the space before it
- mask_token = (AddedToken(
- mask_token, lstrip=True, rstrip=False)
- if isinstance(mask_token, str) else mask_token)
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+ )
- self.sp_model_kwargs = ({} if sp_model_kwargs is None else
- sp_model_kwargs)
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__(
bos_token=bos_token,
@@ -523,10 +526,10 @@ def __init__(
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
- **kwargs, )
+ **kwargs,
+ )
- self.sp_model = spm.SentencePieceProcessor(
- **self.sp_model_kwargs)
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -547,12 +550,8 @@ def __init__(
# The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
self.fairseq_offset = 1
- self.fairseq_tokens_to_ids["<mask>"] = (
- len(self.sp_model) + self.fairseq_offset)
- self.fairseq_ids_to_tokens = {
- v: k
- for k, v in self.fairseq_tokens_to_ids.items()
- }
+ self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
+ self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
def __getstate__(self):
state = self.__dict__.copy()
@@ -567,14 +566,12 @@ def __setstate__(self, d):
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
- self.sp_model = spm.SentencePieceProcessor(
- **self.sp_model_kwargs)
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
def build_inputs_with_special_tokens(
- self,
- token_ids_0: List[int],
- token_ids_1: Optional[List[int]]=None) -> List[int]:
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:
@@ -590,17 +587,17 @@ def build_inputs_with_special_tokens(
"""
if token_ids_1 is None:
- return [self.cls_token_id
- ] + token_ids_0 + [self.sep_token_id]
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
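A plain-Python trace of the layout the method above builds (the ids are placeholders, not real vocabulary ids):

cls_id, sep_id = 0, 2
ids_a = [11, 12, 13]
ids_b = [21, 22]

single = [cls_id] + ids_a + [sep_id]                               # <s> A </s>
pair = [cls_id] + ids_a + [sep_id] + [sep_id] + ids_b + [sep_id]   # <s> A </s></s> B </s>

assert single == [0, 11, 12, 13, 2]
assert pair == [0, 11, 12, 13, 2, 2, 21, 22, 2]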
def get_special_tokens_mask(
- self,
- token_ids_0: List[int],
- token_ids_1: Optional[List[int]]=None,
- already_has_special_tokens: bool=False, ) -> List[int]:
+ self,
+ token_ids_0: List[int],
+ token_ids_1: Optional[List[int]] = None,
+ already_has_special_tokens: bool = False,
+ ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
@@ -619,17 +616,16 @@ def get_special_tokens_mask(
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
- already_has_special_tokens=True, )
+ already_has_special_tokens=True,
+ )
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
- return ([1] + ([0] * len(token_ids_0)) + [1, 1] +
- ([0] * len(token_ids_1)) + [1])
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
- self,
- token_ids_0: List[int],
- token_ids_1: Optional[List[int]]=None) -> List[int]:
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
not make use of token type ids, therefore a list of zeros is returned.
@@ -647,19 +643,14 @@ def create_token_type_ids_from_sequences(
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
- return len(cls + token_ids_0 + sep + sep + token_ids_1 +
- sep) * [0]
+ return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property
def vocab_size(self):
- return (len(self.sp_model) + self.fairseq_offset + 1
- ) # Add the <mask> token
+ return len(self.sp_model) + self.fairseq_offset + 1 # Add the <mask> token
def get_vocab(self):
- vocab = {
- self.convert_ids_to_tokens(i): i
- for i in range(self.vocab_size)
- }
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
@@ -683,33 +674,28 @@ def _convert_id_to_token(self, index):
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
- out_string = "".join(tokens).replace(SPIECE_UNDERLINE,
- " ").strip()
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
- def save_vocabulary(
- self, save_directory: str,
- filename_prefix: Optional[str]=None) -> Tuple[str]:
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
- logger.error(
- f"Vocabulary path ({save_directory}) should be a directory"
- )
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
- (filename_prefix + "-" if filename_prefix else "") +
- self.resource_files_names["vocab_file"], )
+ (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
+ )
- if os.path.abspath(self.vocab_file) != os.path.abspath(
- out_vocab_file) and os.path.isfile(self.vocab_file):
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(
+ self.vocab_file
+ ):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
- content_spiece_model = self.sp_model.serialized_model_proto(
- )
+ content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
- return (out_vocab_file, )
+ return (out_vocab_file,)
paddlenlp.transformers.XLMRobertaTokenizer = XLMRobertaTokenizer
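For reference, the layouts the tokenizer methods above produce, written out as a short comment sketch (illustrative only; the token strings are the standard XLM-RoBERTa specials):
# Illustration only -- not part of the diff.
#   single sequence:    <s> A </s>                    -> cls + ids_a + sep
#   pair of sequences:  <s> A </s> </s> B </s>        -> cls + ids_a + sep + sep + ids_b + sep
#   special-tokens mask (pair): [1] + [0]*len(A) + [1, 1] + [0]*len(B) + [1]
#   token_type_ids: all zeros, since XLM-RoBERTa does not use token types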
@@ -719,16 +705,17 @@ def save_vocabulary(
BertModel.raw_forward = BertModel.forward
def forward_new(
- self,
- input_ids: paddle.Tensor,
- token_type_ids: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]]=None,
- use_cache: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- output_attentions: Optional[bool]=None,
- return_dict: Optional[bool]=None, ):
+ self,
+ input_ids: paddle.Tensor,
+ token_type_ids: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
if attention_mask is None:
attention_mask = paddle.ones_like(input_ids)
return self.raw_forward(
@@ -740,7 +727,8 @@ def forward_new(
use_cache=use_cache,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
BertModel.forward = forward_new
@@ -748,13 +736,10 @@ def forward_new(
TRANSFORMERS_WEIGHTS_NAME = "pytorch_model.bin"
# patch from_pretrained and save_pretrained
- def from_pretrained_v3(cls,
- pretrained_model_name_or_path,
- *args,
- from_hf_hub: bool=False,
- **kwargs):
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ def from_pretrained_v3(cls, pretrained_model_name_or_path, *args, from_hf_hub: bool = False, **kwargs):
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
force_download = kwargs.pop("force_download", False)
from_diffusers = kwargs.pop("from_diffusers", None)
@@ -773,8 +758,7 @@ def from_pretrained_v3(cls,
paddle_dtype = _dtype
subfolder = kwargs.pop("subfolder", None)
variant = kwargs.pop("variant", None)
- low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage",
- LOW_CPU_MEM_USAGE_DEFAULT)
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT)
user_agent = {
"ppdiffusers": __version__,
@@ -787,8 +771,7 @@ def from_pretrained_v3(cls,
model_kwargs = kwargs
# 1. get the PretrainedConfig to init model
if not isinstance(config, PretrainedConfig):
- config_path = (config if config is not None else
- pretrained_model_name_or_path)
+ config_path = config if config is not None else pretrained_model_name_or_path
# TODO fix config from_pretrained
# must from hf hub
@@ -797,9 +780,11 @@ def from_pretrained_v3(cls,
kwargs["subfolder"] = subfolder
else:
if subfolder is not None:
- config_path = (os.path.join(config_path, subfolder)
- if os.path.isdir(config_path) else
- "/".join([config_path, subfolder]))
+ config_path = (
+ os.path.join(config_path, subfolder)
+ if os.path.isdir(config_path)
+ else "/".join([config_path, subfolder])
+ )
config, model_kwargs = cls.config_class.from_pretrained(
config_path,
@@ -807,12 +792,12 @@ def from_pretrained_v3(cls,
return_unused_kwargs=True,
force_download=force_download,
from_hf_hub=from_hf_hub,
- **kwargs, )
+ **kwargs,
+ )
assert config is not None
# we will remove in the future.
- if not from_hf_hub and not os.path.exists(
- os.path.join(cache_dir, config_path, "config.json")):
+ if not from_hf_hub and not os.path.exists(os.path.join(cache_dir, config_path, "config.json")):
config.save_pretrained(os.path.join(cache_dir, config_path))
if paddle_dtype is None:
@@ -825,8 +810,7 @@ def from_pretrained_v3(cls,
try:
model_file = _get_model_file(
pretrained_model_name_or_path,
- weights_name=_add_variant(
- TRANSFORMERS_SAFE_WEIGHTS_NAME, variant),
+ weights_name=_add_variant(TRANSFORMERS_SAFE_WEIGHTS_NAME, variant),
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -836,15 +820,15 @@ def from_pretrained_v3(cls,
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
except Exception: # noqa: E722
model_file = None
pass
if model_file is None:
model_file = _get_model_file(
pretrained_model_name_or_path,
- weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME,
- variant),
+ weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME, variant),
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -854,7 +838,8 @@ def from_pretrained_v3(cls,
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
else:
model_file = _get_model_file(
pretrained_model_name_or_path,
@@ -868,19 +853,20 @@ def from_pretrained_v3(cls,
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
assert model_file is not None
# try load model_file with paddle / torch / safetensor
state_dict = smart_load(model_file)
init_contexts = []
- dtype = set(v.dtype for v in state_dict.values()
- if paddle.is_tensor(v) and paddle.is_floating_point(v))
+ dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v))
if len(dtype) > 1 and paddle.float32 not in dtype:
raise ValueError(
f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please"
- f" make sure that {model_file} weights have only one dtype.")
+ f" make sure that {model_file} weights have only one dtype."
+ )
elif len(dtype) > 1 and paddle.float32 in dtype:
dtype = paddle.float32
elif len(dtype) == 0:
@@ -900,22 +886,18 @@ def from_pretrained_v3(cls,
model = cls(config, **model_kwargs)
# convert weights
- if (from_diffusers or is_torch_file(model_file)) and hasattr(
- cls, "smart_convert"):
+ if (from_diffusers or is_torch_file(model_file)) and hasattr(cls, "smart_convert"):
state_dict = cls.smart_convert(state_dict, model)
loaded_state_dict_keys = list(state_dict.keys())
- (
- model,
- missing_keys,
- unexpected_keys,
- mismatched_keys, ) = cls._load_pretrained_model_old(
- model=model,
- state_dict=state_dict,
- loaded_keys=loaded_state_dict_keys,
- ignore_mismatched_sizes=ignore_mismatched_sizes,
- dtype=None, )
+ (model, missing_keys, unexpected_keys, mismatched_keys,) = cls._load_pretrained_model_old(
+ model=model,
+ state_dict=state_dict,
+ loaded_keys=loaded_state_dict_keys,
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
+ dtype=None,
+ )
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
@@ -941,9 +923,7 @@ def from_pretrained_v3(cls,
" (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
)
else:
- logger.info(
- f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n"
- )
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
@@ -956,17 +936,21 @@ def from_pretrained_v3(cls,
f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint"
f" was trained on, you can already use {model.__class__.__name__} for predictions without further"
- " training.")
+ " training."
+ )
if len(mismatched_keys) > 0:
- mismatched_warning = "\n".join([
- f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
- for key, shape1, shape2 in mismatched_keys
- ])
+ mismatched_warning = "\n".join(
+ [
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
+ for key, shape1, shape2 in mismatched_keys
+ ]
+ )
logger.warning(
f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able"
- " to use it for predictions and inference.")
+ " to use it for predictions and inference."
+ )
if output_loading_info:
return model, loading_info
@@ -979,12 +963,13 @@ def from_pretrained_v3(cls,
@classmethod
def _load_pretrained_model_old(
- cls,
- model: PretrainedModel,
- state_dict: Dict[str, paddle.Tensor],
- loaded_keys: List[str],
- ignore_mismatched_sizes=False,
- dtype=None, ) -> Tuple[List[str]]:
+ cls,
+ model: PretrainedModel,
+ state_dict: Dict[str, paddle.Tensor],
+ loaded_keys: List[str],
+ ignore_mismatched_sizes=False,
+ dtype=None,
+ ) -> Tuple[List[str]]:
model_state_dict = model.state_dict()
expected_keys = list(model_state_dict.keys())
@@ -992,8 +977,7 @@ def _load_pretrained_model_old(
if len(prefix) > 0:
has_prefix_module = any(s.startswith(prefix) for s in loaded_keys)
- expects_prefix_module = any(
- s.startswith(prefix) for s in expected_keys)
+ expects_prefix_module = any(s.startswith(prefix) for s in expected_keys)
else:
has_prefix_module = False
expects_prefix_module = False
@@ -1004,10 +988,7 @@ def _load_pretrained_model_old(
add_prefix_to_model = has_prefix_module and not expects_prefix_module
if remove_prefix_from_model:
- expected_keys = [
- ".".join(s.split(".")[1:]) if s.startswith(prefix) else s
- for s in expected_keys
- ]
+ expected_keys = [".".join(s.split(".")[1:]) if s.startswith(prefix) else s for s in expected_keys]
elif add_prefix_to_model:
expected_keys = [".".join([prefix, s]) for s in expected_keys]
@@ -1018,31 +999,26 @@ def _load_pretrained_model_old(
# the user.
if cls._keys_to_ignore_on_load_missing is not None:
for pat in cls._keys_to_ignore_on_load_missing:
- missing_keys = [
- k for k in missing_keys if re.search(pat, k) is None
- ]
+ missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
if cls._keys_to_ignore_on_load_unexpected is not None:
for pat in cls._keys_to_ignore_on_load_unexpected:
- unexpected_keys = [
- k for k in unexpected_keys if re.search(pat, k) is None
- ]
+ unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
# Make sure we are able to load base models as well as derived models (with heads)
start_prefix = ""
model_to_load = model
- if (len(cls.base_model_prefix) > 0 and
- not hasattr(model, cls.base_model_prefix) and
- has_prefix_module):
+ if len(cls.base_model_prefix) > 0 and not hasattr(model, cls.base_model_prefix) and has_prefix_module:
start_prefix = cls.base_model_prefix + "."
def _find_mismatched_keys(
- state_dict,
- model_state_dict,
- loaded_keys,
- add_prefix_to_model,
- remove_prefix_from_model,
- ignore_mismatched_sizes, ):
+ state_dict,
+ model_state_dict,
+ loaded_keys,
+ add_prefix_to_model,
+ remove_prefix_from_model,
+ ignore_mismatched_sizes,
+ ):
mismatched_keys = []
if ignore_mismatched_sizes:
for checkpoint_key in loaded_keys:
@@ -1054,13 +1030,17 @@ def _find_mismatched_keys(
# The model key doesn't start with `prefix` but `checkpoint_key` does so we remove it.
model_key = ".".join(checkpoint_key.split(".")[1:])
- if (model_key in model_state_dict and
- state_dict[checkpoint_key].shape !=
- model_state_dict[model_key].shape):
- mismatched_keys.append((
- checkpoint_key,
- state_dict[checkpoint_key].shape,
- model_state_dict[model_key].shape, ))
+ if (
+ model_key in model_state_dict
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
+ ):
+ mismatched_keys.append(
+ (
+ checkpoint_key,
+ state_dict[checkpoint_key].shape,
+ model_state_dict[model_key].shape,
+ )
+ )
del state_dict[checkpoint_key]
return mismatched_keys
@@ -1071,7 +1051,8 @@ def _find_mismatched_keys(
loaded_keys,
add_prefix_to_model,
remove_prefix_from_model,
- ignore_mismatched_sizes, )
+ ignore_mismatched_sizes,
+ )
start_prefix = prefix + "."
@@ -1090,8 +1071,7 @@ def _find_mismatched_keys(
if add_prefix_to_model:
for key in list(state_dict.keys()):
if key.startswith(start_prefix):
- state_dict[key.replace(start_prefix, "")] = state_dict.pop(
- key)
+ state_dict[key.replace(start_prefix, "")] = state_dict.pop(key)
if remove_prefix_from_model:
for key in list(state_dict.keys()):
@@ -1126,12 +1106,9 @@ def _find_mismatched_keys(
# this is the temp hard code for fused-mt transformer
if model.keep_in_fp32_modules(key, model.config, dtype):
target_dtype = "float32"
- state_dict[key] = paddle.cast(
- state_dict[key], dtype=target_dtype)
+ state_dict[key] = paddle.cast(state_dict[key], dtype=target_dtype)
else:
- raise ValueError(
- f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid"
- )
+ raise ValueError(f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid")
else:
dtype_prefix_len = len("paddle.")
for k, v in model_to_load.state_dict().items():
@@ -1155,8 +1132,7 @@ def _find_mismatched_keys(
# To avoid recursive import temporarily.
import paddlenlp.ops.fast_transformer.transformer.decoding as ft_decoding
- state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(
- model_to_load, state_dict)
+ state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(model_to_load, state_dict)
if paddle.in_dynamic_mode():
model_to_load.set_state_dict(state_to_load)
@@ -1170,19 +1146,20 @@ def _find_mismatched_keys(
@classmethod
def from_pretrained(
- cls,
- pretrained_model_name_or_path,
- *args,
- from_hf_hub=False,
- subfolder=None,
- paddle_dtype=None,
- from_diffusers=None,
- variant=None,
- **kwargs, ):
+ cls,
+ pretrained_model_name_or_path,
+ *args,
+ from_hf_hub=False,
+ subfolder=None,
+ paddle_dtype=None,
+ from_diffusers=None,
+ variant=None,
+ **kwargs,
+ ):
try:
if cls.constructed_from_pretrained_config() and (
- hasattr(cls, "smart_convert") or
- hasattr(cls, "register_load_torch_hook")):
+ hasattr(cls, "smart_convert") or hasattr(cls, "register_load_torch_hook")
+ ):
return from_pretrained_v3(
cls,
pretrained_model_name_or_path,
@@ -1192,7 +1169,8 @@ def from_pretrained(
paddle_dtype=paddle_dtype,
from_diffusers=from_diffusers,
variant=variant,
- **kwargs, )
+ **kwargs,
+ )
except Exception:
pass
@@ -1206,7 +1184,8 @@ def from_pretrained(
from_hf_hub=from_hf_hub,
subfolder=subfolder,
dtype=dtype,
- **kwargs, )
+ **kwargs,
+ )
PretrainedModel.from_pretrained = from_pretrained
@@ -1214,51 +1193,43 @@ def from_pretrained(
from safetensors.numpy import save_file as safetensors_numpy_save_file
if is_torch_available():
- from safetensors.torch import \
- save_file as safetensors_torch_save_file
+ from safetensors.torch import save_file as safetensors_torch_save_file
if is_torch_available():
import torch
def save_pretrained_v3(
- self: PretrainedModel,
- save_directory: str,
- is_main_process: bool=True,
- save_function: Callable=None,
- safe_serialization: bool=False,
- variant: Optional[str]=None,
- to_diffusers: Optional[bool]=None, ):
- from ..models.modeling_pytorch_paddle_utils import \
- convert_paddle_state_dict_to_pytorch
+ self: PretrainedModel,
+ save_directory: str,
+ is_main_process: bool = True,
+ save_function: Callable = None,
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ to_diffusers: Optional[bool] = None,
+ ):
+ from ..models.modeling_pytorch_paddle_utils import (
+ convert_paddle_state_dict_to_pytorch,
+ )
from ..models.modeling_utils import convert_state_dict
if to_diffusers is None:
to_diffusers = TO_DIFFUSERS
- if to_diffusers and safe_serialization and not is_safetensors_available(
- ):
- raise ImportError(
- "`safe_serialization` requires the `safetensors library: `pip install safetensors`."
- )
+ if to_diffusers and safe_serialization and not is_safetensors_available():
+ raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.")
if os.path.isfile(save_directory):
- logger.error(
- f"Provided path ({save_directory}) should be a directory, not a file"
- )
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
- model_to_save = self._layers if isinstance(
- self, paddle.DataParallel) else self
+ model_to_save = self._layers if isinstance(self, paddle.DataParallel) else self
if is_main_process:
try:
- model_to_save.config.dtype = str(model_to_save._dtype).split(
- ".")[-1]
+ model_to_save.config.dtype = str(model_to_save._dtype).split(".")[-1]
except:
model_to_save.config.dtype = "float32"
# Attach architecture to the config
- model_to_save.config.architectures = [
- model_to_save.__class__.__name__
- ]
+ model_to_save.config.architectures = [model_to_save.__class__.__name__]
model_to_save.config.save_pretrained(save_directory)
@@ -1273,12 +1244,10 @@ def save_pretrained_v3(
if safe_serialization:
if is_torch_available():
save_function = safetensors_torch_save_file
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
else:
save_function = safetensors_numpy_save_file
- state_dict = convert_state_dict(
- state_dict, framework="numpy")
+ state_dict = convert_state_dict(state_dict, framework="numpy")
weights_name = _add_variant("model.safetensors", variant)
else:
if not is_torch_available():
@@ -1287,11 +1256,9 @@ def save_pretrained_v3(
)
save_function = torch.save
weights_name = _add_variant("pytorch_model.bin", variant)
- state_dict = convert_state_dict(
- state_dict, framework="torch")
+ state_dict = convert_state_dict(state_dict, framework="torch")
- state_dict = convert_paddle_state_dict_to_pytorch(state_dict,
- model_to_save)
+ state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save)
else:
save_function = paddle.save
weights_name = _add_variant("model_state.pdparams", variant)
@@ -1299,24 +1266,22 @@ def save_pretrained_v3(
# Save the model
save_function(state_dict, os.path.join(save_directory, weights_name))
- logger.info(
- f"Model weights saved in {os.path.join(save_directory, weights_name)}"
- )
+ logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
def save_pretrained(
- self,
- save_dir: str,
- is_main_process: bool=True,
- state_dict=None,
- save_function: Callable=None,
- max_shard_size="10GB",
- safe_serialization: bool=False,
- variant: Optional[str]=None,
- to_diffusers: Optional[bool]=None,
- *args,
- **kwargs, ):
- if self.constructed_from_pretrained_config() and hasattr(
- self, "smart_convert"):
+ self,
+ save_dir: str,
+ is_main_process: bool = True,
+ state_dict=None,
+ save_function: Callable = None,
+ max_shard_size="10GB",
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ to_diffusers: Optional[bool] = None,
+ *args,
+ **kwargs,
+ ):
+ if self.constructed_from_pretrained_config() and hasattr(self, "smart_convert"):
return save_pretrained_v3(
self,
save_dir,
@@ -1324,7 +1289,8 @@ def save_pretrained(
save_function=save_function,
safe_serialization=safe_serialization,
variant=variant,
- to_diffusers=to_diffusers, )
+ to_diffusers=to_diffusers,
+ )
return raw_save_pretrained(
self,
save_dir=save_dir,
@@ -1335,32 +1301,40 @@ def save_pretrained(
safe_serialization=safe_serialization,
variant=variant,
*args,
- **kwargs, )
+ **kwargs,
+ )
PretrainedModel.save_pretrained = save_pretrained
from paddlenlp.transformers import (
- BertModel, BitBackbone, ClapTextModelWithProjection, CLIPTextModel,
- CLIPTextModelWithProjection, CLIPVisionModel,
- CLIPVisionModelWithProjection, DPTForDepthEstimation, SpeechT5HifiGan,
- T5EncoderModel)
+ BertModel,
+ BitBackbone,
+ ClapTextModelWithProjection,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPVisionModel,
+ CLIPVisionModelWithProjection,
+ DPTForDepthEstimation,
+ SpeechT5HifiGan,
+ T5EncoderModel,
+ )
if not hasattr(T5EncoderModel, "_keep_in_fp32_modules"):
T5EncoderModel._keep_in_fp32_modules = ["wo"]
- from ..models.modeling_pytorch_paddle_utils import \
- convert_pytorch_state_dict_to_paddle_class_method
- from ..pipelines.alt_diffusion.modeling_roberta_series import \
- RobertaSeriesModelWithTransformation
+ from ..models.modeling_pytorch_paddle_utils import (
+ convert_pytorch_state_dict_to_paddle_class_method,
+ )
+ from ..pipelines.alt_diffusion.modeling_roberta_series import (
+ RobertaSeriesModelWithTransformation,
+ )
from ..pipelines.deepfloyd_if.safety_checker import IFSafetyChecker
- from ..pipelines.latent_diffusion.pipeline_latent_diffusion import \
- LDMBertModel
- from ..pipelines.paint_by_example.image_encoder import \
- PaintByExampleImageEncoder
- from ..pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
- from ..pipelines.stable_diffusion_safe.safety_checker import \
- SafeStableDiffusionSafetyChecker
+ from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel
+ from ..pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder
+ from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+ from ..pipelines.stable_diffusion_safe.safety_checker import (
+ SafeStableDiffusionSafetyChecker,
+ )
@classmethod
def clip_smart_convert(cls, state_dict, pd_model):
@@ -1380,7 +1354,9 @@ def clip_smart_convert(cls, state_dict, pd_model):
".pre_layrnorm.": ".ln_pre.",
".post_layernorm.": ".ln_post.",
}
- ignore_value = ["position_ids", ]
+ ignore_value = [
+ "position_ids",
+ ]
if cls in [PaintByExampleImageEncoder]:
# ignore mapper. prefix, we will use convert_pytorch_state_dict_to_paddle to convert mapper.xxxx state_dict
ignore_value.append("mapper.")
@@ -1410,11 +1386,11 @@ def clip_smart_convert(cls, state_dict, pd_model):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale" and value.ndim == 1:
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name and cls in [
- StableDiffusionSafetyChecker,
- SafeStableDiffusionSafetyChecker,
+ StableDiffusionSafetyChecker,
+ SafeStableDiffusionSafetyChecker,
]:
name = "clip." + name
new_model_state[name] = value
@@ -1423,8 +1399,7 @@ def clip_smart_convert(cls, state_dict, pd_model):
if cls in [PaintByExampleImageEncoder]:
# convert mapper
- mappersd = cls.smart_convert(
- state_dict, pd_model, sub_layer="mapper.")
+ mappersd = cls.smart_convert(state_dict, pd_model, sub_layer="mapper.")
new_model_state.update(mappersd)
return new_model_state
@@ -1451,10 +1426,8 @@ def bert_smart_convert(cls, state_dict, pd_model):
# about cls predictions ignore
"cls.predictions.transform.dense": "cls.predictions.transform",
"cls.predictions.decoder.weight": "cls.predictions.decoder_weight",
- "cls.predictions.transform.LayerNorm.weight":
- "cls.predictions.layer_norm.weight",
- "cls.predictions.transform.LayerNorm.bias":
- "cls.predictions.layer_norm.bias",
+ "cls.predictions.transform.LayerNorm.weight": "cls.predictions.layer_norm.weight",
+ "cls.predictions.transform.LayerNorm.bias": "cls.predictions.layer_norm.bias",
"cls.predictions.bias": "cls.predictions.decoder_bias",
}
ignore_value = ["position_ids"]
@@ -1481,8 +1454,7 @@ def bert_smart_convert(cls, state_dict, pd_model):
def ldmbert_smart_convert(cls, state_dict, pd_model):
transformers2ppnlp = {
"model.embed_tokens.weight": "embeddings.word_embeddings.weight",
- "model.embed_positions.weight":
- "embeddings.position_embeddings.weight",
+ "model.embed_positions.weight": "embeddings.position_embeddings.weight",
"model.layer_norm.": "final_layer_norm.",
"model.layers": "encoder.layers",
".self_attn_layer_norm.": ".norm1.",
@@ -1513,14 +1485,14 @@ def ldmbert_smart_convert(cls, state_dict, pd_model):
LDMBertModel.smart_convert = ldmbert_smart_convert
for cls_ in [
- CLIPTextModel,
- CLIPTextModelWithProjection,
- CLIPVisionModel,
- CLIPVisionModelWithProjection,
- StableDiffusionSafetyChecker,
- SafeStableDiffusionSafetyChecker,
- PaintByExampleImageEncoder,
- IFSafetyChecker,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPVisionModel,
+ CLIPVisionModelWithProjection,
+ StableDiffusionSafetyChecker,
+ SafeStableDiffusionSafetyChecker,
+ PaintByExampleImageEncoder,
+ IFSafetyChecker,
]:
setattr(cls_, "smart_convert", clip_smart_convert)
@@ -1532,8 +1504,12 @@ def ldmbert_smart_convert(cls, state_dict, pd_model):
else:
# NEW TRANSFORMERS CLIP MODEL
from ..pipelines.stable_diffusion.hf_clip_model import (
- HFCLIPModel, HFCLIPTextModel, HFCLIPTextModelWithProjection,
- HFCLIPVisionModel, HFCLIPVisionModelWithProjection)
+ HFCLIPModel,
+ HFCLIPTextModel,
+ HFCLIPTextModelWithProjection,
+ HFCLIPVisionModel,
+ HFCLIPVisionModelWithProjection,
+ )
TRANSFORMERS_CLIP_MODEL = [
HFCLIPModel,
@@ -1543,29 +1519,27 @@ def ldmbert_smart_convert(cls, state_dict, pd_model):
HFCLIPVisionModelWithProjection,
]
for cls_ in [
- DPTForDepthEstimation,
- BitBackbone,
- SpeechT5HifiGan,
- ClapTextModelWithProjection,
- T5EncoderModel,
+ DPTForDepthEstimation,
+ BitBackbone,
+ SpeechT5HifiGan,
+ ClapTextModelWithProjection,
+ T5EncoderModel,
] + TRANSFORMERS_CLIP_MODEL:
- setattr(cls_, "smart_convert",
- convert_pytorch_state_dict_to_paddle_class_method)
+ setattr(cls_, "smart_convert", convert_pytorch_state_dict_to_paddle_class_method)
# TODO remove this when we upgrade ImageProcessingMixin
# patch get_image_processor_dict support subfolder.
IMAGE_PROCESSOR_NAME = "preprocessor_config.json"
- from paddlenlp.transformers.feature_extraction_utils import \
- FeatureExtractionMixin
- from paddlenlp.transformers.image_processing_utils import \
- ImageProcessingMixin
+ from paddlenlp.transformers.feature_extraction_utils import FeatureExtractionMixin
+ from paddlenlp.transformers.image_processing_utils import ImageProcessingMixin
@classmethod
def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs):
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
@@ -1589,12 +1563,11 @@ def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs):
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
- from_hf_hub=from_hf_hub, )
+ from_hf_hub=from_hf_hub,
+ )
try:
# Load image_processor dict
- with open(
- resolved_image_processor_file, "r",
- encoding="utf-8") as reader:
+ with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
text = reader.read()
image_processor_dict = json.loads(text)
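A minimal usage sketch of the patched loading/saving path above. The repo id, output directory, and dtype are placeholders; the keyword arguments (from_hf_hub, from_diffusers, paddle_dtype, to_diffusers, safe_serialization) are the ones defined in the patched signatures.
# Hypothetical example -- names and paths are placeholders, not values from this diff.
import paddle
from paddlenlp.transformers import CLIPTextModel

# Load torch/diffusers-format weights; smart_convert remaps them to Paddle parameter names.
text_encoder = CLIPTextModel.from_pretrained(
    "some-org/some-sd-checkpoint",   # placeholder repo id
    subfolder="text_encoder",
    from_hf_hub=True,
    from_diffusers=True,
    paddle_dtype=paddle.float16,
)

# Write the weights back out in diffusers format.
text_encoder.save_pretrained(
    "./text_encoder_export",         # placeholder directory
    to_diffusers=True,
    safe_serialization=True,         # needs the safetensors package
)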
diff --git a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py
index 367da2b281b53..7000346e862f7 100644
--- a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py
+++ b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py
@@ -32,41 +32,36 @@
def scatter_reduce(
- input: paddle.Tensor,
- dim: int,
- index: paddle.Tensor,
- src: paddle.Tensor,
- reduce: str="mean",
- include_self: bool=True, ) -> paddle.Tensor:
+ input: paddle.Tensor,
+ dim: int,
+ index: paddle.Tensor,
+ src: paddle.Tensor,
+ reduce: str = "mean",
+ include_self: bool = True,
+) -> paddle.Tensor:
# reduce "sum", "prod", "mean",
# TODO support "amax", "amin" and include_self = False
if reduce in ["sum", "assign", "add"]:
if reduce == "sum":
reduce = "add"
- input.put_along_axis_(
- indices=index, values=src, axis=dim, reduce=reduce)
+ input.put_along_axis_(indices=index, values=src, axis=dim, reduce=reduce)
elif reduce == "mean":
# compute sum first
input.put_along_axis_(indices=index, values=src, axis=dim, reduce="add")
# compute div secondly
input_div = paddle.ones_like(input).put_along_axis(
indices=index,
- values=paddle.to_tensor(
- 1.0, dtype=input.dtype),
+ values=paddle.to_tensor(1.0, dtype=input.dtype),
axis=dim,
- reduce="add", )
+ reduce="add",
+ )
input = input / input_div
elif reduce in ["prod", "mul", "multiply"]:
- input = paddle.put_along_axis(
- input.cpu(),
- indices=index.cpu(),
- values=src.cpu(),
- axis=dim,
- reduce="mul")._to(device=paddle.get_device())
- else:
- raise NotImplementedError(
- "only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!"
+ input = paddle.put_along_axis(input.cpu(), indices=index.cpu(), values=src.cpu(), axis=dim, reduce="mul")._to(
+ device=paddle.get_device()
)
+ else:
+ raise NotImplementedError("only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!")
return input
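A tiny worked example of the "mean" branch above (made-up 1-D values, include_self=True):
#   input = [10., 10., 10.], dim = 0, index = [0, 0, 2], src = [1., 3., 5.]
#   put_along_axis_(reduce="add")     -> input     = [14., 10., 15.]
#   ones, +1 scattered per index hit  -> input_div = [ 3.,  1.,  2.]
#   input / input_div                 -> [4.67, 10., 7.5]   (means of {10,1,3}, {10}, {10,5})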
@@ -75,18 +70,19 @@ def scatter_reduce(
paddle.Tensor.scatter_reduce = scatter_reduce
-def do_nothing(x: paddle.Tensor, mode: str=None):
+def do_nothing(x: paddle.Tensor, mode: str = None):
return x
def bipartite_soft_matching_random2d(
- metric: paddle.Tensor,
- w: int,
- h: int,
- sx: int,
- sy: int,
- r: int,
- no_rand: bool=False, ) -> Tuple[Callable, Callable]:
+ metric: paddle.Tensor,
+ w: int,
+ h: int,
+ sx: int,
+ sy: int,
+ r: int,
+ no_rand: bool = False,
+) -> Tuple[Callable, Callable]:
"""
Partitions the tokens into src and dst and merges r tokens from src to dst.
Dst tokens are partitioned by choosing one randomly in each (sx, sy) region.
@@ -112,24 +108,23 @@ def bipartite_soft_matching_random2d(
if no_rand:
rand_idx = paddle.zeros((hsy, wsx, 1), dtype=paddle.int64)
else:
- rand_idx = paddle.randint(
- sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64)
+ rand_idx = paddle.randint(sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64)
# The image might not divide sx and sy, so we need to work on a view of the top left of the idx buffer instead
idx_buffer_view = paddle.zeros([hsy, wsx, sy * sx], dtype=paddle.int64)
idx_buffer_view.put_along_axis_(
axis=2,
indices=rand_idx,
- values=-paddle.ones_like(
- rand_idx, dtype=rand_idx.dtype), )
- idx_buffer_view = (idx_buffer_view.reshape([hsy, wsx, sy, sx])
- .transpose([0, 2, 1, 3])
- .reshape([hsy * sy, wsx * sx]))
+ values=-paddle.ones_like(rand_idx, dtype=rand_idx.dtype),
+ )
+ idx_buffer_view = (
+ idx_buffer_view.reshape([hsy, wsx, sy, sx]).transpose([0, 2, 1, 3]).reshape([hsy * sy, wsx * sx])
+ )
# Image is not divisible by sx or sy so we need to move it into a new buffer
if (hsy * sy) < h or (wsx * sx) < w:
idx_buffer = paddle.zeros([h, w], dtype=paddle.int64)
- idx_buffer[:(hsy * sy), :(wsx * sx)] = idx_buffer_view
+ idx_buffer[: (hsy * sy), : (wsx * sx)] = idx_buffer_view
else:
idx_buffer = idx_buffer_view
@@ -147,10 +142,8 @@ def bipartite_soft_matching_random2d(
def split(x):
C = x.shape[-1]
- src = x.take_along_axis(
- indices=a_idx.expand([B, N - num_dst, C]), axis=1)
- dst = x.take_along_axis(
- indices=b_idx.expand([B, num_dst, C]), axis=1)
+ src = x.take_along_axis(indices=a_idx.expand([B, N - num_dst, C]), axis=1)
+ dst = x.take_along_axis(indices=b_idx.expand([B, num_dst, C]), axis=1)
return src, dst
# Cosine similarity between A and B
@@ -178,12 +171,10 @@ def merge(x: paddle.Tensor, mode="mean") -> paddle.Tensor:
src, dst = split(x)
n, t1, c = src.shape
- unm = src.take_along_axis(
- indices=unm_idx.expand([n, t1 - r, c]), axis=-2)
+ unm = src.take_along_axis(indices=unm_idx.expand([n, t1 - r, c]), axis=-2)
src = src.take_along_axis(indices=src_idx.expand([n, r, c]), axis=-2)
- dst = scatter_reduce(
- dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode)
+ dst = scatter_reduce(dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode)
return paddle.concat([unm, dst], axis=1)
@@ -200,25 +191,27 @@ def unmerge(x: paddle.Tensor) -> paddle.Tensor:
out.put_along_axis_(
indices=b_idx.expand([B, num_dst, c]),
values=dst,
- axis=-2, )
+ axis=-2,
+ )
out.put_along_axis_(
- indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis(
- indices=unm_idx, axis=1).expand([B, unm_len, c]),
+ indices=a_idx.expand([B, a_idx.shape[1], 1])
+ .take_along_axis(indices=unm_idx, axis=1)
+ .expand([B, unm_len, c]),
values=unm,
- axis=-2, )
+ axis=-2,
+ )
out.put_along_axis_(
- indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis(
- indices=src_idx, axis=1).expand([B, r, c]),
+ indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis(indices=src_idx, axis=1).expand([B, r, c]),
values=src,
- axis=-2, )
+ axis=-2,
+ )
return out
return merge, unmerge
-def compute_merge(x: paddle.Tensor,
- tome_info: Dict[str, Any]) -> Tuple[Callable, ...]:
+def compute_merge(x: paddle.Tensor, tome_info: Dict[str, Any]) -> Tuple[Callable, ...]:
original_h, original_w = tome_info["size"]
original_tokens = original_h * original_w
downsample = int(math.ceil(math.sqrt(original_tokens // x.shape[1])))
@@ -232,8 +225,7 @@ def compute_merge(x: paddle.Tensor,
# If the batch size is odd, then it's not possible for prompted and unprompted images to be in the same
# batch, which causes artifacts with use_rand, so force it to be off.
use_rand = False if x.shape[0] % 2 == 1 else args["use_rand"]
- m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"],
- r, not use_rand)
+ m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"], r, not use_rand)
else:
m, u = (do_nothing, do_nothing)
@@ -255,31 +247,27 @@ class ToMeBasicTransformerBlock(block_class):
_parent = block_class
def forward(
- self: BasicTransformerBlock,
- hidden_states,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- timestep=None,
- cross_attention_kwargs=None,
- class_labels=None, ) -> paddle.Tensor:
+ self: BasicTransformerBlock,
+ hidden_states,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ timestep=None,
+ cross_attention_kwargs=None,
+ class_labels=None,
+ ) -> paddle.Tensor:
# (1) ToMe
- m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states,
- self._tome_info)
+ m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states, self._tome_info)
if self.use_ada_layer_norm:
norm_hidden_states = self.norm1(hidden_states, timestep)
elif self.use_ada_layer_norm_zero:
- (
- norm_hidden_states,
- gate_msa,
- shift_mlp,
- scale_mlp,
- gate_mlp, ) = self.norm1(
- hidden_states,
- timestep,
- class_labels,
- hidden_dtype=hidden_states.dtype, )
+ (norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp,) = self.norm1(
+ hidden_states,
+ timestep,
+ class_labels,
+ hidden_dtype=hidden_states.dtype,
+ )
else:
norm_hidden_states = self.norm1(hidden_states)
@@ -287,15 +275,13 @@ def forward(
norm_hidden_states = m_a(norm_hidden_states)
# 1. Self-Attention
- cross_attention_kwargs = (cross_attention_kwargs
- if cross_attention_kwargs is not None else
- {})
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
attn_output = self.attn1(
norm_hidden_states,
- encoder_hidden_states=encoder_hidden_states
- if self.only_cross_attention else None,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
if self.use_ada_layer_norm_zero:
attn_output = gate_msa.unsqueeze(1) * attn_output
@@ -303,9 +289,9 @@ def forward(
hidden_states = u_a(attn_output) + hidden_states
if self.attn2 is not None:
- norm_hidden_states = (self.norm2(hidden_states, timestep)
- if self.use_ada_layer_norm else
- self.norm2(hidden_states))
+ norm_hidden_states = (
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+ )
# (4) ToMe m_c
norm_hidden_states = m_c(norm_hidden_states)
@@ -314,7 +300,8 @@ def forward(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
# (5) ToMe u_c
hidden_states = u_c(attn_output) + hidden_states
@@ -322,9 +309,7 @@ def forward(
norm_hidden_states = self.norm3(hidden_states)
if self.use_ada_layer_norm_zero:
- norm_hidden_states = (
- norm_hidden_states *
- (1 + scale_mlp[:, None]) + shift_mlp[:, None])
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
# (6) ToMe m_m
norm_hidden_states = m_m(norm_hidden_states)
@@ -353,8 +338,7 @@ def hook(module, args):
@patch_to([DiffusionPipeline, nn.Layer])
-def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline],
- only_return_self: bool=True):
+def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], only_return_self: bool = True):
"""Removes a patch from a ToMeXXX module if it was already patched."""
model_list = []
if isinstance(model_or_pipe, DiffusionPipeline):
@@ -385,15 +369,16 @@ def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline],
@patch_to([DiffusionPipeline, nn.Layer])
def apply_tome(
- model_or_pipe: Union[nn.Layer, DiffusionPipeline],
- ratio: float=0.5,
- max_downsample: int=1,
- sx: int=2,
- sy: int=2,
- use_rand: bool=True,
- merge_attn: bool=True,
- merge_crossattn: bool=False,
- merge_mlp: bool=False, ):
+ model_or_pipe: Union[nn.Layer, DiffusionPipeline],
+ ratio: float = 0.5,
+ max_downsample: int = 1,
+ sx: int = 2,
+ sy: int = 2,
+ use_rand: bool = True,
+ merge_attn: bool = True,
+ merge_crossattn: bool = False,
+ merge_mlp: bool = False,
+):
"""
Patches a stable diffusion model_or_pipe with ToMe.
Apply this to the highest level stable diffusion object (i.e., it should have a .unet).
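A minimal usage sketch of the ToMe patch defined in this file (the checkpoint id and prompt are placeholders; apply_tome and remove_tome are the patch_to methods shown above):
# Hypothetical example -- checkpoint id and prompt are placeholders.
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("some-org/some-sd-checkpoint")
pipe.apply_tome(ratio=0.5, sx=2, sy=2, use_rand=True)  # merge ~50% of self-attention tokens
image = pipe("a photo of an astronaut riding a horse").images[0]
pipe.remove_tome()  # restores the original BasicTransformerBlock forward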
diff --git a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py
index 6cc870cfb75ee..89a574fe97842 100644
--- a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py
+++ b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py
@@ -42,7 +42,7 @@ def convert_pt_to_pd(state, dtype):
if val.ndim == 2:
val = val.T
if val.ndim == 0:
- val = val.reshape((1, ))
+ val = val.reshape((1,))
new_state[b] = val.cast(dtype)
else:
print(f"We find {a} not in state_dict and we will continue!")
@@ -87,12 +87,10 @@ def save_lora(pipe_or_module, save_directory, WEIGHT_NAME=None):
if is_torch_available():
save_function = safetensors.torch.save_file
- outdict = convert_state_dict(
- convert_pd_to_pt(outdict), framework="torch")
+ outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="torch")
else:
save_function = safetensors.numpy.save_file
- outdict = convert_state_dict(
- convert_pd_to_pt(outdict), framework="numpy")
+ outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="numpy")
save_function(outdict, os.path.join(save_directory, WEIGHT_NAME))
del outdict
@@ -116,15 +114,16 @@ def set_lora(self):
@patch_to([DiffusionPipeline, nn.Layer])
def apply_lora(
- pipe_or_module,
- lora_weight_or_path=None,
- rank=4,
- alpha=None,
- multiplier=1.0,
- text_encoder_target_replace_modules=["TransformerEncoderLayer"],
- unet_target_replace_modules=["Transformer2DModel", "Attention"],
- enable_lora=True,
- **kwargs, ):
+ pipe_or_module,
+ lora_weight_or_path=None,
+ rank=4,
+ alpha=None,
+ multiplier=1.0,
+ text_encoder_target_replace_modules=["TransformerEncoderLayer"],
+ unet_target_replace_modules=["Transformer2DModel", "Attention"],
+ enable_lora=True,
+ **kwargs,
+):
resume_download = kwargs.pop("resume_download", False)
force_download = kwargs.pop("force_download", False)
paddle_dtype = kwargs.pop("paddle_dtype", None)
@@ -143,17 +142,16 @@ def apply_lora(
lora_weight_or_path = str(lora_weight_or_path)
if os.path.isfile(lora_weight_or_path):
lora_weight_or_path = lora_weight_or_path
- elif lora_weight_or_path.startswith(
- "http://") or lora_weight_or_path.startswith("https://"):
+ elif lora_weight_or_path.startswith("http://") or lora_weight_or_path.startswith("https://"):
lora_weight_or_path = ppdiffusers_url_download(
lora_weight_or_path,
cache_dir=cache_dir,
resume_download=resume_download,
- force_download=force_download, )
+ force_download=force_download,
+ )
else:
raise EnvironmentError(f"Please check your {lora_weight_or_path}.")
- lora_weight_or_path = convert_pt_to_pd(
- smart_load(lora_weight_or_path), paddle_dtype)
+ lora_weight_or_path = convert_pt_to_pd(smart_load(lora_weight_or_path), paddle_dtype)
mayberanklist = []
maybealphalist = []
@@ -176,67 +174,64 @@ def apply_lora(
if len(mayberanklist) > 20:
break
if len(set(mayberanklist)) > 1:
- print(
- f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}."
- )
+ print(f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}.")
else:
rank = mayberanklist[0]
print(f"|---------------Currently, rank is {rank}!")
if len(set(maybealphalist)) > 1:
- print(
- f"Can't guess alpha! Here are the rank list {maybealphalist}. We will use default alpha {alpha}"
- )
+ print(f"Can't guess alpha! Here are the rank list {maybealphalist}. We will use default alpha {alpha}")
else:
alpha = maybealphalist[0]
print(f"|---------------Currently, alpha is {alpha}!")
waitlist = []
if isinstance(pipe_or_module, nn.Layer):
- waitlist.append((
- pipe_or_module,
- text_encoder_target_replace_modules + unet_target_replace_modules,
- ))
+ waitlist.append(
+ (
+ pipe_or_module,
+ text_encoder_target_replace_modules + unet_target_replace_modules,
+ )
+ )
else:
if hasattr(pipe_or_module, "text_encoder"):
- waitlist.append((pipe_or_module.text_encoder,
- text_encoder_target_replace_modules))
+ waitlist.append((pipe_or_module.text_encoder, text_encoder_target_replace_modules))
if hasattr(pipe_or_module, "unet"):
waitlist.append((pipe_or_module.unet, unet_target_replace_modules))
lora_modules = {}
for each_module, target_replace_modules in waitlist:
for name1, module in each_module.named_sublayers(include_self=True):
if module.__class__.__name__ in target_replace_modules:
- for name2, child_module in module.named_sublayers(
- include_self=True):
+ for name2, child_module in module.named_sublayers(include_self=True):
if not getattr(child_module, "is_lora_linear", False) and (
- child_module.__class__.__name__ == "Linear" or
- (child_module.__class__.__name__ == "Conv2D" and
- list(child_module._kernel_size) == [1, 1])):
+ child_module.__class__.__name__ == "Linear"
+ or (child_module.__class__.__name__ == "Conv2D" and list(child_module._kernel_size) == [1, 1])
+ ):
# if we apply lora multi
- if hasattr(child_module,
- "merged") and child_module.merged:
+ if hasattr(child_module, "merged") and child_module.merged:
with paddle.no_grad():
if child_module.is_conv:
new_weight = (
- child_module.weight.squeeze([-1, -2]) -
- child_module.lora_up.weight.squeeze(
- [-1, -2])
- @child_module.lora_down.weight.squeeze(
- [-1, -2]) * child_module.multiplier
- * child_module.scale).unsqueeze(
- [-1, -2])
+ child_module.weight.squeeze([-1, -2])
+ - child_module.lora_up.weight.squeeze([-1, -2])
+ @ child_module.lora_down.weight.squeeze([-1, -2])
+ * child_module.multiplier
+ * child_module.scale
+ ).unsqueeze([-1, -2])
else:
- new_weight = (child_module.weight -
- child_module.lora_down.weight
- @child_module.lora_up.weight *
- child_module.multiplier *
- child_module.scale)
+ new_weight = (
+ child_module.weight
+ - child_module.lora_down.weight
+ @ child_module.lora_up.weight
+ * child_module.multiplier
+ * child_module.scale
+ )
child_module.weight.set_value(new_weight)
in_features, out_features = (
child_module.weight.shape[0],
- child_module.weight.shape[1], )
+ child_module.weight.shape[1],
+ )
child_module.is_conv = False
child_module.merged = False
@@ -250,15 +245,11 @@ def apply_lora(
)
if child_module.is_conv:
- child_module.lora_down = nn.Conv2D(
- in_features, rank, [1, 1], bias_attr=False)
- child_module.lora_up = nn.Conv2D(
- rank, out_features, [1, 1], bias_attr=False)
+ child_module.lora_down = nn.Conv2D(in_features, rank, [1, 1], bias_attr=False)
+ child_module.lora_up = nn.Conv2D(rank, out_features, [1, 1], bias_attr=False)
else:
- child_module.lora_down = nn.Linear(
- in_features, rank, bias_attr=False)
- child_module.lora_up = nn.Linear(
- rank, out_features, bias_attr=False)
+ child_module.lora_down = nn.Linear(in_features, rank, bias_attr=False)
+ child_module.lora_up = nn.Linear(rank, out_features, bias_attr=False)
child_module.lora_down.is_lora_linear = True
child_module.lora_up.is_lora_linear = True
child_module.rank = rank
@@ -268,13 +259,10 @@ def apply_lora(
alpha = alpha.detach().cast("float32").numpy()
alpha = rank if alpha is None or alpha == 0 else alpha
child_module.scale = alpha / child_module.rank
- child_module.register_buffer(
- "alpha", paddle.to_tensor(
- alpha, dtype="float32"))
+ child_module.register_buffer("alpha", paddle.to_tensor(alpha, dtype="float32"))
# same as microsoft's
- kaiming_uniform_(
- child_module.lora_down.weight, a=math.sqrt(5))
+ kaiming_uniform_(child_module.lora_down.weight, a=math.sqrt(5))
zeros_(child_module.lora_up.weight)
child_module.multiplier = multiplier
@@ -287,44 +275,47 @@ def forward_lora(self, x):
with paddle.no_grad():
if self.is_conv:
new_weight = (
- self.weight.squeeze([-1, -2]) -
- self.lora_up.weight.squeeze(
- [-1, -2])
- @self.lora_down.weight.squeeze(
- [-1, -2]) * self.multiplier
- * self.scale).unsqueeze(
- [-1, -2])
+ self.weight.squeeze([-1, -2])
+ - self.lora_up.weight.squeeze([-1, -2])
+ @ self.lora_down.weight.squeeze([-1, -2])
+ * self.multiplier
+ * self.scale
+ ).unsqueeze([-1, -2])
else:
new_weight = (
- self.weight -
- self.lora_down.weight
- @self.lora_up.weight *
- self.multiplier * self.scale)
+ self.weight
+ - self.lora_down.weight
+ @ self.lora_up.weight
+ * self.multiplier
+ * self.scale
+ )
self.weight.set_value(new_weight)
self.merged = False
if not self.enable_lora:
return self.raw_forward(x)
- return (self.raw_forward(x) +
- self.lora_up(self.lora_down(x)) *
- self.multiplier * self.scale)
+ return (
+ self.raw_forward(x)
+ + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
+ )
else:
if self.enable_lora and not self.merged:
with paddle.no_grad():
if self.is_conv:
new_weight = (
- self.weight.squeeze([-1, -2]) +
- self.lora_up.weight.squeeze(
- [-1, -2])
- @self.lora_down.weight.squeeze(
- [-1, -2]) * self.multiplier
- * self.scale).unsqueeze(
- [-1, -2])
+ self.weight.squeeze([-1, -2])
+ + self.lora_up.weight.squeeze([-1, -2])
+ @ self.lora_down.weight.squeeze([-1, -2])
+ * self.multiplier
+ * self.scale
+ ).unsqueeze([-1, -2])
else:
new_weight = (
- self.weight +
- self.lora_down.weight
- @self.lora_up.weight *
- self.multiplier * self.scale)
+ self.weight
+ + self.lora_down.weight
+ @ self.lora_up.weight
+ * self.multiplier
+ * self.scale
+ )
self.weight.set_value(new_weight)
self.merged = True
@@ -332,25 +323,25 @@ def forward_lora(self, x):
with paddle.no_grad():
if self.is_conv:
new_weight = (
- self.weight.squeeze([-1, -2]) -
- self.lora_up.weight.squeeze(
- [-1, -2])
- @self.lora_down.weight.squeeze(
- [-1, -2]) * self.multiplier
- * self.scale).unsqueeze(
- [-1, -2])
+ self.weight.squeeze([-1, -2])
+ - self.lora_up.weight.squeeze([-1, -2])
+ @ self.lora_down.weight.squeeze([-1, -2])
+ * self.multiplier
+ * self.scale
+ ).unsqueeze([-1, -2])
else:
new_weight = (
- self.weight -
- self.lora_down.weight
- @self.lora_up.weight *
- self.multiplier * self.scale)
+ self.weight
+ - self.lora_down.weight
+ @ self.lora_up.weight
+ * self.multiplier
+ * self.scale
+ )
self.weight.set_value(new_weight)
self.merged = False
return self.raw_forward(x)
- child_module.forward = MethodType(forward_lora,
- child_module)
+ child_module.forward = MethodType(forward_lora, child_module)
child_module.lora_down.training = child_module.training
child_module.lora_up.training = child_module.training
child_module.to(dtype=paddle_dtype)
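A minimal usage sketch of the LoRA patch defined in this file (the weight path and multiplier are placeholders; rank and alpha are usually guessed from the loaded state dict, as the code above shows):
# Hypothetical example -- the LoRA path and prompt are placeholders.
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("some-org/some-sd-checkpoint")
# Accepts a local file or an http(s) URL; keys are converted by convert_pt_to_pd
# before lora_down/lora_up layers are attached to Linear and 1x1 Conv2D modules.
pipe.apply_lora("./loras/some_style.safetensors", multiplier=0.8)
image = pipe("a portrait, some_style style").images[0]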
diff --git a/ppdiffusers/ppdiffusers/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipeline_utils.py
index 4ddfca40ac392..48a455def8412 100644
--- a/ppdiffusers/ppdiffusers/pipeline_utils.py
+++ b/ppdiffusers/ppdiffusers/pipeline_utils.py
@@ -18,4 +18,4 @@
# It only exists so that temporarely `from diffusers.pipelines import DiffusionPipeline` works
from .pipelines import ImagePipelineOutput # noqa: F401
-from .pipelines import DiffusionPipeline, TextPipelineOutput
+from .pipelines import DiffusionPipeline, TextPipelineOutput # noqa: F401
diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py
index 3c7b73e5fcf47..db10dd5dccfe7 100644
--- a/ppdiffusers/ppdiffusers/pipelines/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/__init__.py
@@ -13,10 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from ..utils import (OptionalDependencyNotAvailable, is_einops_available,
- is_fastdeploy_available, is_k_diffusion_available,
- is_librosa_available, is_note_seq_available,
- is_paddle_available, is_paddlenlp_available)
+from ..utils import (
+ OptionalDependencyNotAvailable,
+ is_einops_available,
+ is_fastdeploy_available,
+ is_k_diffusion_available,
+ is_librosa_available,
+ is_note_seq_available,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
try:
if not is_paddle_available():
@@ -30,8 +36,12 @@
from .dit import DiTPipeline
from .latent_diffusion import LDMSuperResolutionPipeline
from .latent_diffusion_uncond import LDMPipeline
- from .pipeline_utils import (AudioPipelineOutput, DiffusionPipeline,
- ImagePipelineOutput, TextPipelineOutput)
+ from .pipeline_utils import (
+ AudioPipelineOutput,
+ DiffusionPipeline,
+ ImagePipelineOutput,
+ TextPipelineOutput,
+ )
from .pndm import PNDMPipeline
from .repaint import RePaintPipeline
from .score_sde_ve import ScoreSdeVePipeline
@@ -51,38 +61,52 @@
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403
else:
- from .alt_diffusion import (AltDiffusionImg2ImgPipeline,
- AltDiffusionPipeline)
+ from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline
from .audioldm import AudioLDMPipeline
from .deepfloyd_if import (
- IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline,
- IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, IFPipeline,
- IFSuperResolutionPipeline)
+ IFImg2ImgPipeline,
+ IFImg2ImgSuperResolutionPipeline,
+ IFInpaintingPipeline,
+ IFInpaintingSuperResolutionPipeline,
+ IFPipeline,
+ IFSuperResolutionPipeline,
+ )
from .latent_diffusion import LDMTextToImagePipeline
from .lvdm import LVDMTextToVideoPipeline, LVDMUncondPipeline
from .paint_by_example import PaintByExamplePipeline
from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
from .stable_diffusion import (
- CycleDiffusionPipeline, StableDiffusionAdapterPipeline,
+ CycleDiffusionPipeline,
+ StableDiffusionAdapterPipeline,
StableDiffusionAttendAndExcitePipeline,
- StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline,
- StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline,
- StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy,
+ StableDiffusionControlNetPipeline,
+ StableDiffusionDepth2ImgPipeline,
+ StableDiffusionImageVariationPipeline,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipeline,
+ StableDiffusionInpaintPipelineLegacy,
StableDiffusionInstructPix2PixPipeline,
- StableDiffusionLatentUpscalePipeline, StableDiffusionMegaPipeline,
- StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline,
- StableDiffusionPipeline, StableDiffusionPipelineAllinOne,
- StableDiffusionPix2PixZeroPipeline, StableDiffusionSAGPipeline,
- StableDiffusionUpscalePipeline, StableUnCLIPImg2ImgPipeline,
- StableUnCLIPPipeline)
+ StableDiffusionLatentUpscalePipeline,
+ StableDiffusionMegaPipeline,
+ StableDiffusionModelEditingPipeline,
+ StableDiffusionPanoramaPipeline,
+ StableDiffusionPipeline,
+ StableDiffusionPipelineAllinOne,
+ StableDiffusionPix2PixZeroPipeline,
+ StableDiffusionSAGPipeline,
+ StableDiffusionUpscalePipeline,
+ StableUnCLIPImg2ImgPipeline,
+ StableUnCLIPPipeline,
+ )
from .stable_diffusion_safe import StableDiffusionPipelineSafe
- from .text_to_video_synthesis import (TextToVideoSDPipeline,
- TextToVideoZeroPipeline)
+ from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline
from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
- from .versatile_diffusion import (VersatileDiffusionDualGuidedPipeline,
- VersatileDiffusionImageVariationPipeline,
- VersatileDiffusionPipeline,
- VersatileDiffusionTextToImagePipeline)
+ from .versatile_diffusion import (
+ VersatileDiffusionDualGuidedPipeline,
+ VersatileDiffusionImageVariationPipeline,
+ VersatileDiffusionPipeline,
+ VersatileDiffusionTextToImagePipeline,
+ )
from .vq_diffusion import VQDiffusionPipeline
try:
@@ -91,12 +115,13 @@
except OptionalDependencyNotAvailable:
from ..utils.dummy_fastdeploy_objects import * # noqa F403
else:
- from .fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+ from .fastdeploy_utils import (
+ FastDeployDiffusionPipelineMixin,
+ FastDeployRuntimeModel,
+ )
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_fastdeploy_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403
@@ -110,11 +135,11 @@
FastDeployStableDiffusionInpaintPipelineLegacy,
FastDeployStableDiffusionMegaPipeline,
FastDeployStableDiffusionPipeline,
- FastDeployStableDiffusionUpscalePipeline)
+ FastDeployStableDiffusionUpscalePipeline,
+ )
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_k_diffusion_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403
@@ -122,8 +147,7 @@
from .stable_diffusion import StableDiffusionKDiffusionPipeline
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_einops_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403
@@ -131,11 +155,9 @@
from .unidiffuser import UniDiffuserPipeline
try:
- if not (is_paddle_available() and is_paddlenlp_available() and
- is_note_seq_available()):
+ if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403
else:
- from .spectrogram_diffusion import (MidiProcessor,
- SpectrogramDiffusionPipeline)
+ from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline
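The optional-dependency guards in this `__init__.py` follow one pattern throughout: probe the backends, raise `OptionalDependencyNotAvailable` if anything is missing, and fall back to dummy placeholder objects otherwise. The hunks above only rewrap that pattern. Below is a self-contained sketch of the same idea; `OptionalDependencyNotAvailable` and `is_paddle_available` are re-implemented here as illustrative stand-ins for the real ppdiffusers utilities, not the library's own definitions.

class OptionalDependencyNotAvailable(Exception):
    """Stand-in for the ppdiffusers exception of the same name (illustration only)."""


def is_paddle_available() -> bool:
    # stand-in for the real availability check used in the hunks above
    try:
        import paddle  # noqa: F401

        return True
    except ImportError:
        return False


try:
    if not is_paddle_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    print("paddle missing: the dummy placeholder objects would be imported instead")
else:
    print("paddle present: the real pipeline classes are imported")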
diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py
index 087da16f84c37..70cd40778b488 100644
--- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py
+++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py
@@ -24,9 +24,7 @@
from paddlenlp.transformers.model_outputs import ModelOutput
-def create_position_ids_from_input_ids(input_ids,
- padding_idx,
- past_key_values_length=0):
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
@@ -38,8 +36,7 @@ def create_position_ids_from_input_ids(input_ids,
"""
# The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
mask = (input_ids != padding_idx).cast("int64")
- incremental_indices = (paddle.cumsum(
- mask, axis=1) + past_key_values_length) * mask
+ incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask
return incremental_indices + padding_idx
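The collapsed `cumsum` expression keeps the original computation intact. As a quick stand-alone check of what `create_position_ids_from_input_ids` returns for a toy batch (values chosen purely for illustration, `past_key_values_length=0`):

import paddle

input_ids = paddle.to_tensor([[0, 50, 27, 1, 1]])        # toy ids; 1 is padding_idx here
mask = (input_ids != 1).cast("int64")                     # [[1, 1, 1, 0, 0]]
incremental_indices = paddle.cumsum(mask, axis=1) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental_indices + 1                    # [[2, 3, 4, 1, 1]]; padding keeps padding_idx
print(position_ids.numpy())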
@@ -76,21 +73,23 @@ class RobertaSeriesConfig(XLMRobertaConfig):
model_type = "roberta"
def __init__(
- self,
- pad_token_id=1,
- bos_token_id=0,
- eos_token_id=2,
- project_dim=512,
- pooler_fn="cls",
- learn_encoder=False,
- use_attention_mask=True,
- **kwargs, ):
+ self,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ project_dim=512,
+ pooler_fn="cls",
+ learn_encoder=False,
+ use_attention_mask=True,
+ **kwargs,
+ ):
kwargs["return_dict"] = kwargs.pop("return_dict", True)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
- **kwargs, )
+ **kwargs,
+ )
self.project_dim = project_dim
self.pooler_fn = pooler_fn
self.learn_encoder = learn_encoder
@@ -99,9 +98,7 @@ def __init__(
class RobertaSeriesModelWithTransformation(RobertaPretrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"]
- _keys_to_ignore_on_load_missing = [
- r"position_ids", r"predictions.decoder.bias"
- ]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
base_model_prefix = "roberta"
config_class = RobertaSeriesConfig
@@ -111,39 +108,35 @@ def __init__(self, config: RobertaSeriesConfig):
# must reset _padding_idx
self.roberta.embeddings.word_embeddings._padding_idx = None
self.transformation = nn.Linear(config.hidden_size, config.project_dim)
- self.has_pre_transformation = getattr(config, "has_pre_transformation",
- False)
+ self.has_pre_transformation = getattr(config, "has_pre_transformation", False)
if self.has_pre_transformation:
- self.transformation_pre = nn.Linear(config.hidden_size,
- config.project_dim)
- self.pre_LN = nn.LayerNorm(
- config.hidden_size, eps=config.layer_norm_eps)
+ self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim)
+ self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.init_weights()
def forward(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- token_type_ids: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- return_dict: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None, ):
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ token_type_ids: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ ):
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if position_ids is None:
- position_ids = create_position_ids_from_input_ids(
- input_ids, self.config.pad_token_id)
+ position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
outputs = self.base_model(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
output_attentions=output_attentions,
- output_hidden_states=True
- if self.has_pre_transformation else output_hidden_states,
- return_dict=return_dict, )
+ output_hidden_states=True if self.has_pre_transformation else output_hidden_states,
+ return_dict=return_dict,
+ )
if self.has_pre_transformation:
sequence_output2 = outputs["hidden_states"][-2]
@@ -154,11 +147,13 @@ def forward(
projection_state=projection_state2,
last_hidden_state=outputs.last_hidden_state,
hidden_states=outputs.hidden_states,
- attentions=outputs.attentions, )
+ attentions=outputs.attentions,
+ )
else:
projection_state = self.transformation(outputs.last_hidden_state)
return TransformationModelOutput(
projection_state=projection_state,
last_hidden_state=outputs.last_hidden_state,
hidden_states=outputs.hidden_states,
- attentions=outputs.attentions, )
+ attentions=outputs.attentions,
+ )
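The behaviour of `RobertaSeriesModelWithTransformation` is unchanged by the rewrap: the encoder's hidden states go through a single `nn.Linear` to `project_dim`, and that projection is what the Alt Diffusion pipelines read as text embeddings. A shape-level sketch with an assumed `hidden_size` of 768 (that value is illustrative; only the `project_dim=512` default comes from the config above):

import paddle
import paddle.nn as nn

hidden_size, project_dim = 768, 512            # hidden_size assumed for illustration
transformation = nn.Linear(hidden_size, project_dim)
last_hidden_state = paddle.randn([2, 77, hidden_size])
projection_state = transformation(last_hidden_state)
print(projection_state.shape)                  # [2, 77, 512] -- consumed downstream as prompt_embeds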
diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index a610e38dbd5ac..0dee82d33981b 100644
--- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -24,8 +24,7 @@
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
@@ -85,37 +84,33 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: RobertaSeriesModelWithTransformation,
- tokenizer: XLMRobertaTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: RobertaSeriesModelWithTransformation,
+ tokenizer: XLMRobertaTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -123,11 +118,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -148,12 +139,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -164,12 +153,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -181,18 +167,20 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -232,29 +220,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because XLM-Roberta can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -262,8 +252,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -273,21 +262,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
            # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -295,46 +285,42 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -353,53 +339,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -412,22 +394,25 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -446,25 +431,25 @@ def prepare_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -545,7 +530,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -567,7 +553,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -582,43 +569,38 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
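The one-line guidance update above is the usual classifier-free guidance extrapolation; only the wrapping changed. A tiny numeric check of that formula in isolation (the values are made up for illustration):

import paddle

guidance_scale = 7.5
noise_pred_uncond = paddle.to_tensor([0.10])
noise_pred_text = paddle.to_tensor([0.30])
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.item())  # 0.10 + 7.5 * (0.30 - 0.10) ≈ 1.6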
@@ -631,8 +613,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
@@ -641,11 +622,9 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return AltDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
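None of the rewrapping in `pipeline_alt_diffusion.py` touches the call signature, so existing user code keeps working. A minimal usage sketch, assuming `AltDiffusionPipeline` is exported at the package root and using "BAAI/AltDiffusion" purely as an example checkpoint id:

from ppdiffusers import AltDiffusionPipeline

# "BAAI/AltDiffusion" is only an illustrative checkpoint id; substitute the weights you actually have
pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
result = pipe(
    prompt="a photo of an astronaut riding a horse",
    num_inference_steps=50,
    guidance_scale=7.5,
)
result.images[0].save("alt_diffusion_sample.png")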
diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 313c4e5e2eca1..232d79d8da99a 100644
--- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -27,8 +27,13 @@
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (PIL_INTERPOLATION, deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import (
+ PIL_INTERPOLATION,
+ deprecate,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
@@ -74,11 +79,7 @@ def preprocess(image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -90,8 +91,7 @@ def preprocess(image):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker
-class AltDiffusionImg2ImgPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-guided image to image generation using Alt Diffusion.
@@ -128,37 +128,33 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: RobertaSeriesModelWithTransformation,
- tokenizer: XLMRobertaTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: RobertaSeriesModelWithTransformation,
+ tokenizer: XLMRobertaTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -166,11 +162,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -191,12 +183,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -207,12 +197,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -224,21 +211,23 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(
- vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.register_to_config(
- requires_safety_checker=requires_safety_checker, )
+ requires_safety_checker=requires_safety_checker,
+ )
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -278,29 +267,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because XLM-Roberta can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -308,8 +299,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -319,21 +309,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
            # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -341,36 +332,33 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -379,17 +367,14 @@ def run_safety_checker(self, image, dtype):
has_nsfw_concept = None
else:
if paddle.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(
- image, output_type="pil")
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
- feature_extractor_input = self.image_processor.numpy_to_pil(
- image)
- safety_checker_input = self.feature_extractor(
- feature_extractor_input, return_tensors="pd")
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
images=image,
- clip_input=paddle.cast(safety_checker_input.pixel_values,
- dtype), )
+ clip_input=paddle.cast(safety_checker_input.pixel_values, dtype),
+ )
return image, has_nsfw_concept
def decode_latents(self, latents):
@@ -404,51 +389,48 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- strength,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -461,25 +443,19 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
@@ -496,8 +472,7 @@ def prepare_latents(self,
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
@@ -505,8 +480,7 @@ def prepare_latents(self,
init_latents = self.vae.config.scaling_factor * init_latents
- if (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] == 0):
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
@@ -518,12 +492,11 @@ def prepare_latents(self,
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_image_per_prompt = batch_size // init_latents.shape[0]
- init_latents = paddle.concat(
- [init_latents] * additional_image_per_prompt, axis=0)
- elif (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] != 0):
+ init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0)
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
@@ -542,24 +515,24 @@ def prepare_latents(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -636,7 +609,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -657,17 +631,16 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Preprocess image
image = self.image_processor.preprocess(image)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- (batch_size * num_images_per_prompt, ))
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile((batch_size * num_images_per_prompt,))
# 6. Prepare latent variables
latents = self.prepare_latents(
@@ -676,51 +649,45 @@ def __call__(
batch_size,
num_images_per_prompt,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
else:
image = latents
has_nsfw_concept = None
@@ -730,11 +697,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return AltDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
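The img2img reformat likewise leaves the `strength` semantics alone: it still decides how much of the schedule is actually run. Re-deriving `get_timesteps` with the defaults shown above (plain arithmetic, not a new API):

num_inference_steps, strength = 50, 0.8       # the signature defaults shown above
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 40
t_start = max(num_inference_steps - init_timestep, 0)                          # 10
print(init_timestep, t_start)  # 40 10 -> only the last 40 of 50 scheduled timesteps are denoised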
diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py
index ffe2c5bad7456..ca098c706711c 100644
--- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py
+++ b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py
@@ -25,7 +25,9 @@
_import_error = ""
except Exception as e:
_librosa_can_be_imported = False
- _import_error = f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it."
+ _import_error = (
+ f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it."
+ )
from PIL import Image # noqa: E402
@@ -46,14 +48,15 @@ class Mel(ConfigMixin, SchedulerMixin):
@register_to_config
def __init__(
- self,
- x_res: int=256,
- y_res: int=256,
- sample_rate: int=22050,
- n_fft: int=2048,
- hop_length: int=512,
- top_db: int=80,
- n_iter: int=32, ):
+ self,
+ x_res: int = 256,
+ y_res: int = 256,
+ sample_rate: int = 22050,
+ n_fft: int = 2048,
+ hop_length: int = 512,
+ top_db: int = 80,
+ n_iter: int = 32,
+ ):
self.hop_length = hop_length
self.sr = sample_rate
self.n_fft = n_fft
@@ -77,7 +80,7 @@ def set_resolution(self, x_res: int, y_res: int):
self.n_mels = self.y_res
self.slice_size = self.x_res * self.hop_length - 1
- def load_audio(self, audio_file: str=None, raw_audio: np.ndarray=None):
+ def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
"""Load audio.
Args:
@@ -91,10 +94,12 @@ def load_audio(self, audio_file: str=None, raw_audio: np.ndarray=None):
# Pad with silence if necessary.
if len(self.audio) < self.x_res * self.hop_length:
- self.audio = np.concatenate([
- self.audio,
- np.zeros((self.x_res * self.hop_length - len(self.audio), )),
- ])
+ self.audio = np.concatenate(
+ [
+ self.audio,
+ np.zeros((self.x_res * self.hop_length - len(self.audio),)),
+ ]
+ )
def get_number_of_slices(self) -> int:
"""Get number of slices in audio.
@@ -104,7 +109,7 @@ def get_number_of_slices(self) -> int:
"""
return len(self.audio) // self.slice_size
- def get_audio_slice(self, slice: int=0) -> np.ndarray:
+ def get_audio_slice(self, slice: int = 0) -> np.ndarray:
"""Get slice of audio.
Args:
@@ -113,7 +118,7 @@ def get_audio_slice(self, slice: int=0) -> np.ndarray:
Returns:
`np.ndarray`: audio as numpy array
"""
- return self.audio[self.slice_size * slice:self.slice_size * (slice + 1)]
+ return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]
def get_sample_rate(self) -> int:
"""Get sample rate:
@@ -137,11 +142,10 @@ def audio_slice_to_image(self, slice: int) -> Image.Image:
sr=self.sr,
n_fft=self.n_fft,
hop_length=self.hop_length,
- n_mels=self.n_mels, )
+ n_mels=self.n_mels,
+ )
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
- bytedata = ((
- (log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5
- ).astype(np.uint8)
+ bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
image = Image.fromarray(bytedata)
return image
@@ -154,8 +158,7 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray:
Returns:
audio (`np.ndarray`): raw audio
"""
- bytedata = np.frombuffer(
- image.tobytes(), dtype="uint8").reshape((image.height, image.width))
+ bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
S = librosa.db_to_power(log_S)
audio = librosa.feature.inverse.mel_to_audio(
@@ -163,5 +166,6 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray:
sr=self.sr,
n_fft=self.n_fft,
hop_length=self.hop_length,
- n_iter=self.n_iter, )
+ n_iter=self.n_iter,
+ )
return audio
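The rewrapped `Mel` helpers keep the same 8-bit quantisation: decibel values in `[-top_db, 0]` are mapped linearly onto `[0, 255]` in the spectrogram image, and `image_to_audio` inverts that mapping before librosa's mel inversion (`n_iter` Griffin-Lim iterations). A small numeric check of the forward mapping, with illustrative dB values:

import numpy as np

top_db = 80
log_S = np.array([-80.0, -40.0, 0.0])  # illustrative dB values after power_to_db
bytedata = (((log_S + top_db) * 255 / top_db).clip(0, 255) + 0.5).astype(np.uint8)
print(bytedata)                        # [  0 128 255]
# image_to_audio reverses this: bytedata * top_db / 255 - top_db recovers roughly the original dB values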
diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
index 50b57cd936dac..581729f066b72 100644
--- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
@@ -23,8 +23,12 @@
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import DDIMScheduler, DDPMScheduler
from ...utils import randn_tensor
-from ..pipeline_utils import (AudioPipelineOutput, BaseOutput,
- DiffusionPipeline, ImagePipelineOutput)
+from ..pipeline_utils import (
+ AudioPipelineOutput,
+ BaseOutput,
+ DiffusionPipeline,
+ ImagePipelineOutput,
+)
from .mel import Mel
@@ -43,14 +47,14 @@ class AudioDiffusionPipeline(DiffusionPipeline):
_optional_components = ["vqvae"]
def __init__(
- self,
- vqvae: AutoencoderKL,
- unet: UNet2DConditionModel,
- mel: Mel,
- scheduler: Union[DDIMScheduler, DDPMScheduler], ):
+ self,
+ vqvae: AutoencoderKL,
+ unet: UNet2DConditionModel,
+ mel: Mel,
+ scheduler: Union[DDIMScheduler, DDPMScheduler],
+ ):
super().__init__()
- self.register_modules(
- unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae)
+ self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae)
def get_input_dims(self) -> Tuple:
"""Returns dimension of input image
@@ -62,8 +66,9 @@ def get_input_dims(self) -> Tuple:
# For backwards compatibility
sample_size = (
(input_module.config.sample_size, input_module.config.sample_size)
- if type(input_module.config.sample_size) == int else
- input_module.config.sample_size)
+ if type(input_module.config.sample_size) == int
+ else input_module.config.sample_size
+ )
return sample_size
def get_default_steps(self) -> int:
@@ -76,23 +81,25 @@ def get_default_steps(self) -> int:
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- audio_file: str=None,
- raw_audio: np.ndarray=None,
- slice: int=0,
- start_step: int=0,
- steps: int=None,
- generator: paddle.Generator=None,
- mask_start_secs: float=0,
- mask_end_secs: float=0,
- step_generator: paddle.Generator=None,
- eta: float=0,
- noise: paddle.Tensor=None,
- encoding: paddle.Tensor=None,
- return_dict=True, ) -> Union[Union[
- AudioPipelineOutput, ImagePipelineOutput], Tuple[List[
- Image.Image], Tuple[int, List[np.ndarray]]], ]:
+ self,
+ batch_size: int = 1,
+ audio_file: str = None,
+ raw_audio: np.ndarray = None,
+ slice: int = 0,
+ start_step: int = 0,
+ steps: int = None,
+ generator: paddle.Generator = None,
+ mask_start_secs: float = 0,
+ mask_end_secs: float = 0,
+ step_generator: paddle.Generator = None,
+ eta: float = 0,
+ noise: paddle.Tensor = None,
+ encoding: paddle.Tensor = None,
+ return_dict=True,
+ ) -> Union[
+ Union[AudioPipelineOutput, ImagePipelineOutput],
+ Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]],
+ ]:
"""Generate random mel spectrogram from audio input and convert to audio.
Args:
@@ -122,7 +129,8 @@ def __call__(
if type(self.unet.config.sample_size) == int:
self.unet.config.sample_size = (
self.unet.config.sample_size,
- self.unet.config.sample_size, )
+ self.unet.config.sample_size,
+ )
input_dims = self.get_input_dims()
self.mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0])
if noise is None:
@@ -131,44 +139,43 @@ def __call__(
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size[0],
- self.unet.config.sample_size[1], ),
- generator=generator, )
+ self.unet.config.sample_size[1],
+ ),
+ generator=generator,
+ )
images = noise
mask = None
if audio_file is not None or raw_audio is not None:
self.mel.load_audio(audio_file, raw_audio)
input_image = self.mel.audio_slice_to_image(slice)
- input_image = np.frombuffer(
- input_image.tobytes(), dtype="uint8").reshape(
- (input_image.height, input_image.width))
+ input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape(
+ (input_image.height, input_image.width)
+ )
input_image = (input_image / 255) * 2 - 1
- input_images = paddle.to_tensor(
- input_image[np.newaxis, :, :], dtype=paddle.float32)
+ input_images = paddle.to_tensor(input_image[np.newaxis, :, :], dtype=paddle.float32)
if self.vqvae is not None:
- input_images = self.vqvae.encode(
- paddle.unsqueeze(input_images, 0)).latent_dist.sample(
- generator=generator)[0]
+ input_images = self.vqvae.encode(paddle.unsqueeze(input_images, 0)).latent_dist.sample(
+ generator=generator
+ )[0]
input_images = self.vqvae.config.scaling_factor * input_images
if start_step > 0:
- images[0, 0] = self.scheduler.add_noise(
- input_images, noise,
- self.scheduler.timesteps[start_step - 1])
+ images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1])
- pixels_per_second = (self.unet.config.sample_size[1] *
- self.mel.get_sample_rate() / self.mel.x_res /
- self.mel.hop_length)
+ pixels_per_second = (
+ self.unet.config.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length
+ )
mask_start = int(mask_start_secs * pixels_per_second)
mask_end = int(mask_end_secs * pixels_per_second)
mask = self.scheduler.add_noise(
input_images,
noise,
- paddle.to_tensor(self.scheduler.timesteps[start_step:]), )
+ paddle.to_tensor(self.scheduler.timesteps[start_step:]),
+ )
- for step, t in enumerate(
- self.progress_bar(self.scheduler.timesteps[start_step:])):
+ for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])):
if isinstance(self.unet, UNet2DConditionModel):
model_output = self.unet(images, t, encoding)["sample"]
else:
@@ -180,13 +187,15 @@ def __call__(
timestep=t,
sample=images,
eta=eta,
- generator=step_generator, )["prev_sample"]
+ generator=step_generator,
+ )["prev_sample"]
else:
images = self.scheduler.step(
model_output=model_output,
timestep=t,
sample=images,
- generator=step_generator, )["prev_sample"]
+ generator=step_generator,
+ )["prev_sample"]
if mask is not None:
if mask_start > 0:
@@ -202,20 +211,20 @@ def __call__(
images = (images / 2 + 0.5).clip(0, 1)
images = images.transpose([0, 2, 3, 1]).cast("float32").numpy()
images = (images * 255).round().astype("uint8")
- images = list((Image.fromarray(_[:, :, 0]) for _ in images)
- if images.shape[3] == 1 else (Image.fromarray(
- _, mode="RGB").convert("L") for _ in images))
+ images = list(
+ (Image.fromarray(_[:, :, 0]) for _ in images)
+ if images.shape[3] == 1
+ else (Image.fromarray(_, mode="RGB").convert("L") for _ in images)
+ )
audios = [self.mel.image_to_audio(_) for _ in images]
if not return_dict:
return images, (self.mel.get_sample_rate(), audios)
- return BaseOutput(
- **AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]),
- **ImagePipelineOutput(images))
+ return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images))
@paddle.no_grad()
- def encode(self, images: List[Image.Image], steps: int=50) -> np.ndarray:
+ def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray:
"""Reverse step process: recover noisy image from generated image.
Args:
@@ -229,36 +238,30 @@ def encode(self, images: List[Image.Image], steps: int=50) -> np.ndarray:
# Only works with DDIM as this method is deterministic
assert isinstance(self.scheduler, DDIMScheduler)
self.scheduler.set_timesteps(steps)
- sample = np.array([
- np.frombuffer(
- image.tobytes(), dtype="uint8").reshape(
- (1, image.height, image.width)) for image in images
- ])
+ sample = np.array(
+ [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images]
+ )
sample = (sample / 255) * 2 - 1
sample = paddle.to_tensor(sample)
- for t in self.progress_bar(
- paddle.flip(self.scheduler.timesteps, (0, ))):
- prev_timestep = (t - self.scheduler.num_train_timesteps //
- self.scheduler.num_inference_steps)
+ for t in self.progress_bar(paddle.flip(self.scheduler.timesteps, (0,))):
+ prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps
alpha_prod_t = self.scheduler.alphas_cumprod[t]
- alpha_prod_t_prev = (self.scheduler.alphas_cumprod[prev_timestep]
- if prev_timestep >= 0 else
- self.scheduler.final_alpha_cumprod)
+ alpha_prod_t_prev = (
+ self.scheduler.alphas_cumprod[prev_timestep]
+ if prev_timestep >= 0
+ else self.scheduler.final_alpha_cumprod
+ )
beta_prod_t = 1 - alpha_prod_t
model_output = self.unet(sample, t)["sample"]
- pred_sample_direction = (1 - alpha_prod_t_prev)**(
- 0.5) * model_output
- sample = (sample - pred_sample_direction) * alpha_prod_t_prev**(
- -0.5)
- sample = (sample * alpha_prod_t**(0.5) + beta_prod_t**
- (0.5) * model_output)
+ pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output
+ sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5)
+ sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output
return sample
@staticmethod
- def slerp(x0: paddle.Tensor, x1: paddle.Tensor,
- alpha: float) -> paddle.Tensor:
+ def slerp(x0: paddle.Tensor, x1: paddle.Tensor, alpha: float) -> paddle.Tensor:
"""Spherical Linear intERPolation
Args:
@@ -270,8 +273,5 @@ def slerp(x0: paddle.Tensor, x1: paddle.Tensor,
`paddle.Tensor`: interpolated tensor
"""
- theta = acos(
- paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) /
- paddle.norm(x0) / paddle.norm(x1))
- return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(
- alpha * theta) * x1 / sin(theta)
+ theta = acos(paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) / paddle.norm(x0) / paddle.norm(x1))
+ return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta)
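For reference, the slerp helper reformatted above interpolates two noise tensors along the great circle spanned by them instead of mixing them linearly, which keeps the interpolant at a comparable norm. A NumPy sketch of the same math (the paddle version above remains the authoritative one):

import numpy as np

def slerp(x0: np.ndarray, x1: np.ndarray, alpha: float) -> np.ndarray:
    # angle between the two tensors, treated as flat vectors
    cos_theta = np.dot(x0.ravel(), x1.ravel()) / (np.linalg.norm(x0) * np.linalg.norm(x1))
    theta = np.arccos(cos_theta)
    return (np.sin((1 - alpha) * theta) * x0 + np.sin(alpha * theta) * x1) / np.sin(theta)

# interpolate halfway between two Gaussian noise samples
a, b = np.random.randn(4, 4), np.random.randn(4, 4)
mid = slerp(a, b, 0.5)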
diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py
index 87a892da4d792..4ab25efc20003 100644
--- a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py
@@ -12,12 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available,
- is_paddlenlp_available, is_paddlenlp_version)
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+ is_paddlenlp_version,
+)
try:
- if not (is_paddlenlp_available() and is_paddle_available() and
- is_paddlenlp_version(">=", "2.5.2")):
+ if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.5.2")):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_and_paddlenlp_objects import AudioLDMPipeline
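The try/except around the paddlenlp version check is ppdiffusers' optional-dependency guard: importing the subpackage always succeeds, and only constructing the pipeline fails when the extra is missing. A stripped-down sketch of that pattern (the stub class here is illustrative, not the generated dummy object the library actually uses):

try:
    import paddlenlp  # noqa: F401
    _HAS_PADDLENLP = True
except ImportError:
    _HAS_PADDLENLP = False

if not _HAS_PADDLENLP:
    class AudioLDMPipeline:  # placeholder stub, raises only when instantiated
        def __init__(self, *args, **kwargs):
            raise ImportError("AudioLDMPipeline requires paddlenlp>=2.5.2")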
diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py
index 0ba945ffdf429..8354d5e18ad8b 100644
--- a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py
+++ b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py
@@ -18,8 +18,11 @@
import numpy as np
import paddle
import paddle.nn.functional as F
-from paddlenlp.transformers import (ClapTextModelWithProjection,
- RobertaTokenizer, SpeechT5HifiGan)
+from paddlenlp.transformers import (
+ ClapTextModelWithProjection,
+ RobertaTokenizer,
+ SpeechT5HifiGan,
+)
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -65,13 +68,14 @@ class AudioLDMPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: ClapTextModelWithProjection,
- tokenizer: RobertaTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- vocoder: SpeechT5HifiGan, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: ClapTextModelWithProjection,
+ tokenizer: RobertaTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ vocoder: SpeechT5HifiGan,
+ ):
super().__init__()
self.register_modules(
vae=vae,
@@ -79,17 +83,19 @@ def __init__(
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
- vocoder=vocoder, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ vocoder=vocoder,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
def _encode_prompt(
- self,
- prompt,
- num_waveforms_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_waveforms_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
"""
Encodes the prompt into text encoder hidden states.
@@ -113,13 +119,13 @@ def _encode_prompt(
argument.
"""
if self.text_encoder.text_model.embeddings.token_type_ids.dtype not in [
- paddle.int16,
- paddle.int32,
- paddle.int64,
+ paddle.int16,
+ paddle.int32,
+ paddle.int64,
]:
self.text_encoder.text_model.embeddings.token_type_ids = (
- self.text_encoder.text_model.embeddings.token_type_ids.cast(
- "int32"))
+ self.text_encoder.text_model.embeddings.token_type_ids.cast("int32")
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -134,34 +140,35 @@ def _encode_prompt(
max_length=self.tokenizer.model_max_length,
return_attention_mask=True,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
attention_mask = text_inputs.attention_mask
untruncated_ids = self.tokenizer(
prompt,
padding="longest",
return_tensors="pd",
- return_attention_mask=True, ).input_ids
- if (untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and
- not paddle.equal_all(
- x=text_input_ids, y=untruncated_ids).item()):
+ return_attention_mask=True,
+ ).input_ids
+ if (
+ untruncated_ids.shape[-1] >= text_input_ids.shape[-1]
+ and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item()
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
f"The following part of your input was truncated because CLAP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
)
- prompt_embeds = self.text_encoder(
- text_input_ids.cast("int32"), attention_mask=attention_mask)
+ prompt_embeds = self.text_encoder(text_input_ids.cast("int32"), attention_mask=attention_mask)
prompt_embeds = prompt_embeds.text_embeds
# additional L_2 normalization over each hidden-state
prompt_embeds = F.normalize(x=prompt_embeds, axis=-1)
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
bs_embed, seq_len = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
- prompt_embeds = prompt_embeds.tile(
- repeat_times=[1, num_waveforms_per_prompt])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_waveforms_per_prompt, seq_len])
+ prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_waveforms_per_prompt, seq_len])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -187,33 +194,28 @@ def _encode_prompt(
max_length=max_length,
truncation=True,
return_tensors="pd",
- return_attention_mask=True, )
+ return_attention_mask=True,
+ )
uncond_input_ids = uncond_input.input_ids
attention_mask = uncond_input.attention_mask
- negative_prompt_embeds = self.text_encoder(
- uncond_input_ids.cast("int32"), attention_mask=attention_mask)
+ negative_prompt_embeds = self.text_encoder(uncond_input_ids.cast("int32"), attention_mask=attention_mask)
negative_prompt_embeds = negative_prompt_embeds.text_embeds
# additional L_2 normalization over each hidden-state
- negative_prompt_embeds = F.normalize(
- x=negative_prompt_embeds, axis=-1)
+ negative_prompt_embeds = F.normalize(x=negative_prompt_embeds, axis=-1)
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- repeat_times=[1, num_waveforms_per_prompt])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_waveforms_per_prompt, seq_len])
+ negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_waveforms_per_prompt, seq_len])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- x=[negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def decode_latents(self, latents):
@@ -235,28 +237,27 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- audio_length_in_s,
- vocoder_upsample_factor,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ audio_length_in_s,
+ vocoder_upsample_factor,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
if audio_length_in_s < min_audio_length_in_s:
raise ValueError(
@@ -266,8 +267,11 @@ def check_inputs(
raise ValueError(
f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of {self.vae_scale_factor}."
)
- if (callback_steps is None or callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ if (
+ callback_steps is None
+ or callback_steps is not None
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
)
@@ -279,11 +283,8 @@ def check_inputs(
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
@@ -294,18 +295,13 @@ def check_inputs(
f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}."
)
- def prepare_latents(self,
- batch_size,
- num_channels_latents,
- height,
- dtype,
- generator,
- latents=None):
+ def prepare_latents(self, batch_size, num_channels_latents, height, dtype, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- self.vocoder.config.model_in_dim // self.vae_scale_factor, )
+ self.vocoder.config.model_in_dim // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
@@ -322,24 +318,24 @@ def prepare_latents(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- audio_length_in_s: Optional[float]=None,
- num_inference_steps: int=10,
- guidance_scale: float=2.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_waveforms_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- output_type: Optional[str]="np", ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ audio_length_in_s: Optional[float] = None,
+ num_inference_steps: int = 10,
+ guidance_scale: float = 2.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_waveforms_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ output_type: Optional[str] = "np",
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -406,18 +402,13 @@ def __call__(
When returning a tuple, the first element is a list with the generated audios.
"""
# 0. Convert audio input length from seconds to spectrogram height
- vocoder_upsample_factor = (np.prod(self.vocoder.config.upsample_rates) /
- self.vocoder.config.sampling_rate)
+ vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate
if audio_length_in_s is None:
- audio_length_in_s = (self.unet.config.sample_size *
- self.vae_scale_factor *
- vocoder_upsample_factor)
+ audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor
height = int(audio_length_in_s / vocoder_upsample_factor)
- original_waveform_length = int(audio_length_in_s *
- self.vocoder.config.sampling_rate)
+ original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
if height % self.vae_scale_factor != 0:
- height = (int(np.ceil(height / self.vae_scale_factor)) *
- self.vae_scale_factor)
+ height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
logger.info(
f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} so that it can be handled by the model. It will be cut to {audio_length_in_s} after the denoising process."
)
@@ -430,7 +421,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -452,7 +444,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -466,21 +459,19 @@ def __call__(
height,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat(x=[latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
@@ -488,22 +479,19 @@ def __call__(
t,
encoder_hidden_states=None,
class_labels=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(
- chunks=2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -519,6 +507,6 @@ def __call__(
audio = audio.numpy()
if not return_dict:
- return (audio, )
+ return (audio,)
return AudioPipelineOutput(audios=audio)
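The audio-length bookkeeping reflowed above (vocoder_upsample_factor, height rounding) is easiest to follow with concrete numbers. The values below are assumptions chosen to resemble common AudioLDM checkpoints (HiFi-GAN upsample_rates with product 256 at 16 kHz, VAE scale factor 4), not values read from any config:

import numpy as np

upsample_rates = [4, 4, 4, 4]
sampling_rate = 16_000
vae_scale_factor = 4

# seconds of audio represented by one spectrogram frame
vocoder_upsample_factor = np.prod(upsample_rates) / sampling_rate  # 0.016
audio_length_in_s = 5.1
height = int(audio_length_in_s / vocoder_upsample_factor)          # 318 frames
if height % vae_scale_factor != 0:
    # round up so the latent height is an integer after VAE downsampling
    height = int(np.ceil(height / vae_scale_factor)) * vae_scale_factor
print(height, height // vae_scale_factor)                           # 320 -> latent height 80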
diff --git a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
index cc5f2a1b40f43..b4bc68019bf35 100644
--- a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -40,13 +40,13 @@ def __init__(self, unet, scheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- num_inference_steps: int=100,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- audio_length_in_s: Optional[float]=None,
- return_dict: bool=True, ) -> Union[AudioPipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 100,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ audio_length_in_s: Optional[float] = None,
+ return_dict: bool = True,
+ ) -> Union[AudioPipelineOutput, Tuple]:
"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -67,18 +67,18 @@ def __call__(
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
"""
if audio_length_in_s is None:
- audio_length_in_s = (self.unet.config.sample_size /
- self.unet.config.sample_rate)
+ audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate
sample_size = audio_length_in_s * self.unet.config.sample_rate
- down_scale_factor = 2**len(self.unet.up_blocks)
+ down_scale_factor = 2 ** len(self.unet.up_blocks)
if sample_size < 3 * down_scale_factor:
raise ValueError(
f"{audio_length_in_s} is too small. Make sure it's bigger or equal to {3 * down_scale_factor / self.unet.config.sample_rate}."
)
original_sample_size = int(sample_size)
if sample_size % down_scale_factor != 0:
- sample_size = (audio_length_in_s * self.unet.config.sample_rate //
- down_scale_factor + 1) * down_scale_factor
+ sample_size = (
+ audio_length_in_s * self.unet.config.sample_rate // down_scale_factor + 1
+ ) * down_scale_factor
logger.info(
f"{audio_length_in_s} is increased to {sample_size / self.unet.config.sample_rate} so that it can be handled by the model. It will be cut to {original_sample_size / self.unet.config.sample_rate} after the denoising process."
)
@@ -105,5 +105,5 @@ def __call__(
audio = audio.clip(min=-1, max=1).astype(dtype="float32").cpu().numpy()
audio = audio[:, :, :original_sample_size]
if not return_dict:
- return (audio, )
+ return (audio,)
return AudioPipelineOutput(audios=audio)
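The rounding in DanceDiffusionPipeline above plays the same role for raw waveforms: the requested length is padded up to a multiple of 2 ** len(up_blocks) so every UNet stage sees an integer-length signal, then trimmed back afterwards. A quick check with assumed numbers (a 16 kHz model with 4 up blocks; real checkpoints may differ):

sample_rate = 16_000
down_scale_factor = 2 ** 4                 # one factor of 2 per UNet up block
audio_length_in_s = 1.0005

sample_size = audio_length_in_s * sample_rate               # 16008.0 samples requested
original_sample_size = int(sample_size)
if sample_size % down_scale_factor != 0:
    # round up to the next multiple of down_scale_factor
    sample_size = (audio_length_in_s * sample_rate // down_scale_factor + 1) * down_scale_factor
print(int(sample_size), original_sample_size)                # 16016 generated, trimmed back to 16008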
diff --git a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py
index ee8dbc0143053..2ffd3401ceb13 100644
--- a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py
+++ b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py
@@ -42,15 +42,15 @@ def __init__(self, unet, scheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- eta: float=0.0,
- num_inference_steps: int=50,
- use_clipped_model_output: Optional[bool]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ eta: float = 0.0,
+ num_inference_steps: int = 50,
+ use_clipped_model_output: Optional[bool] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ) -> Union[ImagePipelineOutput, Tuple]:
"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -82,19 +82,20 @@ def __call__(
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size,
- self.unet.config.sample_size, )
+ self.unet.config.sample_size,
+ )
else:
image_shape = (
batch_size,
self.unet.config.in_channels,
- *self.unet.config.sample_size, )
+ *self.unet.config.sample_size,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
)
- image = randn_tensor(
- image_shape, generator=generator, dtype=self.unet.dtype)
+ image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype)
# set step values
self.scheduler.set_timesteps(num_inference_steps)
@@ -112,7 +113,8 @@ def __call__(
image,
eta=eta,
use_clipped_model_output=use_clipped_model_output,
- generator=generator, ).prev_sample
+ generator=generator,
+ ).prev_sample
image = (image / 2 + 0.5).clip(min=0, max=1)
image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy()
@@ -120,5 +122,5 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
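For completeness, the reflowed DDIMPipeline.__call__ is typically driven as in the hedged sketch below; the checkpoint id is a placeholder, and the top-level DDIMPipeline export is assumed to mirror the diffusers layout:

from ppdiffusers import DDIMPipeline  # assumed top-level export

pipe = DDIMPipeline.from_pretrained("your-org/your-ddim-checkpoint")  # placeholder id
result = pipe(batch_size=2, num_inference_steps=50, eta=0.0, output_type="pil")
for i, image in enumerate(result.images):
    image.save(f"ddim_sample_{i}.png")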
diff --git a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py
index cc73ea0e507a5..4ff2fe9a23bd9 100644
--- a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -38,13 +38,13 @@ def __init__(self, unet, scheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- num_inference_steps: int=1000,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ num_inference_steps: int = 1000,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ) -> Union[ImagePipelineOutput, Tuple]:
"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -70,12 +70,14 @@ def __call__(
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size,
- self.unet.config.sample_size, )
+ self.unet.config.sample_size,
+ )
else:
image_shape = (
batch_size,
self.unet.config.in_channels,
- *self.unet.config.sample_size, )
+ *self.unet.config.sample_size,
+ )
image = randn_tensor(image_shape, generator=generator)
# set step values
self.scheduler.set_timesteps(num_inference_steps)
@@ -84,12 +86,11 @@ def __call__(
model_output = self.unet(image, t).sample
# 2. compute previous image: x_t -> x_t-1
- image = self.scheduler.step(
- model_output, t, image, generator=generator).prev_sample
+ image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
image = (image / 2 + 0.5).clip(min=0, max=1)
image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy()
if output_type == "pil":
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py
index ca49b436b3f91..fccb87f08b7b7 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py
@@ -18,12 +18,22 @@
import numpy as np
import PIL
-from ...utils import (BaseOutput, OptionalDependencyNotAvailable,
- is_paddle_available, is_paddlenlp_available)
-from .timesteps import (fast27_timesteps, smart27_timesteps, smart50_timesteps,
- smart100_timesteps, smart185_timesteps,
- super27_timesteps, super40_timesteps,
- super100_timesteps)
+from ...utils import (
+ BaseOutput,
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
+from .timesteps import (
+ fast27_timesteps,
+ smart27_timesteps,
+ smart50_timesteps,
+ smart100_timesteps,
+ smart185_timesteps,
+ super27_timesteps,
+ super40_timesteps,
+ super100_timesteps,
+)
@dataclass
@@ -55,11 +65,11 @@ class IFPipelineOutput(BaseOutput):
else:
from .pipeline_if import IFPipeline
from .pipeline_if_img2img import IFImg2ImgPipeline
- from .pipeline_if_img2img_superresolution import \
- IFImg2ImgSuperResolutionPipeline
+ from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline
from .pipeline_if_inpainting import IFInpaintingPipeline
- from .pipeline_if_inpainting_superresolution import \
- IFInpaintingSuperResolutionPipeline
+ from .pipeline_if_inpainting_superresolution import (
+ IFInpaintingSuperResolutionPipeline,
+ )
from .pipeline_if_superresolution import IFSuperResolutionPipeline
from .safety_checker import IFSafetyChecker
from .watermark import IFWatermarker
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py
index 787a25590a6e1..2a7c3bddcaedd 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py
@@ -19,14 +19,19 @@
from typing import Any, Callable, Dict, List, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel,
- T5Tokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
from ...loaders import LoraLoaderMixin
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
-from ...utils import (BACKENDS_MAPPING, is_bs4_available, is_ftfy_available,
- logging, randn_tensor, replace_example_docstring)
+from ...utils import (
+ BACKENDS_MAPPING,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import IFPipelineOutput
from .safety_checker import IFSafetyChecker
@@ -101,8 +106,8 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
watermarker: Optional[IFWatermarker]
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" +
- "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa
+ r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ ) # noqa
_optional_components = [
"tokenizer",
@@ -113,15 +118,16 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
]
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- unet: UNet2DConditionModel,
- scheduler: DDPMScheduler,
- safety_checker: Optional[IFSafetyChecker],
- feature_extractor: Optional[CLIPImageProcessor],
- watermarker: Optional[IFWatermarker],
- requires_safety_checker: bool=True, ):
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ unet: UNet2DConditionModel,
+ scheduler: DDPMScheduler,
+ safety_checker: Optional[IFSafetyChecker],
+ feature_extractor: Optional[CLIPImageProcessor],
+ watermarker: Optional[IFWatermarker],
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -147,19 +153,21 @@ def __init__(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- watermarker=watermarker, )
+ watermarker=watermarker,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
@paddle.no_grad()
def encode_prompt(
- self,
- prompt,
- do_classifier_free_guidance=True,
- num_images_per_prompt=1,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- clean_caption: bool=False, ):
+ self,
+ prompt,
+ do_classifier_free_guidance=True,
+ num_images_per_prompt=1,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ clean_caption: bool = False,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -186,7 +194,8 @@ def encode_prompt(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -199,31 +208,31 @@ def encode_prompt(
max_length = 77
if prompt_embeds is None:
- prompt = self._text_preprocessing(
- prompt, clean_caption=clean_caption)
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=max_length,
truncation=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, max_length - 1:-1])
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {max_length} tokens: {removed_text}")
+ f" {max_length} tokens: {removed_text}"
+ )
attention_mask = text_inputs.attention_mask
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
if self.text_encoder is not None:
@@ -238,8 +247,7 @@ def encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -252,12 +260,12 @@ def encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
- uncond_tokens = self._text_preprocessing(
- uncond_tokens, clean_caption=clean_caption)
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
@@ -266,12 +274,14 @@ def encode_prompt(
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = uncond_input.attention_mask
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
@@ -281,10 +291,8 @@ def encode_prompt(
if dtype is not None:
negative_prompt_embeds = negative_prompt_embeds.cast(dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
@@ -296,11 +304,11 @@ def encode_prompt(
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, nsfw_detected, watermark_detected = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(dtype),
+ )
else:
nsfw_detected = None
watermark_detected = None
@@ -314,46 +322,44 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -366,10 +372,10 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
- def prepare_intermediate_images(self, batch_size, num_channels, height,
- width, dtype, generator):
+ def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator):
shape = (batch_size, num_channels, height, width)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
@@ -377,8 +383,7 @@ def prepare_intermediate_images(self, batch_size, num_channels, height,
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
- intermediate_images = randn_tensor(
- shape, generator=generator, dtype=dtype)
+ intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype)
# scale the initial noise by the standard deviation required by the scheduler
intermediate_images = intermediate_images * self.scheduler.init_noise_sigma
@@ -386,14 +391,12 @@ def prepare_intermediate_images(self, batch_size, num_channels, height,
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
@@ -419,11 +422,13 @@ def _clean_caption(self, caption):
caption = re.sub(
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
caption = re.sub(
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
# html:
caption = BeautifulSoup(caption, features="html.parser").text
@@ -450,7 +455,8 @@ def _clean_caption(self, caption):
caption = re.sub(
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
- caption, )
+ caption,
+ )
# normalize quotation marks to one standard
caption = re.sub(r"[`´«»“”¨]", '"', caption)
@@ -477,15 +483,13 @@ def _clean_caption(self, caption):
# "123456.."
caption = re.sub(r"\b\d{6,}\b", "", caption)
# filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)",
- "", caption)
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
#
caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
- caption = re.sub(self.bad_punct_regex, r" ",
- caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
# this-is-my-cute-cat / this_is_my_cute_cat
@@ -503,13 +507,10 @@ def _clean_caption(self, caption):
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(
- r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "",
- caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ",
- caption) # j2d1a2a...
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -529,26 +530,26 @@ def _clean_caption(self, caption):
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- num_inference_steps: int=100,
- timesteps: List[int]=None,
- guidance_scale: float=7.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- height: Optional[int]=None,
- width: Optional[int]=None,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- clean_caption: bool=True,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ num_inference_steps: int = 100,
+ timesteps: List[int] = None,
+ guidance_scale: float = 7.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -625,7 +626,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
height = height or self.unet.config.sample_size
@@ -651,11 +653,11 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clean_caption=clean_caption, )
+ clean_caption=clean_caption,
+ )
if do_classifier_free_guidance:
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
# 4. Prepare timesteps
if timesteps is not None:
@@ -673,19 +675,19 @@ def __call__(
height,
width,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
- model_input = (paddle.concat([intermediate_images] * 2)
- if do_classifier_free_guidance else
- intermediate_images)
+ model_input = (
+ paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
+ )
model_input = self.scheduler.scale_model_input(model_input, t)
# predict the noise residual
@@ -694,7 +696,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -704,27 +707,28 @@ def __call__(
model_input.shape[1],
noise_pred_uncond.shape[1] - model_input.shape[1],
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
model_input.shape[1],
noise_pred_text.shape[1] - model_input.shape[1],
],
- axis=1, )
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
if self.scheduler.config.variance_type not in [
- "learned",
- "learned_range",
+ "learned",
+ "learned_range",
]:
noise_pred, _ = noise_pred.split(
[
model_input.shape[1],
noise_pred_uncond.shape[1] - model_input.shape[1],
],
- axis=1, )
+ axis=1,
+ )
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
@@ -732,12 +736,11 @@ def __call__(
t,
intermediate_images,
**extra_step_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, intermediate_images)
@@ -750,16 +753,14 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 9. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
# 11. Apply watermark
if self.watermarker is not None:
- image = self.watermarker.apply_watermark(
- image, self.unet.config.sample_size)
+ image = self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pd":
nsfw_detected = None
watermark_detected = None
@@ -770,8 +771,7 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 9. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, nsfw_detected, watermark_detected)
@@ -779,4 +779,5 @@ def __call__(
return IFPipelineOutput(
images=image,
nsfw_detected=nsfw_detected,
- watermark_detected=watermark_detected, )
+ watermark_detected=watermark_detected,
+ )
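The guidance block reformatted above is specific to DeepFloyd IF's UNet, which predicts both the noise and a learned variance in a single tensor: classifier-free guidance is applied to the noise half only, and the text branch's variance is re-attached afterwards. A NumPy sketch with illustrative shapes (single sample, channel axis first):

import numpy as np

channels, guidance_scale = 3, 7.0
noise_pred_uncond, noise_pred_text = np.random.randn(2, 2 * channels, 64, 64)

eps_uncond, _ = np.split(noise_pred_uncond, [channels], axis=0)
eps_text, predicted_variance = np.split(noise_pred_text, [channels], axis=0)

# classifier-free guidance on the noise prediction only
eps = eps_uncond + guidance_scale * (eps_text - eps_uncond)
noise_pred = np.concatenate([eps, predicted_variance], axis=0)
print(noise_pred.shape)  # (6, 64, 64)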
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
index 7fa08748a3d86..30df336ebed8c 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
@@ -21,14 +21,19 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel,
- T5Tokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
-from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available,
- is_ftfy_available, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import (
+ BACKENDS_MAPPING,
+ PIL_INTERPOLATION,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import IFPipelineOutput
from .safety_checker import IFSafetyChecker
@@ -55,8 +60,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
else:
h = int(round(img_size / 8 / coef) * 8)
- images = images.resize(
- (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+ images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
return images
@@ -127,8 +131,8 @@ class IFImg2ImgPipeline(DiffusionPipeline):
watermarker: Optional[IFWatermarker]
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" +
- "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa
+ r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ ) # noqa
_optional_components = [
"tokenizer",
@@ -139,15 +143,16 @@ class IFImg2ImgPipeline(DiffusionPipeline):
]
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- unet: UNet2DConditionModel,
- scheduler: DDPMScheduler,
- safety_checker: Optional[IFSafetyChecker],
- feature_extractor: Optional[CLIPImageProcessor],
- watermarker: Optional[IFWatermarker],
- requires_safety_checker: bool=True, ):
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ unet: UNet2DConditionModel,
+ scheduler: DDPMScheduler,
+ safety_checker: Optional[IFSafetyChecker],
+ feature_extractor: Optional[CLIPImageProcessor],
+ watermarker: Optional[IFWatermarker],
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -173,20 +178,22 @@ def __init__(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- watermarker=watermarker, )
+ watermarker=watermarker,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
@paddle.no_grad()
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
- self,
- prompt,
- do_classifier_free_guidance=True,
- num_images_per_prompt=1,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- clean_caption: bool=False, ):
+ self,
+ prompt,
+ do_classifier_free_guidance=True,
+ num_images_per_prompt=1,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ clean_caption: bool = False,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -213,7 +220,8 @@ def encode_prompt(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -226,31 +234,31 @@ def encode_prompt(
max_length = 77
if prompt_embeds is None:
- prompt = self._text_preprocessing(
- prompt, clean_caption=clean_caption)
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=max_length,
truncation=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, max_length - 1:-1])
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {max_length} tokens: {removed_text}")
+ f" {max_length} tokens: {removed_text}"
+ )
attention_mask = text_inputs.attention_mask
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
if self.text_encoder is not None:
@@ -265,8 +273,7 @@ def encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -279,12 +286,12 @@ def encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
- uncond_tokens = self._text_preprocessing(
- uncond_tokens, clean_caption=clean_caption)
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
@@ -293,12 +300,14 @@ def encode_prompt(
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = uncond_input.attention_mask
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
@@ -308,10 +317,8 @@ def encode_prompt(
if dtype is not None:
negative_prompt_embeds = negative_prompt_embeds.cast(dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
@@ -324,11 +331,11 @@ def encode_prompt(
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, nsfw_detected, watermark_detected = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(dtype),
+ )
else:
nsfw_detected = None
watermark_detected = None
@@ -342,48 +349,46 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- batch_size,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ image,
+ batch_size,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -396,19 +401,23 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
if isinstance(image, list):
check_image_type = image[0]
else:
check_image_type = image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(image, list):
image_batch_size = len(image)
@@ -422,21 +431,17 @@ def check_inputs(
assert False
if batch_size != image_batch_size:
- raise ValueError(
- f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
- )
+ raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
@@ -463,11 +468,13 @@ def _clean_caption(self, caption):
caption = re.sub(
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
caption = re.sub(
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
# html:
caption = BeautifulSoup(caption, features="html.parser").text
@@ -494,7 +501,8 @@ def _clean_caption(self, caption):
caption = re.sub(
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
- caption, )
+ caption,
+ )
        # normalize quotation marks to one standard
caption = re.sub(r"[`´«»“”¨]", '"', caption)
@@ -521,15 +529,13 @@ def _clean_caption(self, caption):
# "123456.."
caption = re.sub(r"\b\d{6,}\b", "", caption)
# filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)",
- "", caption)
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
#
caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
- caption = re.sub(self.bad_punct_regex, r" ",
- caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
# this-is-my-cute-cat / this_is_my_cute_cat
@@ -547,13 +553,10 @@ def _clean_caption(self, caption):
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(
- r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "",
- caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ",
- caption) # j2d1a2a...
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -598,35 +601,24 @@ def numpy_to_pd(images):
image = numpy_to_pd(image) # to pd
elif isinstance(image[0], np.ndarray):
- image = (np.concatenate(
- image, axis=0) if image[0].ndim == 4 else np.stack(
- image, axis=0))
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
image = numpy_to_pd(image)
elif isinstance(image[0], paddle.Tensor):
- image = (paddle.concat(
- image, axis=0) if image[0].ndim == 4 else paddle.stack(
- image, axis=0))
+ image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0)
return image
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start:]
return timesteps, num_inference_steps - t_start
- def prepare_intermediate_images(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
_, channels, height, width = image.shape
batch_size = batch_size * num_images_per_prompt
@@ -649,27 +641,33 @@ def prepare_intermediate_images(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[
- PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None,
- strength: float=0.7,
- num_inference_steps: int=80,
- timesteps: List[int]=None,
- guidance_scale: float=10.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- clean_caption: bool=True,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[
+ PIL.Image.Image,
+ paddle.Tensor,
+ np.ndarray,
+ List[PIL.Image.Image],
+ List[paddle.Tensor],
+ List[np.ndarray],
+ ] = None,
+ strength: float = 0.7,
+ num_inference_steps: int = 80,
+ timesteps: List[int] = None,
+ guidance_scale: float = 10.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -760,7 +758,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -775,11 +774,11 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clean_caption=clean_caption, )
+ clean_caption=clean_caption,
+ )
if do_classifier_free_guidance:
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
dtype = prompt_embeds.dtype
@@ -792,32 +791,29 @@ def __call__(
self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 5. Prepare intermediate images
image = self.preprocess_image(image)
image = image.cast(dtype)
noise_timestep = timesteps[0:1]
- noise_timestep = noise_timestep.tile(
- (batch_size * num_images_per_prompt, ))
+ noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,))
intermediate_images = self.prepare_intermediate_images(
- image, noise_timestep, batch_size, num_images_per_prompt, dtype,
- generator)
+ image, noise_timestep, batch_size, num_images_per_prompt, dtype, generator
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
- model_input = (paddle.concat([intermediate_images] * 2)
- if do_classifier_free_guidance else
- intermediate_images)
+ model_input = (
+ paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
+ )
model_input = self.scheduler.scale_model_input(model_input, t)
# predict the noise residual
@@ -825,7 +821,8 @@ def __call__(
model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
@@ -835,27 +832,25 @@ def __call__(
model_input.shape[1],
noise_pred_uncond.shape[1] - model_input.shape[1],
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
model_input.shape[1],
noise_pred_text.shape[1] - model_input.shape[1],
],
- axis=1, )
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs
+ ).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, intermediate_images)
@@ -868,16 +863,14 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 9. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
# 11. Apply watermark
if self.watermarker is not None:
- self.watermarker.apply_watermark(image,
- self.unet.config.sample_size)
+ self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pd":
nsfw_detected = None
watermark_detected = None
@@ -888,8 +881,7 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 9. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, nsfw_detected, watermark_detected)
@@ -897,4 +889,5 @@ def __call__(
return IFPipelineOutput(
images=image,
nsfw_detected=nsfw_detected,
- watermark_detected=watermark_detected, )
+ watermark_detected=watermark_detected,
+ )
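
The hunks above only reformat `IFImg2ImgPipeline` (line wrapping, trailing commas, consolidated signatures); behaviour is unchanged. As a quick sanity check, here is a minimal usage sketch — the checkpoint id and file names are illustrative assumptions, while the keyword arguments and their defaults follow the `__call__` signature shown in this diff.

```python
# Minimal sketch (not part of the patch). Checkpoint id and file names are assumptions;
# the keyword arguments mirror IFImg2ImgPipeline.__call__ as reformatted above.
from PIL import Image

from ppdiffusers import IFImg2ImgPipeline

pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")  # assumed checkpoint id

original_image = Image.open("input.png").convert("RGB")  # assumed input file

result = pipe(
    prompt="a photo of a fantasy landscape",
    image=original_image,
    strength=0.7,            # default in this pipeline
    num_inference_steps=80,  # default in this pipeline
    guidance_scale=10.0,     # default in this pipeline
)
result.images[0].save("if_img2img_stage1.png")
```
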
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
index 42dd7fa35fa27..63e586bf00e34 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
@@ -22,14 +22,19 @@
import paddle
import paddle.nn.functional as F
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel,
- T5Tokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
-from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available,
- is_ftfy_available, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import (
+ BACKENDS_MAPPING,
+ PIL_INTERPOLATION,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import IFPipelineOutput
from .safety_checker import IFSafetyChecker
@@ -57,8 +62,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
else:
h = int(round(img_size / 8 / coef) * 8)
- images = images.resize(
- (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+ images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
return images
@@ -130,8 +134,8 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline):
watermarker: Optional[IFWatermarker]
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" +
- "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa
+ r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ ) # noqa
_optional_components = [
"tokenizer",
@@ -141,16 +145,17 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline):
]
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- unet: UNet2DConditionModel,
- scheduler: DDPMScheduler,
- image_noising_scheduler: DDPMScheduler,
- safety_checker: Optional[IFSafetyChecker],
- feature_extractor: Optional[CLIPImageProcessor],
- watermarker: Optional[IFWatermarker],
- requires_safety_checker: bool=True, ):
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ unet: UNet2DConditionModel,
+ scheduler: DDPMScheduler,
+ image_noising_scheduler: DDPMScheduler,
+ safety_checker: Optional[IFSafetyChecker],
+ feature_extractor: Optional[CLIPImageProcessor],
+ watermarker: Optional[IFWatermarker],
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -182,20 +187,19 @@ def __init__(
image_noising_scheduler=image_noising_scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- watermarker=watermarker, )
+ watermarker=watermarker,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
@@ -222,11 +226,13 @@ def _clean_caption(self, caption):
caption = re.sub(
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
caption = re.sub(
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
# html:
caption = BeautifulSoup(caption, features="html.parser").text
@@ -253,7 +259,8 @@ def _clean_caption(self, caption):
caption = re.sub(
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
- caption, )
+ caption,
+ )
        # normalize quotation marks to one standard
caption = re.sub(r"[`´«»“”¨]", '"', caption)
@@ -280,15 +287,13 @@ def _clean_caption(self, caption):
# "123456.."
caption = re.sub(r"\b\d{6,}\b", "", caption)
# filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)",
- "", caption)
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
#
caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
- caption = re.sub(self.bad_punct_regex, r" ",
- caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
# this-is-my-cute-cat / this_is_my_cute_cat
@@ -306,13 +311,10 @@ def _clean_caption(self, caption):
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(
- r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "",
- caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ",
- caption) # j2d1a2a...
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -332,14 +334,15 @@ def _clean_caption(self, caption):
@paddle.no_grad()
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
- self,
- prompt,
- do_classifier_free_guidance=True,
- num_images_per_prompt=1,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- clean_caption: bool=False, ):
+ self,
+ prompt,
+ do_classifier_free_guidance=True,
+ num_images_per_prompt=1,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ clean_caption: bool = False,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -366,7 +369,8 @@ def encode_prompt(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -379,31 +383,31 @@ def encode_prompt(
max_length = 77
if prompt_embeds is None:
- prompt = self._text_preprocessing(
- prompt, clean_caption=clean_caption)
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=max_length,
truncation=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, max_length - 1:-1])
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {max_length} tokens: {removed_text}")
+ f" {max_length} tokens: {removed_text}"
+ )
attention_mask = text_inputs.attention_mask
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
if self.text_encoder is not None:
@@ -419,8 +423,7 @@ def encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -433,12 +436,12 @@ def encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
- uncond_tokens = self._text_preprocessing(
- uncond_tokens, clean_caption=clean_caption)
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
@@ -447,12 +450,14 @@ def encode_prompt(
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = uncond_input.attention_mask
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
@@ -461,10 +466,8 @@ def encode_prompt(
if dtype is not None:
negative_prompt_embeds = negative_prompt_embeds.cast(dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
@@ -477,11 +480,11 @@ def encode_prompt(
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, nsfw_detected, watermark_detected = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(dtype),
+ )
else:
nsfw_detected = None
watermark_detected = None
@@ -495,49 +498,47 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- original_image,
- batch_size,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ image,
+ original_image,
+ batch_size,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -550,7 +551,8 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# image
@@ -559,12 +561,15 @@ def check_inputs(
else:
check_image_type = image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(image, list):
image_batch_size = len(image)
@@ -578,9 +583,7 @@ def check_inputs(
assert False
if batch_size != image_batch_size:
- raise ValueError(
- f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
- )
+ raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
# original_image
@@ -589,12 +592,15 @@ def check_inputs(
else:
check_image_type = original_image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(original_image, list):
image_batch_size = len(original_image)
@@ -613,8 +619,7 @@ def check_inputs(
)
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image
- def preprocess_original_image(self,
- image: PIL.Image.Image) -> paddle.Tensor:
+ def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor:
if not isinstance(image, list):
image = [image]
@@ -642,21 +647,16 @@ def numpy_to_pd(images):
image = numpy_to_pd(image) # to pd
elif isinstance(image[0], np.ndarray):
- image = (np.concatenate(
- image, axis=0) if image[0].ndim == 4 else np.stack(
- image, axis=0))
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
image = numpy_to_pd(image)
elif isinstance(image[0], paddle.Tensor):
- image = (paddle.concat(
- image, axis=0) if image[0].ndim == 4 else paddle.stack(
- image, axis=0))
+ image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0)
return image
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image
- def preprocess_image(self, image: PIL.Image.Image,
- num_images_per_prompt) -> paddle.Tensor:
+ def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor:
if not isinstance(image, paddle.Tensor) and not isinstance(image, list):
image = [image]
@@ -679,8 +679,7 @@ def preprocess_image(self, image: PIL.Image.Image,
elif dims == 4:
image = paddle.concat(image, axis=0)
else:
- raise ValueError(
- f"Image must have 3 or 4 dimensions, instead got {dims}")
+ raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}")
image = image.cast(self.unet.dtype)
@@ -691,8 +690,7 @@ def preprocess_image(self, image: PIL.Image.Image,
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start:]
@@ -700,13 +698,7 @@ def get_timesteps(self, num_inference_steps, strength):
return timesteps, num_inference_steps - t_start
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images
- def prepare_intermediate_images(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
_, channels, height, width = image.shape
batch_size = batch_size * num_images_per_prompt
@@ -729,30 +721,35 @@ def prepare_intermediate_images(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor],
- original_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray,
- List[PIL.Image.Image], List[
- paddle.Tensor], List[np.ndarray], ]=None,
- strength: float=0.8,
- prompt: Union[str, List[str]]=None,
- num_inference_steps: int=50,
- timesteps: List[int]=None,
- guidance_scale: float=4.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- noise_level: int=250,
- clean_caption: bool=True, ):
+ self,
+ image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor],
+ original_image: Union[
+ PIL.Image.Image,
+ paddle.Tensor,
+ np.ndarray,
+ List[PIL.Image.Image],
+ List[paddle.Tensor],
+ List[np.ndarray],
+ ] = None,
+ strength: float = 0.8,
+ prompt: Union[str, List[str]] = None,
+ num_inference_steps: int = 50,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ noise_level: int = 250,
+ clean_caption: bool = True,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -848,7 +845,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
@@ -865,11 +863,11 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clean_caption=clean_caption, )
+ clean_caption=clean_caption,
+ )
if do_classifier_free_guidance:
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
dtype = prompt_embeds.dtype
@@ -882,8 +880,7 @@ def __call__(
self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 5. prepare original image
original_image = self.preprocess_original_image(original_image)
@@ -891,8 +888,7 @@ def __call__(
# 6. Prepare intermediate images
noise_timestep = timesteps[0:1]
- noise_timestep = noise_timestep.tile(
- (batch_size * num_images_per_prompt, ))
+ noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,))
intermediate_images = self.prepare_intermediate_images(
original_image,
@@ -900,21 +896,19 @@ def __call__(
batch_size,
num_images_per_prompt,
dtype,
- generator, )
+ generator,
+ )
# 7. Prepare upscaled image and noise level
_, _, height, width = original_image.shape
image = self.preprocess_image(image, num_images_per_prompt)
- upscaled = F.interpolate(
- image, (height, width), mode="bilinear", align_corners=True)
+ upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0])
- noise = randn_tensor(
- upscaled.shape, generator=generator, dtype=upscaled.dtype)
- upscaled = self.image_noising_scheduler.add_noise(
- upscaled, noise, timesteps=noise_level)
+ noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype)
+ upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
if do_classifier_free_guidance:
noise_level = paddle.concat([noise_level] * 2)
@@ -923,19 +917,15 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
model_input = paddle.concat(
- [
- intermediate_images,
- upscaled.cast(intermediate_images.dtype)
- ],
- axis=1, )
-
- model_input = (paddle.concat([model_input] * 2)
- if do_classifier_free_guidance else model_input)
+ [intermediate_images, upscaled.cast(intermediate_images.dtype)],
+ axis=1,
+ )
+
+ model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input
model_input = self.scheduler.scale_model_input(model_input, t)
# predict the noise residual
@@ -944,7 +934,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
class_labels=noise_level,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
@@ -952,31 +943,27 @@ def __call__(
noise_pred_uncond, _ = noise_pred_uncond.split(
[
model_input.shape[1] // 2,
- noise_pred_uncond.shape[1] - model_input.shape[1] //
- 2,
+ noise_pred_uncond.shape[1] - model_input.shape[1] // 2,
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
model_input.shape[1] // 2,
- noise_pred_text.shape[1] - model_input.shape[1] //
- 2,
+ noise_pred_text.shape[1] - model_input.shape[1] // 2,
],
- axis=1, )
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs
+ ).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, intermediate_images)
@@ -989,16 +976,14 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 11. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
# 12. Convert to PIL
image = self.numpy_to_pil(image)
# 13. Apply watermark
if self.watermarker is not None:
- self.watermarker.apply_watermark(image,
- self.unet.config.sample_size)
+ self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pd":
nsfw_detected = None
watermark_detected = None
@@ -1008,8 +993,7 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 11. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, nsfw_detected, watermark_detected)
@@ -1017,4 +1001,5 @@ def __call__(
return IFPipelineOutput(
images=image,
nsfw_detected=nsfw_detected,
- watermark_detected=watermark_detected, )
+ watermark_detected=watermark_detected,
+ )
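
`IFImg2ImgSuperResolutionPipeline` receives the same formatting-only treatment. A hedged sketch of chaining the stage I output into this stage II pipeline follows; the checkpoint ids are assumptions, and the keyword names (`image`, `original_image`, `noise_level`, `strength`) and defaults come from the `__call__` signature in this file's diff.

```python
# Minimal sketch (not part of the patch): stage I -> stage II chaining.
# Checkpoint ids are assumptions; keyword names follow the reformatted __call__ above.
from PIL import Image

from ppdiffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline

prompt = "a photo of a fantasy landscape"
original_image = Image.open("input.png").convert("RGB")  # assumed input file

stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")  # assumed checkpoint id
stage_1_images = stage_1(
    prompt=prompt,
    image=original_image,
    output_type="pd",  # keep paddle tensors so stage II can consume them directly
).images

stage_2 = IFImg2ImgSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0")  # assumed checkpoint id
result = stage_2(
    image=stage_1_images,           # stage I output to upsample
    original_image=original_image,  # untouched input used for conditioning
    prompt=prompt,
    strength=0.8,     # default in this pipeline
    noise_level=250,  # default in this pipeline
    num_inference_steps=50,
)
result.images[0].save("if_img2img_stage2.png")
```
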
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
index 72fd143c156c2..5ff5992901c78 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
@@ -21,14 +21,19 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel,
- T5Tokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
-from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available,
- is_ftfy_available, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import (
+ BACKENDS_MAPPING,
+ PIL_INTERPOLATION,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import IFPipelineOutput
from .safety_checker import IFSafetyChecker
@@ -56,8 +61,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
else:
h = int(round(img_size / 8 / coef) * 8)
- images = images.resize(
- (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+ images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
return images
@@ -130,8 +134,8 @@ class IFInpaintingPipeline(DiffusionPipeline):
watermarker: Optional[IFWatermarker]
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" +
- "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa
+ r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ ) # noqa
_optional_components = [
"tokenizer",
@@ -142,15 +146,16 @@ class IFInpaintingPipeline(DiffusionPipeline):
]
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- unet: UNet2DConditionModel,
- scheduler: DDPMScheduler,
- safety_checker: Optional[IFSafetyChecker],
- feature_extractor: Optional[CLIPImageProcessor],
- watermarker: Optional[IFWatermarker],
- requires_safety_checker: bool=True, ):
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ unet: UNet2DConditionModel,
+ scheduler: DDPMScheduler,
+ safety_checker: Optional[IFSafetyChecker],
+ feature_extractor: Optional[CLIPImageProcessor],
+ watermarker: Optional[IFWatermarker],
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -176,20 +181,22 @@ def __init__(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- watermarker=watermarker, )
+ watermarker=watermarker,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
@paddle.no_grad()
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
- self,
- prompt,
- do_classifier_free_guidance=True,
- num_images_per_prompt=1,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- clean_caption: bool=False, ):
+ self,
+ prompt,
+ do_classifier_free_guidance=True,
+ num_images_per_prompt=1,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ clean_caption: bool = False,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -216,7 +223,8 @@ def encode_prompt(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -229,32 +237,32 @@ def encode_prompt(
max_length = 77
if prompt_embeds is None:
- prompt = self._text_preprocessing(
- prompt, clean_caption=clean_caption)
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=max_length,
truncation=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, max_length - 1:-1])
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {max_length} tokens: {removed_text}")
+ f" {max_length} tokens: {removed_text}"
+ )
attention_mask = text_inputs.attention_mask
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
if self.text_encoder is not None:
@@ -269,8 +277,7 @@ def encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -283,12 +290,12 @@ def encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
- uncond_tokens = self._text_preprocessing(
- uncond_tokens, clean_caption=clean_caption)
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
@@ -297,12 +304,14 @@ def encode_prompt(
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = uncond_input.attention_mask
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
@@ -312,10 +321,8 @@ def encode_prompt(
if dtype is not None:
negative_prompt_embeds = negative_prompt_embeds.cast(dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
@@ -328,11 +335,11 @@ def encode_prompt(
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, nsfw_detected, watermark_detected = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(dtype),
+ )
else:
nsfw_detected = None
watermark_detected = None
@@ -346,49 +353,47 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- mask_image,
- batch_size,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ image,
+ mask_image,
+ batch_size,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -401,7 +406,8 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# image
@@ -410,12 +416,15 @@ def check_inputs(
else:
check_image_type = image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(image, list):
image_batch_size = len(image)
@@ -429,9 +438,7 @@ def check_inputs(
assert False
if batch_size != image_batch_size:
- raise ValueError(
- f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
- )
+ raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
# mask_image
@@ -440,12 +447,15 @@ def check_inputs(
else:
check_image_type = mask_image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(mask_image, list):
image_batch_size = len(mask_image)
@@ -466,14 +476,12 @@ def check_inputs(
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
@@ -500,11 +508,13 @@ def _clean_caption(self, caption):
caption = re.sub(
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
caption = re.sub(
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
# html:
caption = BeautifulSoup(caption, features="html.parser").text
@@ -531,7 +541,8 @@ def _clean_caption(self, caption):
caption = re.sub(
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
- caption, )
+ caption,
+ )
# normalize quotation marks to a single standard
caption = re.sub(r"[`´«»“”¨]", '"', caption)
@@ -558,15 +569,13 @@ def _clean_caption(self, caption):
# "123456.."
caption = re.sub(r"\b\d{6,}\b", "", caption)
# filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)",
- "", caption)
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
#
caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
- caption = re.sub(self.bad_punct_regex, r" ",
- caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
# this-is-my-cute-cat / this_is_my_cute_cat
@@ -584,13 +593,10 @@ def _clean_caption(self, caption):
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(
- r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "",
- caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ",
- caption) # j2d1a2a...
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -636,15 +642,11 @@ def numpy_to_pd(images):
image = numpy_to_pd(image) # to pd
elif isinstance(image[0], np.ndarray):
- image = (np.concatenate(
- image, axis=0) if image[0].ndim == 4 else np.stack(
- image, axis=0))
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
image = numpy_to_pd(image)
elif isinstance(image[0], paddle.Tensor):
- image = (paddle.concat(
- image, axis=0) if image[0].ndim == 4 else paddle.stack(
- image, axis=0))
+ image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0)
return image
@@ -653,10 +655,9 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor:
mask_image = [mask_image]
if isinstance(mask_image[0], paddle.Tensor):
- mask_image = (paddle.concat(
- mask_image, axis=0)
- if mask_image[0].ndim == 4 else paddle.stack(
- mask_image, axis=0))
+ mask_image = (
+ paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0)
+ )
if mask_image.ndim == 2:
# Batch and add channel dim for single mask
@@ -692,8 +693,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor:
mask_image = paddle.to_tensor(mask_image)
elif isinstance(mask_image[0], np.ndarray):
- mask_image = np.concatenate(
- [m[None, None, :] for m in mask_image], axis=0)
+ mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0)
mask_image[mask_image < 0.5] = 0
mask_image[mask_image >= 0.5] = 1
@@ -704,8 +704,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor:
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start:]
@@ -713,14 +712,15 @@ def get_timesteps(self, num_inference_steps, strength):
return timesteps, num_inference_steps - t_start
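
`get_timesteps` (reflowed above) converts the img2img/inpainting `strength` into a suffix of the timestep schedule: `strength=1.0` keeps every denoising step, smaller values keep only the tail. A toy illustration with a fake, linearly spaced schedule (the real values come from `DDPMScheduler.timesteps`):

num_inference_steps = 10
all_timesteps = list(range(1000, 0, -100))  # fake schedule: [1000, 900, ..., 100]

def get_timesteps(num_inference_steps, strength):
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return all_timesteps[t_start:], num_inference_steps - t_start

print(get_timesteps(10, 1.0))  # all 10 timesteps are denoised
print(get_timesteps(10, 0.3))  # ([300, 200, 100], 3): only the last 3 steps run
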
def prepare_intermediate_images(
- self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- mask_image,
- generator=None, ):
+ self,
+ image,
+ timestep,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ mask_image,
+ generator=None,
+ ):
image_batch_size, channels, height, width = image.shape
batch_size = batch_size * num_images_per_prompt
@@ -745,29 +745,41 @@ def prepare_intermediate_images(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[
- PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None,
- mask_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[
- PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None,
- strength: float=1.0,
- num_inference_steps: int=50,
- timesteps: List[int]=None,
- guidance_scale: float=7.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- clean_caption: bool=True,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[
+ PIL.Image.Image,
+ paddle.Tensor,
+ np.ndarray,
+ List[PIL.Image.Image],
+ List[paddle.Tensor],
+ List[np.ndarray],
+ ] = None,
+ mask_image: Union[
+ PIL.Image.Image,
+ paddle.Tensor,
+ np.ndarray,
+ List[PIL.Image.Image],
+ List[paddle.Tensor],
+ List[np.ndarray],
+ ] = None,
+ strength: float = 1.0,
+ num_inference_steps: int = 50,
+ timesteps: List[int] = None,
+ guidance_scale: float = 7.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -864,7 +876,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -879,11 +892,11 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clean_caption=clean_caption, )
+ clean_caption=clean_caption,
+ )
if do_classifier_free_guidance:
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
dtype = prompt_embeds.dtype
@@ -896,8 +909,7 @@ def __call__(
self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 5. Prepare intermediate images
image = self.preprocess_image(image)
@@ -907,15 +919,12 @@ def __call__(
mask_image = mask_image.cast(dtype)
if mask_image.shape[0] == 1:
- mask_image = mask_image.repeat_interleave(
- batch_size * num_images_per_prompt, axis=0)
+ mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0)
else:
- mask_image = mask_image.repeat_interleave(
- num_images_per_prompt, axis=0)
+ mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0)
noise_timestep = timesteps[0:1]
- noise_timestep = noise_timestep.tile(
- (batch_size * num_images_per_prompt, ))
+ noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,))
intermediate_images = self.prepare_intermediate_images(
image,
@@ -924,19 +933,19 @@ def __call__(
num_images_per_prompt,
dtype,
mask_image,
- generator, )
+ generator,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
- model_input = (paddle.concat([intermediate_images] * 2)
- if do_classifier_free_guidance else
- intermediate_images)
+ model_input = (
+ paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
+ )
model_input = self.scheduler.scale_model_input(model_input, t)
# predict the noise residual
@@ -944,7 +953,8 @@ def __call__(
model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
@@ -954,33 +964,29 @@ def __call__(
model_input.shape[1],
noise_pred_uncond.shape[1] - model_input.shape[1],
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
model_input.shape[1],
noise_pred_text.shape[1] - model_input.shape[1],
],
- axis=1, )
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
# compute the previous noisy sample x_t -> x_t-1
prev_intermediate_images = intermediate_images
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs
+ ).prev_sample
- intermediate_images = (
- 1 - mask_image
- ) * prev_intermediate_images + mask_image * intermediate_images
+ intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, intermediate_images)
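
The two statements joined onto single lines above are the heart of the loop: classifier-free guidance combines the unconditional and text-conditioned noise predictions, and the inpainting blend keeps unmasked pixels from the previous sample. A NumPy sketch with toy tensors standing in for the paddle ones (the predicted-variance channel handled by the split above is omitted here):

import numpy as np

guidance_scale = 7.0
noise_pred_uncond = np.zeros((1, 3, 4, 4))
noise_pred_text = np.ones((1, 3, 4, 4))
prev_images = np.full((1, 3, 4, 4), 0.5)   # previous (known) latents
new_images = np.full((1, 3, 4, 4), -0.5)   # freshly denoised latents
mask = np.zeros((1, 1, 4, 4))
mask[..., :2] = 1.0                        # 1 = repaint, 0 = keep the original content

# classifier-free guidance: push the prediction away from the unconditional branch
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# inpainting blend: keep previous pixels wherever mask == 0
images = (1 - mask) * prev_images + mask * new_images

print(noise_pred.mean())      # 7.0
print(images[0, 0, 0])        # [-0.5 -0.5  0.5  0.5] -- only masked columns were repainted
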
@@ -993,16 +999,14 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 9. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
# 11. Apply watermark
if self.watermarker is not None:
- self.watermarker.apply_watermark(image,
- self.unet.config.sample_size)
+ self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pd":
nsfw_detected = None
watermark_detected = None
@@ -1013,8 +1017,7 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 9. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, nsfw_detected, watermark_detected)
@@ -1022,4 +1025,5 @@ def __call__(
return IFPipelineOutput(
images=image,
nsfw_detected=nsfw_detected,
- watermark_detected=watermark_detected, )
+ watermark_detected=watermark_detected,
+ )
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
index a9d271872306a..7b1c73e660a40 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
@@ -22,14 +22,19 @@
import paddle
import paddle.nn.functional as F
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel,
- T5Tokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
-from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available,
- is_ftfy_available, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import (
+ BACKENDS_MAPPING,
+ PIL_INTERPOLATION,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import IFPipelineOutput
from .safety_checker import IFSafetyChecker
@@ -57,8 +62,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
else:
h = int(round(img_size / 8 / coef) * 8)
- images = images.resize(
- (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+ images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
return images
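
Only part of `resize` is visible in this hunk; the visible branch snaps the resized edge to a multiple of 8, which the IF UNet expects. A quick check of that rounding, with `coef` assumed to be the image's aspect ratio (the rest of the function is not shown here):

img_size, coef = 256, 0.6  # assumed target size and aspect ratio
h = int(round(img_size / 8 / coef) * 8)
print(h, h % 8)  # 424 0 -- the result is always divisible by 8
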
@@ -132,8 +136,8 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline):
watermarker: Optional[IFWatermarker]
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" +
- "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa
+ r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ ) # noqa
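
`bad_punct_regex` above is assembled from concatenated string pieces; written out as one raw string it is roughly the pattern below, which `_clean_caption` later uses to blank out runs of decorative punctuation:

import re

# rough single-string equivalent of the concatenated pattern above
bad_punct = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\\/\*]{1,}")
print(bad_punct.sub(" ", "***AUSVERKAUFT*** #SALE (50% off)"))
# -> ' AUSVERKAUFT   SALE  50% off '
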
_optional_components = [
"tokenizer",
@@ -144,16 +148,17 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline):
]
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- unet: UNet2DConditionModel,
- scheduler: DDPMScheduler,
- image_noising_scheduler: DDPMScheduler,
- safety_checker: Optional[IFSafetyChecker],
- feature_extractor: Optional[CLIPImageProcessor],
- watermarker: Optional[IFWatermarker],
- requires_safety_checker: bool=True, ):
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ unet: UNet2DConditionModel,
+ scheduler: DDPMScheduler,
+ image_noising_scheduler: DDPMScheduler,
+ safety_checker: Optional[IFSafetyChecker],
+ feature_extractor: Optional[CLIPImageProcessor],
+ watermarker: Optional[IFWatermarker],
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -185,20 +190,19 @@ def __init__(
image_noising_scheduler=image_noising_scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- watermarker=watermarker, )
+ watermarker=watermarker,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
@@ -225,11 +229,13 @@ def _clean_caption(self, caption):
caption = re.sub(
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
caption = re.sub(
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
# html:
caption = BeautifulSoup(caption, features="html.parser").text
@@ -256,7 +262,8 @@ def _clean_caption(self, caption):
caption = re.sub(
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
- caption, )
+ caption,
+ )
# normalize quotation marks to a single standard
caption = re.sub(r"[`´«»“”¨]", '"', caption)
@@ -283,15 +290,13 @@ def _clean_caption(self, caption):
# "123456.."
caption = re.sub(r"\b\d{6,}\b", "", caption)
# filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)",
- "", caption)
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
#
caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
- caption = re.sub(self.bad_punct_regex, r" ",
- caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
# this-is-my-cute-cat / this_is_my_cute_cat
@@ -309,13 +314,10 @@ def _clean_caption(self, caption):
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(
- r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "",
- caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ",
- caption) # j2d1a2a...
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -335,14 +337,15 @@ def _clean_caption(self, caption):
@paddle.no_grad()
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
- self,
- prompt,
- do_classifier_free_guidance=True,
- num_images_per_prompt=1,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- clean_caption: bool=False, ):
+ self,
+ prompt,
+ do_classifier_free_guidance=True,
+ num_images_per_prompt=1,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ clean_caption: bool = False,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -369,7 +372,8 @@ def encode_prompt(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -382,31 +386,31 @@ def encode_prompt(
max_length = 77
if prompt_embeds is None:
- prompt = self._text_preprocessing(
- prompt, clean_caption=clean_caption)
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=max_length,
truncation=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, max_length - 1:-1])
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {max_length} tokens: {removed_text}")
+ f" {max_length} tokens: {removed_text}"
+ )
attention_mask = text_inputs.attention_mask
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
if self.text_encoder is not None:
@@ -421,8 +425,7 @@ def encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
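
The tile-and-reshape pair above duplicates each prompt embedding `num_images_per_prompt` times while keeping prompts grouped in order. The same bookkeeping in NumPy, with made-up toy shapes (77 tokens, 4096-dim hidden states):

import numpy as np

num_images_per_prompt = 3
prompt_embeds = np.random.rand(2, 77, 4096)  # (batch, seq_len, hidden) -- toy sizes

bs_embed, seq_len, _ = prompt_embeds.shape
dup = np.tile(prompt_embeds, (1, num_images_per_prompt, 1))
dup = dup.reshape(bs_embed * num_images_per_prompt, seq_len, -1)

print(dup.shape)  # (6, 77, 4096)
# each prompt's embedding is repeated num_images_per_prompt times, in prompt order
assert np.allclose(dup[0], dup[1]) and np.allclose(dup[3], prompt_embeds[1])
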
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -435,12 +438,12 @@ def encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
- uncond_tokens = self._text_preprocessing(
- uncond_tokens, clean_caption=clean_caption)
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
@@ -449,12 +452,14 @@ def encode_prompt(
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = uncond_input.attention_mask
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
@@ -464,10 +469,8 @@ def encode_prompt(
if dtype is not None:
negative_prompt_embeds = negative_prompt_embeds.cast(dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
@@ -480,11 +483,11 @@ def encode_prompt(
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, nsfw_detected, watermark_detected = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(dtype),
+ )
else:
nsfw_detected = None
watermark_detected = None
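
`run_safety_checker` is optional by design: when no checker or feature extractor is loaded, the flags are simply returned as `None`. A minimal sketch of that contract (the callables here are placeholders, not the real CLIP-based checker):

def run_safety_checker(image, safety_checker=None, feature_extractor=None):
    if safety_checker is None:
        # no checker configured: pass the images through and report no detections
        return image, None, None
    clip_input = feature_extractor(image)  # placeholder for the CLIP image processor call
    return safety_checker(images=image, clip_input=clip_input)

image, nsfw_detected, watermark_detected = run_safety_checker(image=["<pil image>"])
print(nsfw_detected, watermark_detected)  # None None
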
@@ -498,50 +501,48 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- original_image,
- mask_image,
- batch_size,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ image,
+ original_image,
+ mask_image,
+ batch_size,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -554,7 +555,8 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# image
@@ -563,12 +565,15 @@ def check_inputs(
else:
check_image_type = image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(image, list):
image_batch_size = len(image)
@@ -582,9 +587,7 @@ def check_inputs(
assert False
if batch_size != image_batch_size:
- raise ValueError(
- f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
- )
+ raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
# original_image
@@ -593,12 +596,15 @@ def check_inputs(
else:
check_image_type = original_image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(original_image, list):
image_batch_size = len(original_image)
@@ -623,12 +629,15 @@ def check_inputs(
else:
check_image_type = mask_image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(mask_image, list):
image_batch_size = len(mask_image)
@@ -647,8 +656,7 @@ def check_inputs(
)
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image
- def preprocess_original_image(self,
- image: PIL.Image.Image) -> paddle.Tensor:
+ def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor:
if not isinstance(image, list):
image = [image]
@@ -676,21 +684,16 @@ def numpy_to_pd(images):
image = numpy_to_pd(image) # to pd
elif isinstance(image[0], np.ndarray):
- image = (np.concatenate(
- image, axis=0) if image[0].ndim == 4 else np.stack(
- image, axis=0))
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
image = numpy_to_pd(image)
elif isinstance(image[0], paddle.Tensor):
- image = (paddle.concat(
- image, axis=0) if image[0].ndim == 4 else paddle.stack(
- image, axis=0))
+ image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0)
return image
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image
- def preprocess_image(self, image: PIL.Image.Image,
- num_images_per_prompt) -> paddle.Tensor:
+ def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor:
if not isinstance(image, paddle.Tensor) and not isinstance(image, list):
image = [image]
@@ -713,8 +716,7 @@ def preprocess_image(self, image: PIL.Image.Image,
elif dims == 4:
image = paddle.concat(image, axis=0)
else:
- raise ValueError(
- f"Image must have 3 or 4 dimensions, instead got {dims}")
+ raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}")
image = image.cast(self.unet.dtype)
@@ -728,10 +730,9 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor:
mask_image = [mask_image]
if isinstance(mask_image[0], paddle.Tensor):
- mask_image = (paddle.concat(
- mask_image, axis=0)
- if mask_image[0].ndim == 4 else paddle.stack(
- mask_image, axis=0))
+ mask_image = (
+ paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0)
+ )
if mask_image.ndim == 2:
# Batch and add channel dim for single mask
@@ -767,8 +768,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor:
mask_image = paddle.to_tensor(mask_image)
elif isinstance(mask_image[0], np.ndarray):
- mask_image = np.concatenate(
- [m[None, None, :] for m in mask_image], axis=0)
+ mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0)
mask_image[mask_image < 0.5] = 0
mask_image[mask_image >= 0.5] = 1
@@ -779,8 +779,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor:
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start:]
@@ -789,14 +788,15 @@ def get_timesteps(self, num_inference_steps, strength):
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images
def prepare_intermediate_images(
- self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- mask_image,
- generator=None, ):
+ self,
+ image,
+ timestep,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ mask_image,
+ generator=None,
+ ):
image_batch_size, channels, height, width = image.shape
batch_size = batch_size * num_images_per_prompt
@@ -821,32 +821,43 @@ def prepare_intermediate_images(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor],
- original_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray,
- List[PIL.Image.Image], List[
- paddle.Tensor], List[np.ndarray], ]=None,
- mask_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[
- PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None,
- strength: float=0.8,
- prompt: Union[str, List[str]]=None,
- num_inference_steps: int=100,
- timesteps: List[int]=None,
- guidance_scale: float=4.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- noise_level: int=0,
- clean_caption: bool=True, ):
+ self,
+ image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor],
+ original_image: Union[
+ PIL.Image.Image,
+ paddle.Tensor,
+ np.ndarray,
+ List[PIL.Image.Image],
+ List[paddle.Tensor],
+ List[np.ndarray],
+ ] = None,
+ mask_image: Union[
+ PIL.Image.Image,
+ paddle.Tensor,
+ np.ndarray,
+ List[PIL.Image.Image],
+ List[paddle.Tensor],
+ List[np.ndarray],
+ ] = None,
+ strength: float = 0.8,
+ prompt: Union[str, List[str]] = None,
+ num_inference_steps: int = 100,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ noise_level: int = 0,
+ clean_caption: bool = True,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -948,7 +959,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
@@ -965,11 +977,11 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clean_caption=clean_caption, )
+ clean_caption=clean_caption,
+ )
if do_classifier_free_guidance:
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
dtype = prompt_embeds.dtype
@@ -982,8 +994,7 @@ def __call__(
self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 5. prepare original image
original_image = self.preprocess_original_image(original_image)
@@ -994,16 +1005,13 @@ def __call__(
mask_image = mask_image.cast(dtype)
if mask_image.shape[0] == 1:
- mask_image = mask_image.repeat_interleave(
- batch_size * num_images_per_prompt, axis=0)
+ mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0)
else:
- mask_image = mask_image.repeat_interleave(
- num_images_per_prompt, axis=0)
+ mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0)
# 6. Prepare intermediate images
noise_timestep = timesteps[0:1]
- noise_timestep = noise_timestep.tile(
- (batch_size * num_images_per_prompt, ))
+ noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,))
intermediate_images = self.prepare_intermediate_images(
original_image,
@@ -1012,21 +1020,19 @@ def __call__(
num_images_per_prompt,
dtype,
mask_image,
- generator, )
+ generator,
+ )
# 7. Prepare upscaled image and noise level
_, _, height, width = original_image.shape
image = self.preprocess_image(image, num_images_per_prompt)
- upscaled = F.interpolate(
- image, (height, width), mode="bilinear", align_corners=True)
+ upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0])
- noise = randn_tensor(
- upscaled.shape, generator=generator, dtype=upscaled.dtype)
- upscaled = self.image_noising_scheduler.add_noise(
- upscaled, noise, timesteps=noise_level)
+ noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype)
+ upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
if do_classifier_free_guidance:
noise_level = paddle.concat([noise_level] * 2)
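
The superresolution variant additionally noises the bilinearly upscaled conditioning image at `noise_level` and doubles that level when classifier-free guidance doubles the batch. A NumPy sketch of the shape bookkeeping, using a made-up ᾱ in place of the scheduler's `add_noise`:

import numpy as np

upscaled = np.zeros((2, 3, 64, 64))                 # stand-in for the F.interpolate output
noise_level = np.array([250] * upscaled.shape[0])   # one noising timestep per image
noise = np.random.randn(*upscaled.shape)

alpha_cumprod = 0.5                                 # placeholder for the scheduler's cumulative alpha
noisy = np.sqrt(alpha_cumprod) * upscaled + np.sqrt(1 - alpha_cumprod) * noise

do_classifier_free_guidance = True
if do_classifier_free_guidance:
    noise_level = np.concatenate([noise_level] * 2)  # matches the doubled model-input batch

print(noisy.shape, noise_level.shape)  # (2, 3, 64, 64) (4,)
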
@@ -1035,19 +1041,15 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
model_input = paddle.concat(
- [
- intermediate_images,
- upscaled.cast(intermediate_images.dtype)
- ],
- axis=1, )
-
- model_input = (paddle.concat([model_input] * 2)
- if do_classifier_free_guidance else model_input)
+ [intermediate_images, upscaled.cast(intermediate_images.dtype)],
+ axis=1,
+ )
+
+ model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input
model_input = self.scheduler.scale_model_input(model_input, t)
# predict the noise residual
@@ -1056,7 +1058,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
class_labels=noise_level,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
@@ -1064,37 +1067,31 @@ def __call__(
noise_pred_uncond, _ = noise_pred_uncond.split(
[
model_input.shape[1] // 2,
- noise_pred_uncond.shape[1] - model_input.shape[1] //
- 2,
+ noise_pred_uncond.shape[1] - model_input.shape[1] // 2,
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
model_input.shape[1] // 2,
- noise_pred_text.shape[1] - model_input.shape[1] //
- 2,
+ noise_pred_text.shape[1] - model_input.shape[1] // 2,
],
- axis=1, )
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
# compute the previous noisy sample x_t -> x_t-1
prev_intermediate_images = intermediate_images
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs
+ ).prev_sample
- intermediate_images = (
- 1 - mask_image
- ) * prev_intermediate_images + mask_image * intermediate_images
+ intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, intermediate_images)
@@ -1107,16 +1104,14 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 11. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
# 12. Convert to PIL
image = self.numpy_to_pil(image)
# 13. Apply watermark
if self.watermarker is not None:
- self.watermarker.apply_watermark(image,
- self.unet.config.sample_size)
+ self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pd":
nsfw_detected = None
watermark_detected = None
@@ -1127,8 +1122,7 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 11. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, nsfw_detected, watermark_detected)
@@ -1136,4 +1130,5 @@ def __call__(
return IFPipelineOutput(
images=image,
nsfw_detected=nsfw_detected,
- watermark_detected=watermark_detected, )
+ watermark_detected=watermark_detected,
+ )
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
index b2aa43abe1a5c..ce92083c54c1a 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
@@ -22,13 +22,18 @@
import paddle
import paddle.nn.functional as F
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel,
- T5Tokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
-from ...utils import (BACKENDS_MAPPING, is_bs4_available, is_ftfy_available,
- logging, randn_tensor, replace_example_docstring)
+from ...utils import (
+ BACKENDS_MAPPING,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import IFPipelineOutput
from .safety_checker import IFSafetyChecker
@@ -86,8 +91,8 @@ class IFSuperResolutionPipeline(DiffusionPipeline):
watermarker: Optional[IFWatermarker]
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" +
- "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa
+ r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ ) # noqa
_optional_components = [
"tokenizer",
@@ -98,16 +103,17 @@ class IFSuperResolutionPipeline(DiffusionPipeline):
]
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- unet: UNet2DConditionModel,
- scheduler: DDPMScheduler,
- image_noising_scheduler: DDPMScheduler,
- safety_checker: Optional[IFSafetyChecker],
- feature_extractor: Optional[CLIPImageProcessor],
- watermarker: Optional[IFWatermarker],
- requires_safety_checker: bool=True, ):
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ unet: UNet2DConditionModel,
+ scheduler: DDPMScheduler,
+ image_noising_scheduler: DDPMScheduler,
+ safety_checker: Optional[IFSafetyChecker],
+ feature_extractor: Optional[CLIPImageProcessor],
+ watermarker: Optional[IFWatermarker],
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -139,20 +145,19 @@ def __init__(
image_noising_scheduler=image_noising_scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- watermarker=watermarker, )
+ watermarker=watermarker,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format(
- "Setting `clean_caption=True`"))
+ logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
logger.warn("Setting `clean_caption` to False...")
clean_caption = False
@@ -179,11 +184,13 @@ def _clean_caption(self, caption):
caption = re.sub(
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
caption = re.sub(
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
- caption, ) # regex for urls
+ caption,
+ ) # regex for urls
# html:
caption = BeautifulSoup(caption, features="html.parser").text
@@ -210,7 +217,8 @@ def _clean_caption(self, caption):
caption = re.sub(
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
- caption, )
+ caption,
+ )
# normalize quotation marks to a single standard
caption = re.sub(r"[`´«»“”¨]", '"', caption)
@@ -237,15 +245,13 @@ def _clean_caption(self, caption):
# "123456.."
caption = re.sub(r"\b\d{6,}\b", "", caption)
# filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)",
- "", caption)
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
#
caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
- caption = re.sub(self.bad_punct_regex, r" ",
- caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
# this-is-my-cute-cat / this_is_my_cute_cat
@@ -263,13 +269,10 @@ def _clean_caption(self, caption):
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(
- r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "",
- caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ",
- caption) # j2d1a2a...
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -289,14 +292,15 @@ def _clean_caption(self, caption):
@paddle.no_grad()
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
- self,
- prompt,
- do_classifier_free_guidance=True,
- num_images_per_prompt=1,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- clean_caption: bool=False, ):
+ self,
+ prompt,
+ do_classifier_free_guidance=True,
+ num_images_per_prompt=1,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ clean_caption: bool = False,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -323,7 +327,8 @@ def encode_prompt(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -336,32 +341,32 @@ def encode_prompt(
max_length = 77
if prompt_embeds is None:
- prompt = self._text_preprocessing(
- prompt, clean_caption=clean_caption)
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=max_length,
truncation=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, max_length - 1:-1])
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {max_length} tokens: {removed_text}")
+ f" {max_length} tokens: {removed_text}"
+ )
attention_mask = text_inputs.attention_mask
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
if self.text_encoder is not None:
@@ -376,8 +381,7 @@ def encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -390,12 +394,12 @@ def encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
- uncond_tokens = self._text_preprocessing(
- uncond_tokens, clean_caption=clean_caption)
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
@@ -404,12 +408,14 @@ def encode_prompt(
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
attention_mask = uncond_input.attention_mask
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
@@ -419,10 +425,8 @@ def encode_prompt(
if dtype is not None:
negative_prompt_embeds = negative_prompt_embeds.cast(dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
@@ -435,11 +439,11 @@ def encode_prompt(
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, nsfw_detected, watermark_detected = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(dtype),
+ )
else:
nsfw_detected = None
watermark_detected = None
@@ -453,49 +457,47 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- batch_size,
- noise_level,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ image,
+ batch_size,
+ noise_level,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -508,10 +510,10 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
- if (noise_level < 0 or noise_level >=
- self.image_noising_scheduler.config.num_train_timesteps):
+ if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
raise ValueError(
f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})"
)
@@ -521,12 +523,15 @@ def check_inputs(
else:
check_image_type = image
- if (not isinstance(check_image_type, paddle.Tensor) and
- not isinstance(check_image_type, PIL.Image.Image) and
- not isinstance(check_image_type, np.ndarray)):
+ if (
+ not isinstance(check_image_type, paddle.Tensor)
+ and not isinstance(check_image_type, PIL.Image.Image)
+ and not isinstance(check_image_type, np.ndarray)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
- f" {type(check_image_type)}")
+ f" {type(check_image_type)}"
+ )
if isinstance(image, list):
image_batch_size = len(image)
@@ -540,13 +545,10 @@ def check_inputs(
assert False
if batch_size != image_batch_size:
- raise ValueError(
- f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}"
- )
+ raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images
- def prepare_intermediate_images(self, batch_size, num_channels, height,
- width, dtype, generator):
+ def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator):
shape = (batch_size, num_channels, height, width)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
@@ -554,8 +556,7 @@ def prepare_intermediate_images(self, batch_size, num_channels, height,
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
- intermediate_images = randn_tensor(
- shape, generator=generator, dtype=dtype)
+ intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype)
# scale the initial noise by the standard deviation required by the scheduler
intermediate_images = intermediate_images * self.scheduler.init_noise_sigma
@@ -584,8 +585,7 @@ def preprocess_image(self, image, num_images_per_prompt):
elif dims == 4:
image = paddle.concat(image, axis=0)
else:
- raise ValueError(
- f"Image must have 3 or 4 dimensions, instead got {dims}")
+ raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}")
image = image.cast(self.unet.dtype)
@@ -596,28 +596,28 @@ def preprocess_image(self, image, num_images_per_prompt):
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: int=None,
- width: int=None,
- image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor]=None,
- num_inference_steps: int=50,
- timesteps: List[int]=None,
- guidance_scale: float=4.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- noise_level: int=250,
- clean_caption: bool=True, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: int = None,
+ width: int = None,
+ image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor] = None,
+ num_inference_steps: int = 50,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ noise_level: int = 250,
+ clean_caption: bool = True,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -709,7 +709,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
@@ -729,11 +730,11 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clean_caption=clean_caption, )
+ clean_caption=clean_caption,
+ )
if do_classifier_free_guidance:
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
# 4. Prepare timesteps
if timesteps is not None:
@@ -752,39 +753,33 @@ def __call__(
height,
width,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Prepare upscaled image and noise level
image = self.preprocess_image(image, num_images_per_prompt)
- upscaled = F.interpolate(
- image, (height, width), mode="bilinear", align_corners=True)
+ upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0])
- noise = randn_tensor(
- upscaled.shape, generator=generator, dtype=upscaled.dtype)
- upscaled = self.image_noising_scheduler.add_noise(
- upscaled, noise, timesteps=noise_level)
+ noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype)
+ upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
if do_classifier_free_guidance:
noise_level = paddle.concat([noise_level] * 2)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
model_input = paddle.concat(
- [
- intermediate_images,
- upscaled.cast(intermediate_images.dtype)
- ],
- axis=1, )
-
- model_input = (paddle.concat([model_input] * 2)
- if do_classifier_free_guidance else model_input)
+ [intermediate_images, upscaled.cast(intermediate_images.dtype)],
+ axis=1,
+ )
+
+ model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input
model_input = self.scheduler.scale_model_input(model_input, t)
# predict the noise residual
@@ -794,7 +789,8 @@ def __call__(
encoder_hidden_states=prompt_embeds,
class_labels=noise_level,
cross_attention_kwargs=cross_attention_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -802,21 +798,19 @@ def __call__(
noise_pred_uncond, _ = noise_pred_uncond.split(
[
model_input.shape[1] // 2,
- noise_pred_uncond.shape[1] - model_input.shape[1] //
- 2,
+ noise_pred_uncond.shape[1] - model_input.shape[1] // 2,
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
model_input.shape[1] // 2,
- noise_pred_text.shape[1] - model_input.shape[1] //
- 2,
+ noise_pred_text.shape[1] - model_input.shape[1] // 2,
],
- axis=1, )
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
@@ -824,12 +818,11 @@ def __call__(
t,
intermediate_images,
**extra_step_kwargs,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, intermediate_images)
@@ -842,16 +835,14 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 10. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
# 11. Convert to PIL
image = self.numpy_to_pil(image)
# 12. Apply watermark
if self.watermarker is not None:
- self.watermarker.apply_watermark(image,
- self.unet.config.sample_size)
+ self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pd":
nsfw_detected = None
watermark_detected = None
@@ -862,8 +853,7 @@ def __call__(
image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy()
# 10. Run safety checker
- image, nsfw_detected, watermark_detected = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, nsfw_detected, watermark_detected)
@@ -871,4 +861,5 @@ def __call__(
return IFPipelineOutput(
images=image,
nsfw_detected=nsfw_detected,
- watermark_detected=watermark_detected, )
+ watermark_detected=watermark_detected,
+ )
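A quick sketch (not part of the patch) of the guidance step that the hunks above only reformat: the IF super-resolution UNet predicts noise plus a learned variance stacked on the channel axis, and the doubled batch is assumed to hold the unconditional and text-conditioned halves in that order.

    import paddle

    def guided_noise(noise_pred, model_input, guidance_scale):
        # split the doubled batch back into its unconditional / text-conditioned halves
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        c = model_input.shape[1] // 2  # channels that belong to the noise estimate
        noise_pred_uncond, _ = noise_pred_uncond.split([c, noise_pred_uncond.shape[1] - c], axis=1)
        noise_pred_text, predicted_variance = noise_pred_text.split([c, noise_pred_text.shape[1] - c], axis=1)
        # classifier-free guidance, then re-attach the predicted variance for the scheduler
        guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
        return paddle.concat([guided, predicted_variance], axis=1)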
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py
index 8fcd1ab740f28..e4f32ce9b69a9 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py
@@ -15,8 +15,11 @@
import numpy as np
import paddle
import paddle.nn as nn
-from paddlenlp.transformers import (CLIPConfig, CLIPVisionModelWithProjection,
- PretrainedModel)
+from paddlenlp.transformers import (
+ CLIPConfig,
+ CLIPVisionModelWithProjection,
+ PretrainedModel,
+)
from ...utils import logging
@@ -46,7 +49,8 @@ def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5):
if any(nsfw_detected):
logger.warning(
"Potential NSFW content was detected in one or more images. A black image will be returned instead."
- " Try again with a different prompt and/or seed.")
+ " Try again with a different prompt and/or seed."
+ )
for idx, nsfw_detected_ in enumerate(nsfw_detected):
if nsfw_detected_:
@@ -60,7 +64,8 @@ def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5):
if any(watermark_detected):
logger.warning(
"Potential watermarked content was detected in one or more images. A black image will be returned instead."
- " Try again with a different prompt and/or seed.")
+ " Try again with a different prompt and/or seed."
+ )
for idx, watermark_detected_ in enumerate(watermark_detected):
if watermark_detected_:
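For context on the warning text reformatted above: when the checker flags an image, a black image of the same shape is returned in its place. A minimal sketch of that pattern follows; the actual replacement code sits outside this hunk, so the mechanism shown here is an assumption.

    import numpy as np

    def blank_flagged(images, flags):
        # hypothetical helper mirroring the behaviour the warnings describe
        for idx, flagged in enumerate(flags):
            if flagged:
                images[idx] = np.zeros_like(images[idx])
        return images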
diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py
index 998eb357d858a..ad156baf5b46f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py
+++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py
@@ -29,8 +29,8 @@ def __init__(self):
self.register_buffer(
"watermark_image",
- paddle.zeros(
- (62, 62, 4), dtype=paddle.get_default_dtype()), )
+ paddle.zeros((62, 62, 4), dtype=paddle.get_default_dtype()),
+ )
self.watermark_image_as_pil = None
def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
@@ -45,9 +45,8 @@ def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w)
S1, S2 = 1024**2, img_w * img_h
- K = (S2 / S1)**0.5
- wm_size, wm_x, wm_y = int(K *
- 62), img_w - int(14 * K), img_h - int(14 * K)
+ K = (S2 / S1) ** 0.5
+ wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K)
if self.watermark_image_as_pil is None:
watermark_image = self.watermark_image.cpu().numpy().astype("uint8")
@@ -55,12 +54,14 @@ def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
self.watermark_image_as_pil = watermark_image
wm_img = self.watermark_image_as_pil.resize(
- (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+ (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None
+ )
for pil_img in images:
pil_img.paste(
wm_img,
box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y),
- mask=wm_img.split()[-1], )
+ mask=wm_img.split()[-1],
+ )
return images
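The watermark sizing arithmetic above is easier to follow with numbers plugged in. A worked example (not part of the patch), assuming coef >= 1 so a 512x512 image keeps its original size:

    S1, S2 = 1024**2, 512 * 512
    K = (S2 / S1) ** 0.5                                  # 0.5
    wm_size = int(K * 62)                                 # 31 px watermark
    wm_x, wm_y = 512 - int(14 * K), 512 - int(14 * K)     # 505, 505 -> 7 px in from the bottom-right corner
    # paste box (wm_x - wm_size, wm_y - wm_size, wm_x, wm_y) == (474, 474, 505, 505)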
diff --git a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py
index faf4f122a123f..ff5d4541cde55 100644
--- a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py
+++ b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py
@@ -44,14 +44,14 @@ class DiTPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- transformer: Transformer2DModel,
- vae: AutoencoderKL,
- scheduler: KarrasDiffusionSchedulers,
- id2label: Optional[Dict[int, str]]=None, ):
+ self,
+ transformer: Transformer2DModel,
+ vae: AutoencoderKL,
+ scheduler: KarrasDiffusionSchedulers,
+ id2label: Optional[Dict[int, str]] = None,
+ ):
super().__init__()
- self.register_modules(
- transformer=transformer, vae=vae, scheduler=scheduler)
+ self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler)
# create a imagenet -> id dictionary for easier use
self.labels = {}
@@ -88,14 +88,14 @@ def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
@paddle.no_grad()
def __call__(
- self,
- class_labels: List[int],
- guidance_scale: float=4.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- num_inference_steps: int=50,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ class_labels: List[int],
+ guidance_scale: float = 4.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ num_inference_steps: int = 50,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ) -> Union[ImagePipelineOutput, Tuple]:
r"""
Function invoked when calling the pipeline for generation.
@@ -123,24 +123,22 @@ def __call__(
latents = randn_tensor(
shape=(batch_size, latent_channels, latent_size, latent_size),
generator=generator,
- dtype=self.transformer.dtype, )
- latent_model_input = (paddle.concat([latents] * 2)
- if guidance_scale > 1 else latents)
+ dtype=self.transformer.dtype,
+ )
+ latent_model_input = paddle.concat([latents] * 2) if guidance_scale > 1 else latents
class_labels = paddle.to_tensor(class_labels).flatten()
class_null = paddle.to_tensor([1000] * batch_size)
- class_labels_input = (paddle.concat([class_labels, class_null], 0)
- if guidance_scale > 1 else class_labels)
+ class_labels_input = paddle.concat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels
# set step values
self.scheduler.set_timesteps(num_inference_steps)
for t in self.progress_bar(self.scheduler.timesteps):
if guidance_scale > 1:
- half = latent_model_input[:len(latent_model_input) // 2]
+ half = latent_model_input[: len(latent_model_input) // 2]
latent_model_input = paddle.concat([half, half], axis=0)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
timesteps = t
if not paddle.is_tensor(timesteps):
@@ -154,22 +152,25 @@ def __call__(
elif len(timesteps.shape) == 0:
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps.expand([latent_model_input.shape[0], ])
+ timesteps = timesteps.expand(
+ [
+ latent_model_input.shape[0],
+ ]
+ )
# predict noise model_output
noise_pred = self.transformer(
- latent_model_input,
- timestep=timesteps,
- class_labels=class_labels_input).sample
+ latent_model_input, timestep=timesteps, class_labels=class_labels_input
+ ).sample
# perform guidance
if guidance_scale > 1:
eps, rest = (
noise_pred[:, :latent_channels],
- noise_pred[:, latent_channels:], )
+ noise_pred[:, latent_channels:],
+ )
bs = eps.shape[0]
# TODO torch.split vs paddle.split
- cond_eps, uncond_eps = paddle.split(
- eps, [bs // 2, bs - bs // 2], axis=0)
+ cond_eps, uncond_eps = paddle.split(eps, [bs // 2, bs - bs // 2], axis=0)
half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
eps = paddle.concat([half_eps, half_eps], axis=0)
@@ -182,13 +183,13 @@ def __call__(
model_output, _ = paddle.split(
noise_pred,
[latent_channels, noise_pred.shape[1] - latent_channels],
- axis=1, )
+ axis=1,
+ )
else:
model_output = noise_pred
# compute previous image: x_t -> x_t-1
- latent_model_input = self.scheduler.step(
- model_output, t, latent_model_input).prev_sample
+ latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample
if guidance_scale > 1:
latents, _ = latent_model_input.chunk(2, axis=0)
@@ -207,6 +208,6 @@ def __call__(
samples = self.numpy_to_pil(samples)
if not return_dict:
- return (samples, )
+ return (samples,)
return ImagePipelineOutput(images=samples)
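A usage sketch for the DiTPipeline touched above; only `get_label_ids` and the `__call__` arguments come from this file, while the checkpoint id and label are illustrative assumptions.

    from ppdiffusers import DiTPipeline

    pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256")   # assumed checkpoint id
    class_ids = pipe.get_label_ids(["golden retriever"])          # ImageNet name -> class id
    image = pipe(class_labels=class_ids, guidance_scale=4.0, num_inference_steps=50).images[0]
    image.save("dit_sample.png")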
diff --git a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py
index 8f75881eec2ef..9b672f9c0f8a5 100644
--- a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py
+++ b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py
@@ -26,18 +26,38 @@
from ..image_processor import VaeImageProcessor
from ..schedulers import (
- DDIMScheduler, DDPMScheduler, DEISMultistepScheduler,
- DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler,
- KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler,
- PreconfigEulerAncestralDiscreteScheduler, PreconfigLMSDiscreteScheduler,
- UniPCMultistepScheduler)
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ PreconfigEulerAncestralDiscreteScheduler,
+ PreconfigLMSDiscreteScheduler,
+ UniPCMultistepScheduler,
+)
from ..utils import (
- DIFFUSERS_CACHE, FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME,
- FROM_HF_HUB, HF_HUB_OFFLINE, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME,
- PPDIFFUSERS_CACHE, _add_variant, _get_model_file, is_fastdeploy_available,
- is_paddle_available, logging, randn_tensor)
+ DIFFUSERS_CACHE,
+ FASTDEPLOY_MODEL_NAME,
+ FASTDEPLOY_WEIGHTS_NAME,
+ FROM_HF_HUB,
+ HF_HUB_OFFLINE,
+ ONNX_EXTERNAL_WEIGHTS_NAME,
+ ONNX_WEIGHTS_NAME,
+ PPDIFFUSERS_CACHE,
+ _add_variant,
+ _get_model_file,
+ is_fastdeploy_available,
+ is_paddle_available,
+ logging,
+ randn_tensor,
+)
from ..version import VERSION as __version__
__all__ = ["FastDeployRuntimeModel", "FastDeployDiffusionPipelineMixin"]
@@ -54,9 +74,7 @@ def fdtensor2pdtensor(fdtensor: "fd.C.FDTensor"):
pdtensor = paddle.utils.dlpack.from_dlpack(dltensor)
return pdtensor
- def pdtensor2fdtensor(pdtensor: paddle.Tensor,
- name: str="",
- share_with_raw_ptr=False):
+ def pdtensor2fdtensor(pdtensor: paddle.Tensor, name: str = "", share_with_raw_ptr=False):
if not share_with_raw_ptr:
dltensor = paddle.utils.dlpack.to_dlpack(pdtensor)
return fd.C.FDTensor.from_dlpack(name, dltensor)
@@ -67,7 +85,8 @@ def pdtensor2fdtensor(pdtensor: paddle.Tensor,
pdtensor.shape,
pdtensor.dtype.name,
str(pdtensor.place),
- int(pdtensor.place.gpu_device_id()), )
+ int(pdtensor.place.gpu_device_id()),
+ )
logger = logging.get_logger(__name__)
@@ -88,7 +107,8 @@ def pdtensor2fdtensor(pdtensor: paddle.Tensor,
[^\\()\[\]:]+|
:
""",
- re.X, )
+ re.X,
+)
def parse_prompt_attention(text):
@@ -207,32 +227,20 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
tokens.append(text_token)
weights.append(text_weight)
if truncated:
- logger.warning(
- "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples"
- )
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
return tokens, weights
-def pad_tokens_and_weights(tokens,
- weights,
- max_length,
- bos,
- eos,
- pad,
- no_boseos_middle=True,
- chunk_length=77):
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
r"""
Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
"""
max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
- weights_length = (max_length if no_boseos_middle else
- max_embeddings_multiples * chunk_length)
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
for i in range(len(tokens)):
- tokens[i] = ([bos] + tokens[i] + [eos] + [pad] *
- (max_length - 2 - len(tokens[i])))
+ tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i]))
if no_boseos_middle:
- weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 -
- len(weights[i]))
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
else:
w = []
if len(weights[i]) == 0:
@@ -240,23 +248,21 @@ def pad_tokens_and_weights(tokens,
else:
for j in range(max_embeddings_multiples):
w.append(1.0) # weight for starting token in this chunk
- w += weights[i][j * (chunk_length - 2):min(
- len(weights[i]), (j + 1) * (chunk_length - 2))]
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
w.append(1.0) # weight for ending token in this chunk
w += [1.0] * (weights_length - len(w))
weights[i] = w[:]
# we must to tensor first!
- return paddle.to_tensor(
- tokens, dtype="int64"), paddle.to_tensor(
- weights, dtype="float32")
+ return paddle.to_tensor(tokens, dtype="int64"), paddle.to_tensor(weights, dtype="float32")
def get_unweighted_text_embeddings(
- pipe,
- text_input: paddle.Tensor,
- chunk_length: int,
- no_boseos_middle: Optional[bool]=True,
- infer_op=None, ):
+ pipe,
+ text_input: paddle.Tensor,
+ chunk_length: int,
+ no_boseos_middle: Optional[bool] = True,
+ infer_op=None,
+):
"""
When the length of tokens is a multiple of the capacity of the text encoder,
it should be split into chunks and sent to the text encoder individually.
@@ -267,8 +273,7 @@ def get_unweighted_text_embeddings(
text_embeddings = []
for i in range(max_embeddings_multiples):
# extract the i-th chunk
- text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * (
- chunk_length - 2) + 2].clone()
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
# cover the head and the tail by the starting and the ending tokens
text_input_chunk[:, 0] = text_input[0, 0]
@@ -282,7 +287,8 @@ def get_unweighted_text_embeddings(
text_embedding = pipe.text_encoder(
input_ids=text_input_chunk,
infer_op=infer_op,
- output_shape=output_shape, )[0]
+ output_shape=output_shape,
+ )[0]
if no_boseos_middle:
if i == 0:
# discard the ending token
@@ -305,20 +311,22 @@ def get_unweighted_text_embeddings(
text_embeddings = pipe.text_encoder(
input_ids=text_input,
infer_op=infer_op,
- output_shape=output_shape, )[0]
+ output_shape=output_shape,
+ )[0]
return text_embeddings
def get_weighted_text_embeddings(
- pipe,
- prompt: Union[str, List[str]],
- uncond_prompt: Optional[Union[str, List[str]]]=None,
- max_embeddings_multiples: Optional[int]=1,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- infer_op=None,
- **kwargs, ):
+ pipe,
+ prompt: Union[str, List[str]],
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
+ max_embeddings_multiples: Optional[int] = 1,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ infer_op=None,
+ **kwargs,
+):
r"""
Prompts can be assigned with local weights using brackets. For example,
prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
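For reference, the bracket syntax parsed by `parse_prompt_attention` and consumed by `get_weighted_text_embeddings`, plus a call sketch (pipeline construction assumed); the exact multipliers follow the usual long-prompt-weighting convention and are not restated in this hunk.

    # "a (very beautiful) masterpiece"      -> "very beautiful" weighted up (typically ~1.1x)
    # "a (very beautiful:1.3) masterpiece"  -> explicit weight 1.3
    # "a [cluttered] scene"                 -> "cluttered" weighted down (typically ~1/1.1)
    text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
        pipe,                                    # a FastDeploy pipeline exposing tokenizer + text_encoder
        prompt="a (very beautiful:1.3) masterpiece",
        uncond_prompt="low quality",
        max_embeddings_multiples=3,
    )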
@@ -342,24 +350,19 @@ def get_weighted_text_embeddings(
skip_weighting (`bool`, *optional*, defaults to `False`):
Skip the weighting. When the parsing is skipped, it is forced True.
"""
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
if isinstance(prompt, str):
prompt = [prompt]
if not skip_parsing:
- prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt,
- max_length - 2)
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
if uncond_prompt is not None:
if isinstance(uncond_prompt, str):
uncond_prompt = [uncond_prompt]
- uncond_tokens, uncond_weights = get_prompts_with_weights(
- pipe, uncond_prompt, max_length - 2)
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
else:
prompt_tokens = [
- token[1:-1]
- for token in pipe.tokenizer(
- prompt, max_length=max_length, truncation=True).input_ids
+ token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
]
prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
if uncond_prompt is not None:
@@ -367,33 +370,26 @@ def get_weighted_text_embeddings(
uncond_prompt = [uncond_prompt]
uncond_tokens = [
token[1:-1]
- for token in pipe.tokenizer(
- uncond_prompt, max_length=max_length, truncation=True)
- .input_ids
+ for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
]
uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
# round up the longest length of tokens to a multiple of (model_max_length - 2)
max_length = max([len(token) for token in prompt_tokens])
if uncond_prompt is not None:
- max_length = max(max_length,
- max([len(token) for token in uncond_tokens]))
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
max_embeddings_multiples = min(
max_embeddings_multiples,
- (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, )
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+ )
max_embeddings_multiples = max(1, max_embeddings_multiples)
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
# pad the length of tokens and weights
# support bert tokenizer
- bos = (pipe.tokenizer.bos_token_id
- if pipe.tokenizer.bos_token_id is not None else
- pipe.tokenizer.cls_token_id)
- eos = (pipe.tokenizer.eos_token_id
- if pipe.tokenizer.eos_token_id is not None else
- pipe.tokenizer.sep_token_id)
+ bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id
+ eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id
pad = pipe.tokenizer.pad_token_id
prompt_tokens, prompt_weights = pad_tokens_and_weights(
@@ -404,7 +400,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
if uncond_prompt is not None:
uncond_tokens, uncond_weights = pad_tokens_and_weights(
uncond_tokens,
@@ -414,35 +411,34 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
# get the embeddings
text_embeddings = get_unweighted_text_embeddings(
pipe,
prompt_tokens,
pipe.tokenizer.model_max_length,
no_boseos_middle=no_boseos_middle,
- infer_op=infer_op, )
+ infer_op=infer_op,
+ )
if uncond_prompt is not None:
uncond_embeddings = get_unweighted_text_embeddings(
pipe,
uncond_tokens,
pipe.tokenizer.model_max_length,
no_boseos_middle=no_boseos_middle,
- infer_op=infer_op, )
+ infer_op=infer_op,
+ )
# assign weights to the prompts and normalize in the sense of mean
# TODO: should we normalize by chunk or in a whole (current implementation)?
if (not skip_parsing) and (not skip_weighting):
previous_mean = text_embeddings.mean(axis=[-2, -1])
text_embeddings *= prompt_weights.unsqueeze(-1)
- text_embeddings *= (
- (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1)
- .unsqueeze(-1))
+ text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
previous_mean = uncond_embeddings.mean(axis=[-2, -1])
uncond_embeddings *= uncond_weights.unsqueeze(-1)
- uncond_embeddings *= (
- (previous_mean / uncond_embeddings.mean(axis=[-2, -1]))
- .unsqueeze(-1).unsqueeze(-1))
+ uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
return text_embeddings, uncond_embeddings
@@ -459,8 +455,7 @@ def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs):
continue
module = getattr(self, name)
if isinstance(module, FastDeployRuntimeModel):
- infer_op = (infer_op_dict.get(name, "zero_copy_infer")
- if module.is_spport_zero_copy() else "raw")
+ infer_op = infer_op_dict.get(name, "zero_copy_infer") if module.is_spport_zero_copy() else "raw"
# if parse_prompt_type in ["lpw", "webui"] and name in ["text_encoder"]:
# if infer_op != "raw":
# logger.warning(
@@ -470,19 +465,16 @@ def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs):
new_infer_op_dict[name] = infer_op
return new_infer_op_dict
- def post_init(self,
- vae_scaling_factor=0.18215,
- vae_scale_factor=8,
- dtype="float32"):
+ def post_init(self, vae_scaling_factor=0.18215, vae_scale_factor=8, dtype="float32"):
self.vae_scaling_factor = vae_scaling_factor
self.vae_scale_factor = vae_scale_factor
- self.image_processor = VaeImageProcessor(
- vae_scale_factor=vae_scale_factor, do_convert_rgb=True)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_convert_rgb=True)
self.control_image_processor = VaeImageProcessor(
vae_scale_factor=self.vae_scale_factor,
do_convert_rgb=True,
- do_normalize=False, )
+ do_normalize=False,
+ )
self.dtype = dtype
self.supported_scheduler = [
"pndm",
@@ -533,53 +525,44 @@ def text_encoder_hidden_states_dim(self):
def change_scheduler(self, scheduler_type="ddim", inplace=True):
scheduler_type = scheduler_type.lower()
if scheduler_type == "pndm":
- scheduler = PNDMScheduler.from_config(
- self.orginal_scheduler_config, skip_prk_steps=True)
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
elif scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "preconfig-lms":
- scheduler = PreconfigLMSDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = PreconfigLMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "heun":
- scheduler = HeunDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "euler":
- scheduler = EulerDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "preconfig-euler-ancestral":
- scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "dpm-multi":
- scheduler = DPMSolverMultistepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "dpm-single":
- scheduler = DPMSolverSinglestepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "kdpm2-ancestral":
- scheduler = KDPM2AncestralDiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "kdpm2":
- scheduler = KDPM2DiscreteScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "unipc-multi":
- scheduler = UniPCMultistepScheduler.from_config(
- self.orginal_scheduler_config)
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
elif scheduler_type == "ddim":
scheduler = DDIMScheduler.from_config(
self.orginal_scheduler_config,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
elif scheduler_type == "ddpm":
- scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config,
- )
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
elif scheduler_type == "deis-multi":
scheduler = DEISMultistepScheduler.from_config(
- self.orginal_scheduler_config, )
+ self.orginal_scheduler_config,
+ )
else:
raise ValueError(
f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!"
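Usage sketch for the helper above: any name from `self.supported_scheduler` rebuilds the scheduler from the pipeline's original config (construction of `pipe` itself is assumed).

    pipe.change_scheduler("euler-ancestral")   # in-place by default; "dpm-multi", "ddim", "lms", ... also work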
@@ -590,16 +573,13 @@ def change_scheduler(self, scheduler_type="ddim", inplace=True):
def get_timesteps(self, num_inference_steps, strength=1.0):
if strength >= 1:
- return self.scheduler.timesteps.cast(
- self.dtype), num_inference_steps
+ return self.scheduler.timesteps.cast(self.dtype), num_inference_steps
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[
- t_start * self.scheduler.order:].cast(self.dtype)
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].cast(self.dtype)
if hasattr(self.scheduler, "step_index_offset"):
self.scheduler.step_index_offset = t_start * self.scheduler.order
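A worked example of the strength-based truncation above (not part of the patch):

    num_inference_steps, strength = 50, 0.7
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)   # 35
    t_start = max(num_inference_steps - init_timestep, 0)                           # 15
    # timesteps[t_start * scheduler.order :] keeps only the last 35 scheduled steps
    # (scheduler.order is 1 for most schedulers); strength >= 1 keeps the full schedule.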
@@ -615,24 +595,24 @@ def get_timesteps(self, num_inference_steps, strength=1.0):
return timesteps, num_inference_steps
def prepare_controlnet_cond(
- self,
- controlnet_cond,
- controlnet_conditioning_scale,
- width,
- height,
- batch_size,
- num_images_per_prompt,
- do_classifier_free_guidance=False, ):
+ self,
+ controlnet_cond,
+ controlnet_conditioning_scale,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ do_classifier_free_guidance=False,
+ ):
control_image = self.control_image_processor.preprocess(
controlnet_cond,
height=height,
- width=width, )
+ width=width,
+ )
if isinstance(controlnet_conditioning_scale, (float, int)):
- controlnet_conditioning_scale = paddle.to_tensor(
- [controlnet_conditioning_scale] * 13, dtype=self.dtype)
+ controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=self.dtype)
elif isinstance(controlnet_conditioning_scale, (list, tuple)):
- controlnet_conditioning_scale = paddle.to_tensor(
- controlnet_conditioning_scale, dtype=self.dtype)
+ controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=self.dtype)
else:
raise ValueError(
f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}"
@@ -650,40 +630,40 @@ def prepare_controlnet_cond(
return control_image, controlnet_conditioning_scale
def check_inputs(
- self,
- prompt,
- height=512,
- width=512,
- callback_steps=1,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None,
- strength=1.0, ):
+ self,
+ prompt,
+ height=512,
+ width=512,
+ callback_steps=1,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ strength=1.0,
+ ):
if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
raise ValueError(
f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
)
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -696,24 +676,25 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
def prepare_latents(
- self,
- batch_size,
- height,
- width,
- generator,
- latents=None,
- image=None,
- timestep=None,
- is_strength_max=True,
- return_noise=False,
- return_image_latents=False,
- infer_op=None, ):
+ self,
+ batch_size,
+ height,
+ width,
+ generator,
+ latents=None,
+ image=None,
+ timestep=None,
+ is_strength_max=True,
+ return_noise=False,
+ return_image_latents=False,
+ infer_op=None,
+ ):
shape = [
batch_size,
self.vae_decoder_num_latent_channels,
@@ -739,46 +720,44 @@ def prepare_latents(
if latents is None:
noise = randn_tensor(shape, generator=generator, dtype=self.dtype)
# if strength is 1. then initialise the latents to noise, else initial to image + noise
- latents = (noise if is_strength_max else
- self.scheduler.add_noise(image_latents, noise, timestep))
+ latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
# if pure noise then scale the initial latents by the Scheduler's init sigma
- latents = (latents * self.scheduler.init_noise_sigma
- if is_strength_max else latents)
+ latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
else:
noise = latents
if str(noise.dtype).replace("paddle.", "") != self.dtype:
noise = noise.cast(self.dtype)
latents = noise * self.scheduler.init_noise_sigma
- outputs = (latents, )
+ outputs = (latents,)
if return_noise:
- outputs += (noise, )
+ outputs += (noise,)
if return_image_latents:
- outputs += (image_latents, )
+ outputs += (image_latents,)
if len(outputs) == 1:
outputs = latents
return outputs
def prepare_mask_latents(
- self,
- mask,
- masked_image,
- batch_size,
- height,
- width,
- do_classifier_free_guidance,
- return_masked_image_latents=True,
- infer_op=None, ):
+ self,
+ mask,
+ masked_image,
+ batch_size,
+ height,
+ width,
+ do_classifier_free_guidance,
+ return_masked_image_latents=True,
+ infer_op=None,
+ ):
# resize the mask to latents shape as we concatenate the mask to the latents
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
# and half precision
mask = paddle.nn.functional.interpolate(
- mask,
- size=(height // self.vae_scale_factor,
- width // self.vae_scale_factor))
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+ )
mask = mask.cast(dtype=self.dtype)
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
@@ -791,8 +770,7 @@ def prepare_mask_latents(
)
mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1])
- mask = paddle.concat([mask] *
- 2) if do_classifier_free_guidance else mask
+ mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask
if not return_masked_image_latents:
return mask
@@ -805,20 +783,18 @@ def prepare_mask_latents(
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
- masked_image_latents = masked_image_latents.tile(
- [batch_size // masked_image_latents.shape[0], 1, 1, 1])
+ masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1])
- masked_image_latents = (paddle.concat([masked_image_latents] * 2)
- if do_classifier_free_guidance else
- masked_image_latents)
+ masked_image_latents = (
+ paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
# aligning device to prevent device errors when concating it with the latent model input
masked_image_latents = masked_image_latents.cast(dtype=self.dtype)
return mask, masked_image_latents
def is_scheduler_support_step_index(self):
- kwargs_keys = set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys())
return "kwargs" in kwargs_keys or "step_index" in kwargs_keys
def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs):
@@ -832,14 +808,12 @@ def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs):
image_latents = self.vae_encoder(
sample=image,
infer_op=infer_op,
- output_shape=output_shape, )[0]
+ output_shape=output_shape,
+ )[0]
return self.vae_scaling_factor * image_latents
- def _decode_vae_latents(self,
- latents: paddle.Tensor,
- infer_op=None,
- **kwargs):
+ def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs):
latents_shape = latents.shape
output_shape = [
latents_shape[0],
@@ -850,22 +824,24 @@ def _decode_vae_latents(self,
images_vae = self.vae_decoder(
latent_sample=latents,
infer_op=infer_op,
- output_shape=output_shape, )[0]
+ output_shape=output_shape,
+ )[0]
return images_vae
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- infer_op=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- **kwargs, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ infer_op=None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ **kwargs,
+ ):
if parse_prompt_type == "lpw":
return self._encode_prompt_lpw(
prompt,
@@ -876,7 +852,8 @@ def _encode_prompt(
negative_prompt_embeds=negative_prompt_embeds,
max_embeddings_multiples=max_embeddings_multiples,
infer_op="raw", # NOTE: we can't use zero copy!
- **kwargs, )
+ **kwargs,
+ )
elif parse_prompt_type == "raw":
return self._encode_prompt_raw(
prompt,
@@ -885,22 +862,23 @@ def _encode_prompt(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- infer_op=infer_op, )
+ infer_op=infer_op,
+ )
elif parse_prompt_type == "webui":
- raise NotImplementedError(
- "`parse_prompt_type=webui` is not implemented yet.")
+ raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.")
def _encode_prompt_lpw(
- self,
- prompt: Union[str, List[str]],
- num_images_per_prompt: int,
- do_classifier_free_guidance: bool,
- negative_prompt: Union[str, List[str]],
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- infer_op=None,
- max_embeddings_multiples: Optional[int]=3,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ num_images_per_prompt: int,
+ do_classifier_free_guidance: bool,
+ negative_prompt: Union[str, List[str]],
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ infer_op=None,
+ max_embeddings_multiples: Optional[int] = 3,
+ **kwargs,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -930,18 +908,19 @@ def _encode_prompt_lpw(
if do_classifier_free_guidance:
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif prompt is not None and type(prompt) is not type(
- negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -951,37 +930,35 @@ def _encode_prompt_lpw(
uncond_prompt=uncond_tokens,
max_embeddings_multiples=max_embeddings_multiples,
infer_op=infer_op,
- **kwargs, )
+ **kwargs,
+ )
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def _encode_prompt_raw(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- infer_op=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ infer_op=None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -1018,21 +995,22 @@ def _encode_prompt_raw(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest",
- return_tensors="pd").input_ids # check
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids # check
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
prompt_embeds = self.text_encoder(
input_ids=text_input_ids,
@@ -1041,13 +1019,13 @@ def _encode_prompt_raw(
batch_size,
self.tokenizer.model_max_length,
self.text_encoder_hidden_states_dim,
- ], )[0]
+ ],
+ )[0]
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -1056,14 +1034,16 @@ def _encode_prompt_raw(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -1073,7 +1053,8 @@ def _encode_prompt_raw(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
negative_prompt_embeds = self.text_encoder(
input_ids=uncond_input.input_ids,
infer_op=infer_op,
@@ -1081,21 +1062,19 @@ def _encode_prompt_raw(
batch_size,
max_length,
self.text_encoder_hidden_states_dim,
- ], )[0]
+ ],
+ )[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -1104,17 +1083,15 @@ def run_safety_checker(self, image):
has_nsfw_concept = None
else:
if paddle.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(
- image, output_type="pil")
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
- feature_extractor_input = self.image_processor.numpy_to_pil(
- image)
- safety_checker_input = self.feature_extractor(
- feature_extractor_input, return_tensors="np")
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="np")
image, has_nsfw_concept = self.safety_checker(
images=image.numpy(),
clip_input=safety_checker_input.pixel_values.astype(self.dtype),
- infer_op="raw", )
+ infer_op="raw",
+ )
image = paddle.to_tensor(image, dtype=self.dtype)
return image, has_nsfw_concept
@@ -1124,15 +1101,13 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@@ -1140,9 +1115,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
class FastDeployRuntimeModel:
def __init__(self, model=None, **kwargs):
- logger.info(
- "`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future."
- )
+ logger.info("`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future.")
self.model = model
self.model_save_dir = kwargs.get("model_save_dir", None)
self.model_format = kwargs.get("model_format", None)
@@ -1171,11 +1144,12 @@ def is_spport_zero_copy(self):
return False
def zero_copy_infer(
- self,
- prebinded_inputs: dict,
- prebinded_outputs: dict,
- share_with_raw_ptr=True,
- **kwargs, ):
+ self,
+ prebinded_inputs: dict,
+ prebinded_outputs: dict,
+ share_with_raw_ptr=True,
+ **kwargs,
+ ):
"""
Execute inference without copying data from cpu to gpu.
@@ -1186,17 +1160,11 @@ def zero_copy_infer(
List of output tensor.
"""
for inputs_name, inputs_tensor in prebinded_inputs.items():
- input_fdtensor = pdtensor2fdtensor(
- inputs_tensor,
- inputs_name,
- share_with_raw_ptr=share_with_raw_ptr)
+ input_fdtensor = pdtensor2fdtensor(inputs_tensor, inputs_name, share_with_raw_ptr=share_with_raw_ptr)
self.model.bind_input_tensor(inputs_name, input_fdtensor)
for outputs_name, outputs_tensor in prebinded_outputs.items():
- output_fdtensor = pdtensor2fdtensor(
- outputs_tensor,
- outputs_name,
- share_with_raw_ptr=share_with_raw_ptr)
+ output_fdtensor = pdtensor2fdtensor(outputs_tensor, outputs_name, share_with_raw_ptr=share_with_raw_ptr)
self.model.bind_output_tensor(outputs_name, output_fdtensor)
self.model.zero_copy_infer()
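How the zero-copy path above is driven in practice, as a sketch: `runtime_model` stands for a loaded FastDeployRuntimeModel, the input name and tensor shapes are assumptions, and a GPU-enabled fastdeploy runtime is required. Both tensors are bound via DLPack, so inference writes straight into `output` without a host copy.

    import paddle

    latent = paddle.randn([1, 4, 64, 64], dtype="float32")
    output = paddle.zeros([1, 4, 64, 64], dtype="float32")
    runtime_model.zero_copy_infer(
        prebinded_inputs={"sample": latent},                                      # assumed input name
        prebinded_outputs={runtime_model.model.get_output_info(0).name: output},
        share_with_raw_ptr=True,
    )
    # `output` now holds the inference result without any CPU round trip.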
@@ -1222,25 +1190,27 @@ def __call__(self, **kwargs):
self.zero_copy_infer(
prebinded_inputs=inputs,
prebinded_outputs={self.model.get_output_info(0).name: output},
- share_with_raw_ptr=share_with_raw_ptr, )
- return [output, ]
+ share_with_raw_ptr=share_with_raw_ptr,
+ )
+ return [
+ output,
+ ]
elif infer_op == "raw":
inputs = {}
for k, v in kwargs.items():
if paddle.is_tensor(v):
v = v.numpy()
inputs[k] = np.array(v)
- return [
- paddle.to_tensor(output) for output in self.model.infer(inputs)
- ]
+ return [paddle.to_tensor(output) for output in self.model.infer(inputs)]
else:
raise ValueError("Unknown infer_op {}".format(infer_op))
@staticmethod
def load_model(
- model_path: Union[str, Path],
- params_path: Union[str, Path]=None,
- runtime_options: Optional["fd.RuntimeOption"]=None, ):
+ model_path: Union[str, Path],
+ params_path: Union[str, Path] = None,
+ runtime_options: Optional["fd.RuntimeOption"] = None,
+ ):
"""
        Loads a FastDeploy Inference Model with fastdeploy.RuntimeOption
@@ -1255,9 +1225,7 @@ def load_model(
"""
option = runtime_options
if option is None or not isinstance(runtime_options, fd.RuntimeOption):
- logger.info(
- "No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend."
- )
+ logger.info("No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend.")
option = fd.RuntimeOption()
option.use_paddle_backend()
option.use_cpu()
@@ -1275,11 +1243,12 @@ def load_model(
return fd.Runtime(option)
def _save_pretrained(
- self,
- save_directory: Union[str, Path],
- model_file_name: Optional[str]=None,
- params_file_name: Optional[str]=None,
- **kwargs, ):
+ self,
+ save_directory: Union[str, Path],
+ model_file_name: Optional[str] = None,
+ params_file_name: Optional[str] = None,
+ **kwargs,
+ ):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
[`~FastDeployRuntimeModel.from_pretrained`] class method. It will always save the
@@ -1296,11 +1265,14 @@ def _save_pretrained(
model with a different name.
"""
is_onnx_model = self.model_format == ModelFormat.ONNX
- model_file_name = (model_file_name if model_file_name is not None else
- FASTDEPLOY_MODEL_NAME
- if not is_onnx_model else ONNX_WEIGHTS_NAME)
- params_file_name = (params_file_name if params_file_name is not None
- else FASTDEPLOY_WEIGHTS_NAME)
+ model_file_name = (
+ model_file_name
+ if model_file_name is not None
+ else FASTDEPLOY_MODEL_NAME
+ if not is_onnx_model
+ else ONNX_WEIGHTS_NAME
+ )
+ params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME
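
The chained conditional above is right-associative, which can be hard to read after reformatting. A hedged sketch of the equivalent if/else (the filename constants are assumed values, for illustration only):

```python
FASTDEPLOY_MODEL_NAME = "inference.pdmodel"  # assumed default names
ONNX_WEIGHTS_NAME = "model.onnx"


def resolve_model_file_name(model_file_name, is_onnx_model):
    if model_file_name is not None:  # an explicit name always wins
        return model_file_name
    if not is_onnx_model:  # Paddle export
        return FASTDEPLOY_MODEL_NAME
    return ONNX_WEIGHTS_NAME  # ONNX export


assert resolve_model_file_name(None, is_onnx_model=True) == "model.onnx"
assert resolve_model_file_name("custom.pdmodel", is_onnx_model=False) == "custom.pdmodel"
```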
src_model_path = self.model_save_dir.joinpath(self.latest_model_name)
dst_model_path = Path(save_directory).joinpath(model_file_name)
@@ -1312,19 +1284,16 @@ def _save_pretrained(
if is_onnx_model:
# copy external weights (for models >2GB)
- src_model_path = self.model_save_dir.joinpath(
- ONNX_EXTERNAL_WEIGHTS_NAME)
+ src_model_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
if src_model_path.exists():
- dst_model_path = Path(save_directory).joinpath(
- ONNX_EXTERNAL_WEIGHTS_NAME)
+ dst_model_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
try:
shutil.copyfile(src_model_path, dst_model_path)
except shutil.SameFileError:
pass
if not is_onnx_model:
- src_params_path = self.model_save_dir.joinpath(
- self.latest_params_name)
+ src_params_path = self.model_save_dir.joinpath(self.latest_params_name)
dst_params_path = Path(save_directory).joinpath(params_file_name)
try:
shutil.copyfile(src_params_path, dst_params_path)
@@ -1332,9 +1301,10 @@ def _save_pretrained(
pass
def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- **kwargs, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ **kwargs,
+ ):
"""
Save a model to a directory, so that it can be re-loaded using the [`~FastDeployRuntimeModel.from_pretrained`] class
        method.
@@ -1344,9 +1314,7 @@ def save_pretrained(
Directory to which to save. Will be created if it doesn't exist.
"""
if os.path.isfile(save_directory):
- logger.error(
- f"Provided path ({save_directory}) should be a directory, not a file"
- )
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
@@ -1356,23 +1324,24 @@ def save_pretrained(
@classmethod
def _from_pretrained(
- cls,
- pretrained_model_name_or_path: Union[str, Path],
- model_file_name: Optional[str]=None,
- params_file_name: Optional[str]=None,
- use_auth_token: Optional[Union[bool, str, None]]=None,
- revision: Optional[str]=None,
- subfolder: Optional[str]=None,
- force_download: bool=False,
- cache_dir: Optional[str]=None,
- runtime_options: Optional["fd.RuntimeOption"]=None,
- from_hf_hub: Optional[bool]=False,
- proxies: Optional[Dict]=None,
- resume_download: bool=False,
- local_files_only: bool=False,
- user_agent: Union[Dict, str, None]=None,
- is_onnx_model: bool=False,
- **kwargs, ):
+ cls,
+ pretrained_model_name_or_path: Union[str, Path],
+ model_file_name: Optional[str] = None,
+ params_file_name: Optional[str] = None,
+ use_auth_token: Optional[Union[bool, str, None]] = None,
+ revision: Optional[str] = None,
+ subfolder: Optional[str] = None,
+ force_download: bool = False,
+ cache_dir: Optional[str] = None,
+ runtime_options: Optional["fd.RuntimeOption"] = None,
+ from_hf_hub: Optional[bool] = False,
+ proxies: Optional[Dict] = None,
+ resume_download: bool = False,
+ local_files_only: bool = False,
+ user_agent: Union[Dict, str, None] = None,
+ is_onnx_model: bool = False,
+ **kwargs,
+ ):
"""
Load a model from a directory or the HF Hub.
@@ -1404,24 +1373,25 @@ def _from_pretrained(
kwargs will be passed to the model during initialization
"""
- model_file_name = (model_file_name if model_file_name is not None else
- FASTDEPLOY_MODEL_NAME
- if not is_onnx_model else ONNX_WEIGHTS_NAME)
- params_file_name = (params_file_name if params_file_name is not None
- else FASTDEPLOY_WEIGHTS_NAME)
+ model_file_name = (
+ model_file_name
+ if model_file_name is not None
+ else FASTDEPLOY_MODEL_NAME
+ if not is_onnx_model
+ else ONNX_WEIGHTS_NAME
+ )
+ params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME
kwargs["model_format"] = "ONNX" if is_onnx_model else "PADDLE"
# load model from local directory
if os.path.isdir(pretrained_model_name_or_path):
- model_path = os.path.join(pretrained_model_name_or_path,
- model_file_name)
- params_path = (
- None if is_onnx_model else
- os.path.join(pretrained_model_name_or_path, params_file_name))
+ model_path = os.path.join(pretrained_model_name_or_path, model_file_name)
+ params_path = None if is_onnx_model else os.path.join(pretrained_model_name_or_path, params_file_name)
model = FastDeployRuntimeModel.load_model(
model_path,
params_path,
- runtime_options=runtime_options, )
+ runtime_options=runtime_options,
+ )
kwargs["model_save_dir"] = Path(pretrained_model_name_or_path)
# load model from hub or paddle bos
else:
@@ -1437,7 +1407,8 @@ def _from_pretrained(
resume_download=resume_download,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
- user_agent=user_agent, )
+ user_agent=user_agent,
+ )
if is_onnx_model:
params_cache_path = None
kwargs["latest_params_name"] = None
@@ -1454,7 +1425,8 @@ def _from_pretrained(
resume_download=resume_download,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
- user_agent=user_agent, )
+ user_agent=user_agent,
+ )
kwargs["latest_params_name"] = Path(params_cache_path).name
kwargs["model_save_dir"] = Path(model_cache_path).parent
kwargs["latest_model_name"] = Path(model_cache_path).name
@@ -1462,21 +1434,24 @@ def _from_pretrained(
model = FastDeployRuntimeModel.load_model(
model_cache_path,
params_cache_path,
- runtime_options=runtime_options, )
+ runtime_options=runtime_options,
+ )
return cls(model=model, **kwargs)
@classmethod
def from_pretrained(
- cls,
- pretrained_model_name_or_path: Union[str, Path],
- model_file_name: Optional[str]=None,
- params_file_name: Optional[str]=None,
- runtime_options: Optional["fd.RuntimeOption"]=None,
- is_onnx_model: bool=False,
- **kwargs, ):
+ cls,
+ pretrained_model_name_or_path: Union[str, Path],
+ model_file_name: Optional[str] = None,
+ params_file_name: Optional[str] = None,
+ runtime_options: Optional["fd.RuntimeOption"] = None,
+ is_onnx_model: bool = False,
+ **kwargs,
+ ):
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
@@ -1508,4 +1483,5 @@ def from_pretrained(
local_files_only=local_files_only,
user_agent=user_agent,
is_onnx_model=is_onnx_model,
- **kwargs, )
+ **kwargs,
+ )
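
A hedged usage sketch for the loader above (fastdeploy must be installed; the directory layout is illustrative, and the top-level import mirrors the `ppdiffusers.FastDeployRuntimeModel` name mentioned in the log message):

```python
import fastdeploy as fd

from ppdiffusers import FastDeployRuntimeModel  # exposed name per the log message above

option = fd.RuntimeOption()
option.use_paddle_backend()  # same defaults the loader falls back to
option.use_cpu()

unet = FastDeployRuntimeModel.from_pretrained(
    "./exported_model/unet",  # assumed local dir with inference.pdmodel / .pdiparams
    runtime_options=option,
)
```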
diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py
index 0ebba5a459d49..dd119ef22d12e 100644
--- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py
@@ -15,9 +15,11 @@
# flake8: noqa
from ...utils import is_paddlenlp_available
-from .pipeline_latent_diffusion_superresolution import \
- LDMSuperResolutionPipeline
+from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline
if is_paddlenlp_available():
- from .pipeline_latent_diffusion import (LDMBertConfig, LDMBertModel,
- LDMTextToImagePipeline)
+ from .pipeline_latent_diffusion import (
+ LDMBertConfig,
+ LDMBertModel,
+ LDMTextToImagePipeline,
+ )
diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index f0d4f43308d80..e82dda6fe1de3 100644
--- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -19,16 +19,20 @@
import paddle
import paddle.nn as nn
-from paddlenlp.transformers import (PretrainedConfig, PretrainedModel,
- PretrainedTokenizer, register_base_model)
-from paddlenlp.transformers.model_outputs import \
- BaseModelOutputWithPoolingAndCrossAttentions
+from paddlenlp.transformers import (
+ PretrainedConfig,
+ PretrainedModel,
+ PretrainedTokenizer,
+ register_base_model,
+)
+from paddlenlp.transformers.model_outputs import (
+ BaseModelOutputWithPoolingAndCrossAttentions,
+)
from ...configuration_utils import FrozenDict
from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from ...utils import (deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
from ...utils.initializer_utils import normal_, zeros_
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -69,34 +73,29 @@ class LDMTextToImagePipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vqvae: Union[VQModel, AutoencoderKL],
- bert: PretrainedModel,
- tokenizer: PretrainedTokenizer,
- unet: Union[UNet2DModel, UNet2DConditionModel],
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler], ):
+ self,
+ vqvae: Union[VQModel, AutoencoderKL],
+ bert: PretrainedModel,
+ tokenizer: PretrainedTokenizer,
+ unet: Union[UNet2DModel, UNet2DConditionModel],
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -104,35 +103,25 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
if tokenizer.model_max_length > 77:
tokenizer.model_max_length = 77
- self.register_modules(
- vqvae=vqvae,
- bert=bert,
- tokenizer=tokenizer,
- unet=unet,
- scheduler=scheduler)
- self.vae_scale_factor = (
- 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1)
- )
+ self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+ self.vae_scale_factor = 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -168,21 +157,25 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because LDMBert can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- prompt_embeds = self.bert(text_input_ids, )
+ prompt_embeds = self.bert(
+ text_input_ids,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.bert.dtype)
@@ -190,8 +183,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -201,14 +193,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -218,28 +212,27 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- negative_prompt_embeds = self.bert(uncond_input.input_ids, )
+ negative_prompt_embeds = self.bert(
+ uncond_input.input_ids,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.bert.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.bert.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
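
The tail of `_encode_prompt` above tiles the embeddings once per requested image and stacks unconditional before conditional, so a single batched UNet forward covers both branches. A minimal paddle sketch with illustrative shapes:

```python
import paddle

bs, seq_len, dim, num_images_per_prompt = 2, 77, 1280, 3
prompt_embeds = paddle.randn([bs, seq_len, dim])
negative_prompt_embeds = paddle.randn([bs, seq_len, dim])

# one copy of each embedding per requested image
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
prompt_embeds = prompt_embeds.reshape([bs * num_images_per_prompt, seq_len, -1])
negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
negative_prompt_embeds = negative_prompt_embeds.reshape([bs * num_images_per_prompt, seq_len, -1])

# unconditional first, conditional second: one forward pass instead of two
prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
print(prompt_embeds.shape)  # [2 * bs * num_images_per_prompt, seq_len, dim] -> [12, 77, 1280]
```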
@@ -257,53 +250,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -316,17 +305,19 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -349,26 +340,25 @@ def prepare_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=256,
- width: Optional[int]=256,
- num_inference_steps: int=50,
- guidance_scale: float=1.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ) -> Union[
- Tuple, ImagePipelineOutput]:
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = 256,
+ width: Optional[int] = 256,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 1.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ) -> Union[Tuple, ImagePipelineOutput]:
r"""
Function invoked when calling the pipeline for generation.
@@ -443,7 +433,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -465,7 +456,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -480,43 +472,38 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -533,7 +520,7 @@ def __call__(
image = self.decode_latents(latents)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
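
The guidance line inside the loop above is plain extrapolation away from the unconditional prediction. A tiny numeric check (the values are made up):

```python
import paddle

noise_pred = paddle.to_tensor([[0.2, 0.2], [1.0, 3.0]])  # stacked [uncond; text] predictions
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
for guidance_scale in (1.0, 7.5):
    guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    print(guidance_scale, guided.numpy())
# scale 1.0 reproduces the text-conditional prediction; larger scales push further
# away from the unconditional one.
```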
@@ -554,25 +541,26 @@ class LDMBertConfig(PretrainedConfig):
}
def __init__(
- self,
- vocab_size=30522,
- max_position_embeddings=77,
- encoder_layers=32,
- encoder_ffn_dim=5120,
- encoder_attention_heads=8,
- head_dim=64,
- encoder_layerdrop=0.0,
- activation_function="gelu",
- d_model=1280,
- dropout=0.1,
- attention_dropout=0.0,
- activation_dropout=0.0,
- init_std=0.02,
- classifier_dropout=0.0,
- scale_embedding=False,
- use_cache=True,
- pad_token_id=0,
- **kwargs, ):
+ self,
+ vocab_size=30522,
+ max_position_embeddings=77,
+ encoder_layers=32,
+ encoder_ffn_dim=5120,
+ encoder_attention_heads=8,
+ head_dim=64,
+ encoder_layerdrop=0.0,
+ activation_function="gelu",
+ d_model=1280,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ classifier_dropout=0.0,
+ scale_embedding=False,
+ use_cache=True,
+ pad_token_id=0,
+ **kwargs,
+ ):
kwargs["return_dict"] = kwargs.pop("return_dict", True)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
@@ -590,9 +578,7 @@ def __init__(
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
- self.scale_embedding = (
- scale_embedding # scale factor will be sqrt(d_model) if True
- )
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -603,9 +589,7 @@ class LDMBertPretrainedModel(PretrainedModel):
base_model_prefix = "ldmbert"
config_class = LDMBertConfig
_supports_gradient_checkpointing = True
- _keys_to_ignore_on_load_unexpected = [
- r"encoder\.version", r"decoder\.version"
- ]
+ _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]
def init_weights(self):
"""
@@ -626,9 +610,7 @@ def gradient_checkpointing_enable(self):
activations".
"""
if not self.supports_gradient_checkpointing:
- raise ValueError(
- f"{self.__class__.__name__} does not support gradient checkpointing."
- )
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
self.apply(partial(self._set_gradient_checkpointing, value=True))
def gradient_checkpointing_disable(self):
@@ -656,15 +638,15 @@ def _init_weights(self, module):
class LDMBertEmbeddings(nn.Layer):
def __init__(
- self,
- vocab_size,
- hidden_size=768,
- hidden_dropout_prob=0.0,
- max_position_embeddings=512, ):
+ self,
+ vocab_size,
+ hidden_size=768,
+ hidden_dropout_prob=0.0,
+ max_position_embeddings=512,
+ ):
super().__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
- self.position_embeddings = nn.Embedding(max_position_embeddings,
- hidden_size)
+ self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.dropout = nn.Dropout(hidden_dropout_prob)
def forward(self, input_ids, position_ids=None):
@@ -684,18 +666,19 @@ def forward(self, input_ids, position_ids=None):
class TransformerEncoderLayer(nn.TransformerEncoderLayer):
def __init__(
- self,
- d_model,
- nhead,
- dim_feedforward,
- dropout=0.1,
- activation="gelu",
- attn_dropout=None,
- act_dropout=None,
- normalize_before=False,
- weight_attr=None,
- bias_attr=None,
- head_dim=64, ):
+ self,
+ d_model,
+ nhead,
+ dim_feedforward,
+ dropout=0.1,
+ activation="gelu",
+ attn_dropout=None,
+ act_dropout=None,
+ normalize_before=False,
+ weight_attr=None,
+ bias_attr=None,
+ head_dim=64,
+ ):
super().__init__(
d_model,
nhead,
@@ -706,7 +689,8 @@ def __init__(
act_dropout,
normalize_before,
weight_attr,
- bias_attr, )
+ bias_attr,
+ )
# update self attn
self.self_attn = LDMBertAttention(
d_model,
@@ -714,7 +698,8 @@ def __init__(
nhead,
dropout=attn_dropout,
weight_attr=weight_attr,
- bias_attr=False, )
+ bias_attr=False,
+ )
@register_base_model
@@ -727,7 +712,8 @@ def __init__(self, config: LDMBertConfig):
config.vocab_size,
config.d_model,
config.dropout,
- config.max_position_embeddings, )
+ config.max_position_embeddings,
+ )
encoder_layer = TransformerEncoderLayer(
config.d_model,
config.encoder_attention_heads,
@@ -737,10 +723,10 @@ def __init__(self, config: LDMBertConfig):
attn_dropout=config.attention_dropout,
act_dropout=config.activation_dropout,
normalize_before=True,
- head_dim=config.head_dim, )
+ head_dim=config.head_dim,
+ )
- self.encoder = nn.TransformerEncoder(encoder_layer,
- config.encoder_layers)
+ self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers)
self.final_layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
@@ -751,56 +737,58 @@ def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def forward(
- self,
- input_ids,
- position_ids=None,
- attention_mask=None,
- output_hidden_states=False,
- output_attentions=False,
- return_dict=False, ):
+ self,
+ input_ids,
+ position_ids=None,
+ attention_mask=None,
+ output_hidden_states=False,
+ output_attentions=False,
+ return_dict=False,
+ ):
if attention_mask is not None and attention_mask.ndim == 2:
# attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length]
- attention_mask = attention_mask.unsqueeze(
- axis=[1, 2]).astype(paddle.get_default_dtype())
+ attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype())
attention_mask = (1.0 - attention_mask) * -1e4
- embedding_output = self.embeddings(
- input_ids=input_ids, position_ids=position_ids)
+ embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
encoder_outputs = self.encoder(
embedding_output,
src_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
if isinstance(encoder_outputs, type(embedding_output)):
sequence_output = self.final_layer_norm(encoder_outputs)
- return (sequence_output, )
+ return (sequence_output,)
else:
sequence_output = encoder_outputs[0]
sequence_output = self.final_layer_norm(sequence_output)
if not return_dict:
- return (sequence_output, ) + encoder_outputs[1:]
+ return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions, )
+ attentions=encoder_outputs.attentions,
+ )
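
The mask handling at the top of `forward` converts a {0, 1} padding mask into an additive bias before it reaches the encoder. A short sketch of just that conversion:

```python
import paddle

attention_mask = paddle.to_tensor([[1.0, 1.0, 0.0]])  # 1 = token, 0 = padding
mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype())
mask = (1.0 - mask) * -1e4
print(mask.numpy())  # visible positions -> 0.0, padded position -> -10000.0 (suppressed by softmax)
```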
class LDMBertAttention(nn.MultiHeadAttention):
def __init__(
- self,
- embed_dim,
- head_dim,
- num_heads,
- dropout=0.0,
- kdim=None,
- vdim=None,
- need_weights=False,
- weight_attr=None,
- bias_attr=None, ):
+ self,
+ embed_dim,
+ head_dim,
+ num_heads,
+ dropout=0.0,
+ kdim=None,
+ vdim=None,
+ need_weights=False,
+ weight_attr=None,
+ bias_attr=None,
+ ):
super().__init__(
embed_dim,
num_heads,
@@ -809,15 +797,10 @@ def __init__(
vdim,
need_weights,
weight_attr,
- bias_attr, )
- assert (
- embed_dim > 0
- ), "Expected embed_dim to be greater than 0, " "but received {}".format(
- embed_dim)
- assert (
- num_heads > 0
- ), "Expected num_heads to be greater than 0, " "but received {}".format(
- num_heads)
+ bias_attr,
+ )
+ assert embed_dim > 0, "Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)
+ assert num_heads > 0, "Expected num_heads to be greater than 0, " "but received {}".format(num_heads)
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
@@ -830,12 +813,9 @@ def __init__(
self.inner_dim = head_dim * num_heads
self.scaling = self.head_dim**-0.5
- self.q_proj = nn.Linear(
- embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr)
- self.k_proj = nn.Linear(
- self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr)
- self.v_proj = nn.Linear(
- self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr)
+ self.q_proj = nn.Linear(embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr)
+ self.k_proj = nn.Linear(self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr)
+ self.v_proj = nn.Linear(self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr)
self.out_proj = nn.Linear(self.inner_dim, embed_dim, weight_attr)
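
Unlike the usual `embed_dim // num_heads` convention, `LDMBertAttention` projects into `inner_dim = head_dim * num_heads`, which may differ from `embed_dim`, and `out_proj` maps back. A shape-only sketch (dimensions are illustrative):

```python
import paddle
import paddle.nn as nn

embed_dim, num_heads, head_dim, seq = 1280, 8, 64, 77
inner_dim = num_heads * head_dim  # 512 here, deliberately not equal to embed_dim

q_proj = nn.Linear(embed_dim, inner_dim, bias_attr=False)
out_proj = nn.Linear(inner_dim, embed_dim)

x = paddle.randn([1, seq, embed_dim])
q = q_proj(x).reshape([1, seq, num_heads, head_dim])
print(q.shape)  # [1, 77, 8, 64]
print(out_proj(q.reshape([1, seq, inner_dim])).shape)  # back to [1, 77, 1280]
```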
@@ -847,18 +827,20 @@ def __init__(self, config: LDMBertConfig):
self.init_weights()
def forward(
- self,
- input_ids=None,
- attention_mask=None,
- position_ids=None,
- output_attentions=None,
- output_hidden_states=None,
- return_dict=None, ):
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
outputs = self.ldmbert(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
return outputs
diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
index 0f37d4a18387d..24475c0af099b 100644
--- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -21,8 +21,13 @@
from ...models import UNet2DModel, VQModel
from ...schedulers import (
- DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler)
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
from ...utils import PIL_INTERPOLATION, randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -55,27 +60,32 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vqvae: VQModel,
- unet: UNet2DModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler,
- EulerDiscreteScheduler,
- EulerAncestralDiscreteScheduler,
- DPMSolverMultistepScheduler, ], ):
+ self,
+ vqvae: VQModel,
+ unet: UNet2DModel,
+ scheduler: Union[
+ DDIMScheduler,
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ EulerDiscreteScheduler,
+ EulerAncestralDiscreteScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ ):
super().__init__()
self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
@paddle.no_grad()
def __call__(
- self,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- batch_size: Optional[int]=1,
- num_inference_steps: Optional[int]=100,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ) -> Union[Tuple, ImagePipelineOutput]:
+ self,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ batch_size: Optional[int] = 1,
+ num_inference_steps: Optional[int] = 100,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ) -> Union[Tuple, ImagePipelineOutput]:
"""
Args:
image (`paddle.Tensor` or `PIL.Image.Image`):
@@ -107,25 +117,20 @@ def __call__(
elif isinstance(image, paddle.Tensor):
batch_size = image.shape[0]
else:
- raise ValueError(
- f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}"
- )
+ raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}")
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
height, width = image.shape[-2:]
# in_channels should be 6: 3 for latents, 3 for low resolution image
- latents_shape = (batch_size, self.unet.config.in_channels // 2, height,
- width)
+ latents_shape = (batch_size, self.unet.config.in_channels // 2, height, width)
latents_dtype = self.unet.dtype
- latents = randn_tensor(
- latents_shape, generator=generator, dtype=latents_dtype)
+ latents = randn_tensor(latents_shape, generator=generator, dtype=latents_dtype)
image = image.cast(latents_dtype)
self.scheduler.set_timesteps(num_inference_steps)
timesteps_tensor = self.scheduler.timesteps
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_kwargs = {}
if accepts_eta:
extra_kwargs["eta"] = eta
@@ -136,8 +141,7 @@ def __call__(
# predict the noise residual
noise_pred = self.unet(latents_input, t).sample
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
# decode the image latents with the VQVAE
image = self.vqvae.decode(latents).sample
@@ -147,5 +151,5 @@ def __call__(
if output_type == "pil":
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
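
As the `in_channels // 2` comment in the super-resolution pipeline above indicates, the UNet sees the noisy latents concatenated channel-wise with the low-resolution image at every step. A hedged shape sketch (sizes are illustrative):

```python
import paddle

batch, unet_in_channels, height, width = 1, 6, 128, 128
latents = paddle.randn([batch, unet_in_channels // 2, height, width])  # 3 noise channels
image = paddle.rand([batch, 3, height, width])  # preprocessed low-res image, 3 channels

latents_input = paddle.concat([latents, image], axis=1)
print(latents_input.shape)  # [1, 6, 128, 128] -> matches the UNet's in_channels
```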
diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
index 5434c0cbb084e..11e66b2063f75 100644
--- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
+++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -36,23 +36,21 @@ class LDMPipeline(DiffusionPipeline):
[`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents.
"""
- def __init__(self,
- vqvae: VQModel,
- unet: UNet2DModel,
- scheduler: DDIMScheduler):
+ def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
super().__init__()
self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
@paddle.no_grad()
- def __call__(self,
- batch_size: int=1,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- eta: float=0.0,
- num_inference_steps: int=50,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- **kwargs) -> Union[Tuple, ImagePipelineOutput]:
+ def __call__(
+ self,
+ batch_size: int = 1,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ eta: float = 0.0,
+ num_inference_steps: int = 50,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs
+ ) -> Union[Tuple, ImagePipelineOutput]:
"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -77,8 +75,10 @@ def __call__(self,
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size,
- self.unet.config.sample_size, ),
- generator=generator, )
+ self.unet.config.sample_size,
+ ),
+ generator=generator,
+ )
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
@@ -86,8 +86,7 @@ def __call__(self,
self.scheduler.set_timesteps(num_inference_steps)
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_kwargs = {}
if accepts_eta:
extra_kwargs["eta"] = eta
@@ -96,13 +95,12 @@ def __call__(self,
# predict the noise residual
noise_prediction = self.unet(latent_model_input, t).sample
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_prediction, t, latents,
- **extra_kwargs).prev_sample
+ latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample
image = self.vqvae.decode(latents).sample
image = (image / 2 + 0.5).clip(min=0, max=1)
image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy()
if output_type == "pil":
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py
index a3967f589a49a..3d31b5e95e74f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py
@@ -19,8 +19,12 @@
import numpy as np
import paddle
-from ...utils import (BaseOutput, OptionalDependencyNotAvailable,
- is_paddle_available, is_paddlenlp_available)
+from ...utils import (
+ BaseOutput,
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
@dataclass
@@ -45,7 +49,7 @@ class VideoPipelineOutput(BaseOutput):
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_and_paddlenlp_objects import *
else:
- from .pipeline_latent_video_diffusion_model_text2video import \
- LVDMTextToVideoPipeline
- from .pipeline_latent_video_diffusion_model_uncond import \
- LVDMUncondPipeline
+ from .pipeline_latent_video_diffusion_model_text2video import (
+ LVDMTextToVideoPipeline,
+ )
+ from .pipeline_latent_video_diffusion_model_uncond import LVDMUncondPipeline
diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py
index a727ada59472b..8e339ecfee43d 100644
--- a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py
+++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py
@@ -24,8 +24,7 @@
from ...configuration_utils import FrozenDict
from ...models import LVDMAutoencoderKL, LVDMUNet3DModel
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from . import VideoPipelineOutput
from .video_save import save_results
@@ -43,12 +42,12 @@
prompt="cutting in kitchen",
num_frames=16,
height=256,
- width=256,
- num_inference_steps=50,
- generator=generator,
+ width=256,
+ num_inference_steps=50,
+ generator=generator,
guidance_scale=15,
- eta=1,
- save_dir='.',
+ eta=1,
+ save_dir='.',
save_name='ddim_lvdm_text_to_video_ucf',
encoder_type='2d',
scale_factor=0.18215,
@@ -64,12 +63,10 @@ def split_video_to_clips(video, clip_length, drop_left=True):
video_length = video.shape[2]
shape = video.shape
if video_length % clip_length != 0 and drop_left:
- video = video[:, :, :video_length // clip_length * clip_length, :, :]
- print(
- f"[split_video_to_clips] Drop frames from {shape} to {video.shape}")
+ video = video[:, :, : video_length // clip_length * clip_length, :, :]
+ print(f"[split_video_to_clips] Drop frames from {shape} to {video.shape}")
nclips = video_length // clip_length
- clips = rearrange(
- video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips)
+ clips = rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips)
return clips
@@ -104,34 +101,30 @@ class LVDMTextToVideoPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: LVDMAutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: LVDMUNet3DModel,
- scheduler: KarrasDiffusionSchedulers, ):
+ self,
+ vae: LVDMAutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: LVDMUNet3DModel,
+ scheduler: KarrasDiffusionSchedulers,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -139,11 +132,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -153,7 +142,8 @@ def __init__(
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
# self.encoder_type = '2d'
# self.scale_factor = 0.18215
@@ -166,12 +156,7 @@ def decode(self, z, **kwargs):
return results
@paddle.no_grad()
- def overlapped_decode(self,
- z,
- max_z_t=None,
- overlap_t=2,
- predict_cids=False,
- force_not_quantize=False):
+ def overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False):
if max_z_t is None:
max_z_t = z.shape[2]
assert max_z_t > overlap_t
@@ -190,69 +175,56 @@ def overlapped_decode(self,
reses = []
for i, z_ in enumerate(zs):
if i == 0:
- res = self.decode(
- z_, predict_cids,
- force_not_quantize).cpu()[:, :, :max_x_t - drop_r_x, :, :]
+ res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, : max_x_t - drop_r_x, :, :]
elif i == len(zs) - 1:
- res = self.decode(
- z_, predict_cids,
- force_not_quantize).cpu()[:, :, drop_l_x:, :, :]
+ res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, drop_l_x:, :, :]
else:
- res = self.decode(z_, predict_cids, force_not_quantize).cpu(
- )[:, :, drop_l_x:max_x_t - drop_r_x, :, :]
+ res = self.decode(z_, predict_cids, force_not_quantize).cpu()[
+ :, :, drop_l_x : max_x_t - drop_r_x, :, :
+ ]
reses.append(res)
results = paddle.concat(x=reses, axis=2)
return results
@paddle.no_grad()
- def decode_first_stage_2DAE_video(self,
- z,
- decode_bs=16,
- return_cpu=True,
- **kwargs):
+ def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs):
b, _, t, _, _ = z.shape
z = rearrange(z, "b c t h w -> (b t) c h w")
if decode_bs is None:
results = self.decode(z, **kwargs)
else:
- z = paddle.split(
- x=z, num_or_sections=z.shape[0] // decode_bs, axis=0)
+ z = paddle.split(x=z, num_or_sections=z.shape[0] // decode_bs, axis=0)
if return_cpu:
- results = paddle.concat(
- x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0)
+ results = paddle.concat(x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0)
else:
- results = paddle.concat(
- x=[self.decode(z_, **kwargs) for z_ in z], axis=0)
- results = rearrange(
- results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous()
+ results = paddle.concat(x=[self.decode(z_, **kwargs) for z_ in z], axis=0)
+ results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous()
return results
@paddle.no_grad()
def decode_latents(
- self,
- z,
- decode_bs=16,
- return_cpu=True,
- bs=None,
- decode_single_video_allframes=False,
- max_z_t=None,
- overlapped_length=0,
- **kwargs, ):
+ self,
+ z,
+ decode_bs=16,
+ return_cpu=True,
+ bs=None,
+ decode_single_video_allframes=False,
+ max_z_t=None,
+ overlapped_length=0,
+ **kwargs,
+ ):
b, _, t, _, _ = z.shape
if kwargs["encoder_type"] == "2d" and z.dim() == 5:
- return self.decode_first_stage_2DAE_video(
- z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs)
+ return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs)
if decode_single_video_allframes:
z = paddle.split(x=z, num_or_sections=z.shape[0] // 1, axis=0)
cat_dim = 0
elif max_z_t is not None:
if kwargs["encoder_type"] == "3d":
- z = paddle.split(
- x=z, num_or_sections=z.shape[2] // max_z_t, axis=2)
+ z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2)
cat_dim = 2
if kwargs["encoder_type"] == "2d":
- z = paddle.split(
- x=z, num_or_sections=z.shape[0] // max_z_t, axis=0)
+ z = paddle.split(x=z, num_or_sections=z.shape[0] // max_z_t, axis=0)
cat_dim = 0
# elif self.split_clips and self.downfactor_t is not None or self.clip_length is not None and self.downfactor_t is not None and z.shape[
# 2
@@ -286,8 +258,7 @@ def paddle_to_np(self, x):
if isinstance("uint8", paddle.dtype):
dtype = "uint8"
- elif isinstance("uint8",
- str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]:
+ elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]:
dtype = "uint8"
elif isinstance("uint8", paddle.Tensor):
dtype = "uint8".dtype
@@ -299,13 +270,14 @@ def paddle_to_np(self, x):
return sample
def _encode_prompt(
- self,
- prompt,
- num_videos_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_videos_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -341,28 +313,30 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -370,8 +344,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_videos_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_videos_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_videos_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -381,14 +354,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -398,36 +373,33 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_videos_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_videos_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_videos_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_videos_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -437,53 +409,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -496,22 +464,21 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- num_frames,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
- shape = [
- batch_size, num_channels_latents, num_frames, height // 8,
- width // 8
- ]
+ self,
+ batch_size,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
+ shape = [batch_size, num_channels_latents, num_frames, height // 8, width // 8]
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -528,31 +495,31 @@ def prepare_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=256,
- width: Optional[int]=256,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_videos_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- save_dir=None,
- save_name=None,
- num_frames: Optional[int]=16,
- encoder_type="2d",
- scale_factor=0.18215,
- shift_factor=0, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = 256,
+ width: Optional[int] = 256,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_videos_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ save_dir=None,
+ save_name=None,
+ num_frames: Optional[int] = 16,
+ encoder_type="2d",
+ scale_factor=0.18215,
+ shift_factor=0,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -628,9 +595,7 @@ def __call__(
"""
# 0. Default height and width to unet
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# 1. Check inputs. Raise error if not correct
self.check_inputs(
@@ -640,7 +605,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -662,7 +628,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -678,43 +645,38 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
timesteps=t,
context=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -724,8 +686,7 @@ def __call__(
"scale_factor": scale_factor,
"shift_factor": shift_factor,
}
- sampled_videos = self.decode_latents(
- latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs)
+ sampled_videos = self.decode_latents(latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs)
all_videos.append(self.paddle_to_np(sampled_videos))
all_videos = np.concatenate(all_videos, axis=0)
@@ -744,10 +705,9 @@ def __call__(
videos_frames.append(video_frames)
if not save_name:
- save_name = f"defaul_video"
+            save_name = "default_video"
if not save_dir:
save_dir = "."
os.makedirs(save_dir, exist_ok=True)
- save_results(
- all_videos, save_dir=save_dir, save_name=save_name, save_fps=8)
+ save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=8)
return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos)
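
Note: the denoising loop in this pipeline applies the standard classifier-free guidance combination: the UNet runs on a doubled batch and the unconditional and text-conditioned predictions are recombined with `guidance_scale` (7.5 is the default in the signature above). A minimal sketch of that single step with toy tensors; the latent shape here is an illustrative assumption, not the pipeline's actual shape:

import paddle

guidance_scale = 7.5
# stand-in for the UNet output on the doubled batch [uncond, text]
noise_pred = paddle.rand([2, 4, 16, 32, 32])
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
# push the prediction away from the unconditional branch
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)  # [1, 4, 16, 32, 32]
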
diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py
index 5581777325761..3d64085312440 100644
--- a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py
+++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py
@@ -19,9 +19,6 @@
import numpy as np
import paddle
-import paddle.nn as nn
-from paddlenlp.transformers import PretrainedModel, PretrainedTokenizer
-from tqdm import trange
from ...configuration_utils import FrozenDict
from ...models import LVDMAutoencoderKL, LVDMUNet3DModel
@@ -49,34 +46,29 @@ class LVDMUncondPipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: LVDMAutoencoderKL,
- unet: LVDMUNet3DModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler], ):
+ self,
+ vae: LVDMAutoencoderKL,
+ unet: LVDMUNet3DModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
self.register_modules(vae=vae, unet=unet, scheduler=scheduler)
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
@@ -113,8 +105,7 @@ def paddle_to_np(self, x):
if isinstance("uint8", paddle.dtype):
dtype = "uint8"
- elif isinstance("uint8",
- str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]:
+ elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]:
dtype = "uint8"
elif isinstance("uint8", paddle.Tensor):
dtype = "uint8".dtype
@@ -127,25 +118,25 @@ def paddle_to_np(self, x):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- num_frames: Optional[int]=16,
- height: Optional[int]=256,
- width: Optional[int]=256,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- eta: Optional[float]=0.0,
- num_inference_steps: Optional[int]=50,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- save_dir=None,
- save_name=None,
- scale_factor: Optional[float]=0.33422927,
- shift_factor: Optional[float]=1.4606637,
- **kwargs, ) -> Union[Tuple, VideoPipelineOutput]:
+ self,
+ batch_size: int = 1,
+ num_frames: Optional[int] = 16,
+ height: Optional[int] = 256,
+ width: Optional[int] = 256,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ eta: Optional[float] = 0.0,
+ num_inference_steps: Optional[int] = 50,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ save_dir=None,
+ save_name=None,
+ scale_factor: Optional[float] = 0.33422927,
+ shift_factor: Optional[float] = 1.4606637,
+ **kwargs,
+ ) -> Union[Tuple, VideoPipelineOutput]:
r"""
Args:
height (`int`, *optional*, defaults to 256):
@@ -188,16 +179,15 @@ def __call__(
"""
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# get the initial random noise unless the user supplied it
latents_shape = [
@@ -211,12 +201,11 @@ def __call__(
if latents is None:
latents = randn_tensor(
latents_shape,
- generator=generator, )
+ generator=generator,
+ )
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -231,30 +220,26 @@ def __call__(
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
latent_model_input = latents
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
t_tensor = paddle.expand(
t,
- [latent_model_input.shape[0], ], )
+ [
+ latent_model_input.shape[0],
+ ],
+ )
# predict the noise residual
noise_pred = self.unet(latent_model_input, t_tensor).sample
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- generator=generator,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, generator=generator, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -281,10 +266,9 @@ def __call__(
videos_frames.append(video_frames)
if not save_name:
- save_name = f"defaul_video"
+ save_name = "default_video"
if not save_dir:
save_dir = "."
os.makedirs(save_dir, exist_ok=True)
- save_results(
- all_videos, save_dir=save_dir, save_name=save_name, save_fps=8)
+ save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=8)
return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos)
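
For reference, LVDMUncondPipeline above can be exercised end to end in a few lines. This is a hedged sketch: the checkpoint name is a placeholder, and the class is assumed to be re-exported at the package root (otherwise import it from ppdiffusers.pipelines.lvdm):

from ppdiffusers import LVDMUncondPipeline

pipe = LVDMUncondPipeline.from_pretrained("your-org/lvdm-uncond-checkpoint")  # placeholder name
output = pipe(
    batch_size=1,
    num_frames=16,
    height=256,
    width=256,
    num_inference_steps=50,
    save_dir="./outputs",
    save_name="sample",
)
frames = output.frames  # decoded frames; an MP4 grid is also written under ./outputs/videos
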
diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py
index 837050f0222df..a969643113c68 100644
--- a/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py
+++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py
@@ -33,12 +33,9 @@
av.logging.set_level(av.logging.ERROR)
if not hasattr(av.video.frame.VideoFrame, "pict_type"):
- av = ImportError(
- """Your version of PyAV is too old for the necessary video operations."""
- )
+ av = ImportError("""Your version of PyAV is too old for the necessary video operations.""")
except ImportError:
- av = ImportError(
- """PyAV is not installed, and is necessary for the video operations.""")
+ av = ImportError("""PyAV is not installed, and is necessary for the video operations.""")
def _check_av_available() -> None:
@@ -47,15 +44,16 @@ def _check_av_available() -> None:
def write_video(
- filename: str,
- video_array: paddle.Tensor,
- fps: float,
- video_codec: str="libx264",
- options: Optional[Dict[str, Any]]=None,
- audio_array: Optional[paddle.Tensor]=None,
- audio_fps: Optional[float]=None,
- audio_codec: Optional[str]=None,
- audio_options: Optional[Dict[str, Any]]=None, ) -> None:
+ filename: str,
+ video_array: paddle.Tensor,
+ fps: float,
+ video_codec: str = "libx264",
+ options: Optional[Dict[str, Any]] = None,
+ audio_array: Optional[paddle.Tensor] = None,
+ audio_fps: Optional[float] = None,
+ audio_codec: Optional[str] = None,
+ audio_options: Optional[Dict[str, Any]] = None,
+) -> None:
"""
Writes a 4d tensor in [T, H, W, C] format in a video file
@@ -101,10 +99,8 @@ def write_video(
audio_layout = "stereo" if num_channels > 1 else "mono"
audio_sample_fmt = container.streams.audio[0].format.name
format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt])
- audio_array = (
- paddle.to_tensor(data=audio_array).numpy().astype(format_dtype))
- frame = av.AudioFrame.from_ndarray(
- audio_array, format=audio_sample_fmt, layout=audio_layout)
+ audio_array = paddle.to_tensor(data=audio_array).numpy().astype(format_dtype)
+ frame = av.AudioFrame.from_ndarray(audio_array, format=audio_sample_fmt, layout=audio_layout)
frame.sample_rate = audio_fps
for packet in a_stream.encode(frame):
container.mux(packet)
@@ -121,13 +117,14 @@ def write_video(
@paddle.no_grad()
def make_grid(
- tensor: Union[paddle.Tensor, List[paddle.Tensor]],
- nrow: int=8,
- padding: int=2,
- normalize: bool=False,
- value_range: Optional[Tuple[int, int]]=None,
- scale_each: bool=False,
- pad_value: float=0.0, ) -> paddle.Tensor:
+ tensor: Union[paddle.Tensor, List[paddle.Tensor]],
+ nrow: int = 8,
+ padding: int = 2,
+ normalize: bool = False,
+ value_range: Optional[Tuple[int, int]] = None,
+ scale_each: bool = False,
+ pad_value: float = 0.0,
+) -> paddle.Tensor:
"""
Make a grid of images.
@@ -153,12 +150,9 @@ def make_grid(
if isinstance(tensor, list):
for t in tensor:
if not paddle.is_tensor(x=t):
- raise TypeError(
- f"tensor or list of tensors expected, got a list containing {type(t)}"
- )
+ raise TypeError(f"tensor or list of tensors expected, got a list containing {type(t)}")
else:
- raise TypeError(
- f"tensor or list of tensors expected, got {type(tensor)}")
+ raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}")
if isinstance(tensor, list):
tensor = paddle.stack(x=tensor, axis=0)
if tensor.dim() == 2:
@@ -172,9 +166,7 @@ def make_grid(
if normalize is True:
tensor = tensor.clone()
if value_range is not None and not isinstance(value_range, tuple):
- raise TypeError(
- "value_range has to be a tuple (min, max) if specified. min and max are numbers"
- )
+ raise TypeError("value_range has to be a tuple (min, max) if specified. min and max are numbers")
def norm_ip(img, low, high):
img.clip_(min=low, max=high)
@@ -198,32 +190,33 @@ def norm_range(t, value_range):
nmaps = tensor.shape[0]
xmaps = min(nrow, nmaps)
ymaps = int(math.ceil(float(nmaps) / xmaps))
- height, width = int(tensor.shape[2] + padding), int(tensor.shape[3] +
- padding)
+ height, width = int(tensor.shape[2] + padding), int(tensor.shape[3] + padding)
num_channels = tensor.shape[1]
grid = paddle.full(
shape=(num_channels, height * ymaps + padding, width * xmaps + padding),
fill_value=pad_value,
- dtype=tensor.dtype, )
+ dtype=tensor.dtype,
+ )
k = 0
for y in range(ymaps):
for x in range(xmaps):
if k >= nmaps:
break
- start_0 = (grid.shape[1] + y * height + padding
- if y * height + padding < 0 else y * height + padding)
- start_1 = (paddle.slice(grid, [1], [start_0],
- [start_0 + height - padding]).shape[2] + x *
- width + padding
- if x * width + padding < 0 else x * width + padding)
+ start_0 = grid.shape[1] + y * height + padding if y * height + padding < 0 else y * height + padding
+ start_1 = (
+ paddle.slice(grid, [1], [start_0], [start_0 + height - padding]).shape[2] + x * width + padding
+ if x * width + padding < 0
+ else x * width + padding
+ )
paddle.assign(
tensor[k],
output=paddle.slice(
- paddle.slice(grid, [1], [start_0],
- [start_0 + height - padding]),
+ paddle.slice(grid, [1], [start_0], [start_0 + height - padding]),
[2],
[start_1],
- [start_1 + width - padding], ), )
+ [start_1 + width - padding],
+ ),
+ )
k = k + 1
return grid
@@ -264,13 +257,12 @@ def to_tensor(pic) -> paddle.Tensor:
if img.dtype == paddle.uint8:
return paddle.divide(
img.cast(default_float_dtype),
- paddle.to_tensor(
- 255, dtype=paddle.float32), )
+ paddle.to_tensor(255, dtype=paddle.float32),
+ )
else:
return img
mode_to_nptype = {"I": np.int32, "I;16": np.int16, "F": np.float32}
- img = paddle.to_tensor(data=np.array(
- pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True))
+ img = paddle.to_tensor(data=np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True))
if pic.mode == "1":
img = 255 * img
img = img.reshape([pic.size[1], pic.size[0], get_image_num_channels(pic)])
@@ -299,20 +291,21 @@ def fill_with_black_squares(video, desired_len: int) -> paddle.Tensor:
return paddle.concat(
x=[
video,
- paddle.zeros_like(x=video[0]).unsqueeze(axis=0)
- .tile(repeat_times=[desired_len - len(video), 1, 1, 1]),
+ paddle.zeros_like(x=video[0]).unsqueeze(axis=0).tile(repeat_times=[desired_len - len(video), 1, 1, 1]),
],
- axis=0, )
+ axis=0,
+ )
def npz_to_video_grid(
- data_path,
- out_path,
- num_frames=None,
- fps=8,
- num_videos=None,
- nrow=None,
- verbose=True, ):
+ data_path,
+ out_path,
+ num_frames=None,
+ fps=8,
+ num_videos=None,
+ nrow=None,
+ verbose=True,
+):
if isinstance(data_path, str):
videos = load_num_videos(data_path, num_videos)
elif isinstance(data_path, np.ndarray):
@@ -332,22 +325,14 @@ def npz_to_video_grid(
if num_frames is None:
num_frames = videos.shape[1]
if verbose:
- videos = [
- fill_with_black_squares(v, num_frames)
- for v in tqdm(
- videos_th, desc="Adding empty frames")
- ]
+ videos = [fill_with_black_squares(v, num_frames) for v in tqdm(videos_th, desc="Adding empty frames")]
else:
videos = [fill_with_black_squares(v, num_frames) for v in videos_th]
frame_grids = paddle.stack(x=videos).transpose(perm=[1, 0, 2, 3, 4])
if nrow is None:
nrow = int(np.ceil(np.sqrt(n)))
if verbose:
- frame_grids = [
- make_grid(
- fs, nrow=nrow) for fs in tqdm(
- frame_grids, desc="Making grids")
- ]
+ frame_grids = [make_grid(fs, nrow=nrow) for fs in tqdm(frame_grids, desc="Making grids")]
else:
frame_grids = [make_grid(fs, nrow=nrow) for fs in frame_grids]
@@ -356,21 +341,14 @@ def npz_to_video_grid(
os.makedirs(os.path.dirname(out_path), exist_ok=True)
if isinstance("uint8", paddle.dtype):
dtype = "uint8"
- elif isinstance("uint8",
- str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]:
+ elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]:
dtype = "uint8"
elif isinstance("uint8", paddle.Tensor):
dtype = "uint8".dtype
else:
dtype = (paddle.stack(x=frame_grids) * 255).dtype
- frame_grids = ((paddle.stack(x=frame_grids) * 255).transpose(
- perm=[0, 2, 3, 1]).cast(dtype))
- write_video(
- out_path,
- frame_grids,
- fps=fps,
- video_codec="h264",
- options={"crf": "10"})
+ frame_grids = (paddle.stack(x=frame_grids) * 255).transpose(perm=[0, 2, 3, 1]).cast(dtype)
+ write_video(out_path, frame_grids, fps=fps, video_codec="h264", options={"crf": "10"})
def savenp2sheet(imgs, savepath, nrow=None):
@@ -398,10 +376,7 @@ def savenp2sheet(imgs, savepath, nrow=None):
n_rows = int(np.ceil(n / n_cols))
print(n_cols)
print(n_rows)
- imgsheet = cv2.vconcat([
- cv2.hconcat(imgs_new[i * n_cols:(i + 1) * n_cols])
- for i in range(n_rows)
- ])
+ imgsheet = cv2.vconcat([cv2.hconcat(imgs_new[i * n_cols : (i + 1) * n_cols]) for i in range(n_rows)])
cv2.imwrite(savepath, imgsheet)
print(f"saved in {savepath}")
@@ -414,7 +389,7 @@ def npz_to_imgsheet_5d(data_path, res_dir, nrow=None):
else:
raise Exception
if os.path.isdir(res_dir):
- res_path = os.path.join(res_dir, f"samples.jpg")
+ res_path = os.path.join(res_dir, "samples.jpg")
else:
assert res_dir.endswith(".jpg")
res_path = res_dir
@@ -423,24 +398,25 @@ def npz_to_imgsheet_5d(data_path, res_dir, nrow=None):
def save_results(
- videos,
- save_dir,
- save_name="results",
- save_fps=8,
- save_mp4=True,
- save_npz=False,
- save_mp4_sheet=False,
- save_jpg=False, ):
+ videos,
+ save_dir,
+ save_name="results",
+ save_fps=8,
+ save_mp4=True,
+ save_npz=False,
+ save_mp4_sheet=False,
+ save_jpg=False,
+):
if save_mp4:
save_subdir = os.path.join(save_dir, "videos")
os.makedirs(save_subdir, exist_ok=True)
shape_str = "x".join([str(x) for x in videos[0:1, (...)].shape])
for i in range(videos.shape[0]):
npz_to_video_grid(
- videos[i:i + 1, (...)],
- os.path.join(save_subdir,
- f"{save_name}_{i:03d}_{shape_str}.mp4"),
- fps=save_fps, )
+ videos[i : i + 1, (...)],
+ os.path.join(save_subdir, f"{save_name}_{i:03d}_{shape_str}.mp4"),
+ fps=save_fps,
+ )
print(f"Successfully saved videos in {save_subdir}")
shape_str = "x".join([str(x) for x in videos.shape])
if save_npz:
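
The make_grid and write_video helpers in this module mirror the torchvision utilities but operate on paddle tensors, so a batch of frames can be tiled into one grid image directly. A small sketch with random data (purely illustrative, and it assumes the optional video dependencies such as PyAV and OpenCV are installed so the module imports cleanly):

import paddle
from ppdiffusers.pipelines.lvdm.video_save import make_grid

frames = paddle.rand([16, 3, 64, 64])        # 16 RGB frames in [0, 1]
grid = make_grid(frames, nrow=4, padding=2)  # -> [3, 266, 266] with these settings
print(grid.shape)
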
diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py
index 4ba5c5d72ec8e..713ed5d8191b5 100644
--- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py
+++ b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py
@@ -14,8 +14,11 @@
# limitations under the License.
import paddle
from paddle import nn
-from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig,
- CLIPVisionModel)
+from paddlenlp.transformers import (
+ CLIPPretrainedModel,
+ CLIPVisionConfig,
+ CLIPVisionModel,
+)
from ...models.attention import BasicTransformerBlock
from ...utils import logging
@@ -42,8 +45,8 @@ def __init__(self, config: CLIPVisionConfig, proj_size=None):
self.uncond_vector = self.create_parameter(
[1, 1, self.projection_dim],
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Assign(
- paddle.rand((1, 1, self.projection_dim))), )
+ default_initializer=nn.initializer.Assign(paddle.rand((1, 1, self.projection_dim))),
+ )
def forward(self, pixel_values, return_uncond_vector=False):
clip_output = self.model(pixel_values=pixel_values)
@@ -63,14 +66,18 @@ def __init__(self, config: CLIPVisionConfig):
num_layers = (config.num_hidden_layers + 1) // 5
hid_size = config.hidden_size
num_heads = 1
- self.blocks = nn.LayerList([
- BasicTransformerBlock(
- hid_size,
- num_heads,
- hid_size,
- activation_fn="gelu",
- attention_bias=True, ) for _ in range(num_layers)
- ])
+ self.blocks = nn.LayerList(
+ [
+ BasicTransformerBlock(
+ hid_size,
+ num_heads,
+ hid_size,
+ activation_fn="gelu",
+ attention_bias=True,
+ )
+ for _ in range(num_layers)
+ ]
+ )
def forward(self, hidden_states):
for block in self.blocks:
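
The encoder above augments CLIP's vision tower with a small transformer mapper and a learned unconditional embedding. A rough sketch of how the pipeline queries it, where `encoder` is assumed to be the `image_encoder` component of an already loaded Paint-by-Example pipeline and 224x224 is the usual CLIP input resolution:

import paddle

pixel_values = paddle.rand([1, 3, 224, 224]).cast(encoder.dtype)
image_embeddings, uncond_vector = encoder(pixel_values, return_uncond_vector=True)
# image_embeddings condition the UNet cross-attention; uncond_vector serves as the negative branch for guidance
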
diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
index f6b679c76b433..8ed3770065a18 100644
--- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -62,14 +62,11 @@ def prepare_mask_and_masked_image(image, mask):
"""
if isinstance(image, paddle.Tensor):
if not isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not"
- )
+ raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not")
# Batch single image
if image.ndim == 3:
- assert (image.shape[0] == 3
- ), "Image outside a batch should be of shape (3, H, W)"
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
image = image.unsqueeze(0)
# Batch and add channel dim for single mask
@@ -84,12 +81,9 @@ def prepare_mask_and_masked_image(image, mask):
else:
mask = mask.unsqueeze(0)
- assert (image.ndim == 4 and
- mask.ndim == 4), "Image and Mask must have 4 dimensions"
- assert (image.shape[-2:] == mask.shape[-2:]
- ), "Image and Mask must have the same spatial dimensions"
- assert (image.shape[0] == mask.shape[0]
- ), "Image and Mask must have the same batch size"
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
assert mask.shape[1] == 1, "Mask image must have a single channel"
# Check image is in [-1, 1]
@@ -109,14 +103,12 @@ def prepare_mask_and_masked_image(image, mask):
# Image as float32
image = image.cast(paddle.float32)
elif isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
+ raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
else:
if isinstance(image, PIL.Image.Image):
image = [image]
- image = np.concatenate(
- [np.array(i.convert("RGB"))[None, :] for i in image], axis=0)
+ image = np.concatenate([np.array(i.convert("RGB"))[None, :] for i in image], axis=0)
image = image.transpose(0, 3, 1, 2)
image = paddle.to_tensor(image).cast(paddle.float32) / 127.5 - 1.0
@@ -124,8 +116,7 @@ def prepare_mask_and_masked_image(image, mask):
if isinstance(mask, PIL.Image.Image):
mask = [mask]
- mask = np.concatenate(
- [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
# paint-by-example inverses the mask
@@ -170,15 +161,15 @@ class PaintByExamplePipeline(DiffusionPipeline):
_optional_components = ["safety_checker"]
def __init__(
- self,
- vae: AutoencoderKL,
- image_encoder: PaintByExampleImageEncoder,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler,
- LMSDiscreteScheduler],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae: AutoencoderKL,
+ image_encoder: PaintByExampleImageEncoder,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
self.register_modules(
@@ -187,18 +178,18 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -210,15 +201,13 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@@ -234,40 +223,44 @@ def decode_latents(self, latents):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
def check_inputs(self, image, height, width, callback_steps):
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
- f" {type(image)}")
+ f" {type(image)}"
+ )
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -283,22 +276,22 @@ def prepare_latents(
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
def prepare_mask_latents(
- self,
- mask,
- masked_image,
- batch_size,
- height,
- width,
- dtype,
- generator,
- do_classifier_free_guidance, ):
+ self,
+ mask,
+ masked_image,
+ batch_size,
+ height,
+ width,
+ dtype,
+ generator,
+ do_classifier_free_guidance,
+ ):
# resize the mask to latents shape as we concatenate the mask to the latents
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
# and half precision
mask = paddle.nn.functional.interpolate(
- mask,
- size=(height // self.vae_scale_factor,
- width // self.vae_scale_factor))
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+ )
mask = mask.cast(dtype)
masked_image = masked_image.cast(dtype)
@@ -306,13 +299,12 @@ def prepare_mask_latents(
# encode the mask image into latents space so we can concatenate it to the latents
if isinstance(generator, list):
masked_image_latents = [
- self.vae.encode(masked_image[i:i + 1]).latent_dist.sample(
- generator=generator[i]) for i in range(batch_size)
+ self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
]
masked_image_latents = paddle.concat(masked_image_latents, axis=0)
else:
- masked_image_latents = self.vae.encode(
- masked_image).latent_dist.sample(generator=generator)
+ masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
@@ -331,71 +323,62 @@ def prepare_mask_latents(
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
- masked_image_latents = masked_image_latents.tile(
- [batch_size // masked_image_latents.shape[0], 1, 1, 1])
+ masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1])
- mask = paddle.concat([mask] *
- 2) if do_classifier_free_guidance else mask
- masked_image_latents = (paddle.concat([masked_image_latents] * 2)
- if do_classifier_free_guidance else
- masked_image_latents)
+ mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask
+ masked_image_latents = (
+ paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
# aligning device to prevent device errors when concating it with the latent model input
masked_image_latents = masked_image_latents.cast(dtype)
return mask, masked_image_latents
- def _encode_image(self, image, num_images_per_prompt,
- do_classifier_free_guidance):
+ def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance):
dtype = self.image_encoder.dtype
if not isinstance(image, paddle.Tensor):
- image = self.feature_extractor(
- images=image, return_tensors="pd").pixel_values
+ image = self.feature_extractor(images=image, return_tensors="pd").pixel_values
image = image.cast(dtype)
- image_embeddings, negative_prompt_embeds = self.image_encoder(
- image, return_uncond_vector=True)
+ image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True)
# duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape
image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1])
- image_embeddings = image_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, image_embeddings.shape[0], 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, 1, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, image_embeddings.shape[0], 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([bs_embed * num_images_per_prompt, 1, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- image_embeddings = paddle.concat(
- [negative_prompt_embeds, image_embeddings])
+ image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings])
return image_embeddings
@paddle.no_grad()
def __call__(
- self,
- example_image: Union[paddle.Tensor, PIL.Image.Image],
- image: Union[paddle.Tensor, PIL.Image.Image],
- mask_image: Union[paddle.Tensor, PIL.Image.Image],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=5.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ example_image: Union[paddle.Tensor, PIL.Image.Image],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ mask_image: Union[paddle.Tensor, PIL.Image.Image],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 5.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -477,8 +460,7 @@ def __call__(
self.check_inputs(example_image, height, width, callback_steps)
# 4. Encode input image
- image_embeddings = self._encode_image(
- example_image, num_images_per_prompt, do_classifier_free_guidance)
+ image_embeddings = self._encode_image(example_image, num_images_per_prompt, do_classifier_free_guidance)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -493,7 +475,8 @@ def __call__(
width,
image_embeddings.dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Prepare mask latent variables
mask, masked_image_latents = self.prepare_mask_latents(
@@ -504,60 +487,50 @@ def __call__(
width,
image_embeddings.dtype,
generator,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# 8. Check that sizes of mask, masked image and latents match
num_channels_mask = mask.shape[1]
num_channels_masked_image = masked_image_latents.shape[1]
- if (num_channels_latents + num_channels_mask + num_channels_masked_image
- != self.unet.config.in_channels):
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
raise ValueError(
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
- " `pipeline.unet` or your `mask_image` or `image` input.")
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
# 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 10. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
# concat latents, mask, masked_image_latents in the channel dimension
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- latent_model_input = paddle.concat(
- [latent_model_input, masked_image_latents, mask], axis=1)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ latent_model_input = paddle.concat([latent_model_input, masked_image_latents, mask], axis=1)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=image_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# must cast this, paddle.concat has bug...
latents = latents.cast(image_embeddings.dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -566,8 +539,7 @@ def __call__(
image = self.decode_latents(latents)
# 12. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, image_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype)
# 13. Convert to PIL
if output_type == "pil":
@@ -576,5 +548,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
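
Putting the pieces together, the pipeline takes a scene image, a mask, and an example image whose content is painted into the masked region. A minimal usage sketch; the file names are placeholders, and "Fantasy-Studio/Paint-by-Example" is the checkpoint name commonly used for these weights (treat it as an assumption here):

import PIL.Image
from ppdiffusers import PaintByExamplePipeline

pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
result = pipe(
    example_image=PIL.Image.open("example.png").convert("RGB"),
    image=PIL.Image.open("scene.png").convert("RGB"),
    mask_image=PIL.Image.open("mask.png").convert("L"),
    num_inference_steps=50,
    guidance_scale=5.0,
)
result.images[0].save("painted.png")
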
diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py
index 9c25c86f78f6a..b51612c302879 100644
--- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py
+++ b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py
@@ -30,9 +30,15 @@
import numpy as np
import PIL
import PIL.Image
-from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_url,
- model_info, repo_type_and_id_from_hf_id,
- snapshot_download, upload_folder)
+from huggingface_hub import (
+ create_repo,
+ get_hf_file_metadata,
+ hf_hub_url,
+ model_info,
+ repo_type_and_id_from_hf_id,
+ snapshot_download,
+ upload_folder,
+)
from huggingface_hub.utils import EntryNotFoundError
from packaging import version
from tqdm.auto import tqdm
@@ -40,13 +46,31 @@
from ..configuration_utils import ConfigMixin
from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
from ..utils import (
- CONFIG_NAME, DEPRECATED_REVISION_ARGS, DIFFUSERS_CACHE, FLAX_WEIGHTS_NAME,
- FROM_DIFFUSERS, FROM_HF_HUB, HF_HUB_OFFLINE, LOW_CPU_MEM_USAGE_DEFAULT,
- ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PPDIFFUSERS_CACHE,
- TO_DIFFUSERS, TORCH_SAFETENSORS_WEIGHTS_NAME, TORCH_WEIGHTS_NAME,
- BaseOutput, deprecate, get_class_from_dynamic_module, is_paddle_available,
- is_paddlenlp_available, is_safetensors_available, logging, numpy_to_pil,
- ppdiffusers_bos_dir_download, ppdiffusers_url_download)
+ CONFIG_NAME,
+ DEPRECATED_REVISION_ARGS,
+ DIFFUSERS_CACHE,
+ FLAX_WEIGHTS_NAME,
+ FROM_DIFFUSERS,
+ FROM_HF_HUB,
+ HF_HUB_OFFLINE,
+ LOW_CPU_MEM_USAGE_DEFAULT,
+ ONNX_EXTERNAL_WEIGHTS_NAME,
+ ONNX_WEIGHTS_NAME,
+ PPDIFFUSERS_CACHE,
+ TO_DIFFUSERS,
+ TORCH_SAFETENSORS_WEIGHTS_NAME,
+ TORCH_WEIGHTS_NAME,
+ BaseOutput,
+ deprecate,
+ get_class_from_dynamic_module,
+ is_paddle_available,
+ is_paddlenlp_available,
+ is_safetensors_available,
+ logging,
+ numpy_to_pil,
+ ppdiffusers_bos_dir_download,
+ ppdiffusers_url_download,
+)
from ..version import VERSION as __version__
if is_paddle_available():
@@ -133,8 +157,7 @@ class AudioPipelineOutput(BaseOutput):
audios: np.ndarray
-def is_safetensors_compatible(filenames, variant=None,
- passed_components=None) -> bool:
+def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool:
"""
Checking for safetensors compatibility:
- By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch
@@ -154,8 +177,7 @@ def is_safetensors_compatible(filenames, variant=None,
for filename in filenames:
_, extension = os.path.splitext(filename)
- if (len(filename.split("/")) == 2 and
- filename.split("/")[0] in passed_components):
+ if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components:
continue
if extension == ".bin":
@@ -183,8 +205,7 @@ def is_safetensors_compatible(filenames, variant=None,
return True
-def variant_compatible_siblings(filenames,
- variant=None) -> Union[List[os.PathLike], str]:
+def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]:
weight_names = [
TORCH_WEIGHTS_NAME,
TORCH_SAFETENSORS_WEIGHTS_NAME,
@@ -217,35 +238,17 @@ def variant_compatible_siblings(filenames,
rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
)
# `text_encoder/pytorch_model.bin.index.json`
- non_variant_index_re = re.compile(
- rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json"
- )
+ non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json")
if variant is not None:
- variant_weights = {
- f
- for f in filenames
- if variant_file_re.match(f.split("/")[-1]) is not None
- }
- variant_indexes = {
- f
- for f in filenames
- if variant_index_re.match(f.split("/")[-1]) is not None
- }
+ variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None}
+ variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None}
variant_filenames = variant_weights | variant_indexes
else:
variant_filenames = set()
- non_variant_weights = {
- f
- for f in filenames
- if non_variant_file_re.match(f.split("/")[-1]) is not None
- }
- non_variant_indexes = {
- f
- for f in filenames
- if non_variant_index_re.match(f.split("/")[-1]) is not None
- }
+ non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None}
+ non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None}
non_variant_filenames = non_variant_weights | non_variant_indexes
# all variant filenames will be used by default
@@ -254,12 +257,10 @@ def variant_compatible_siblings(filenames,
def convert_to_variant(filename):
if "index" in filename:
variant_filename = filename.replace("index", f"index.{variant}")
- elif (re.compile(f"^(.*?){transformers_index_format}").match(filename)
- is not None):
+ elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None:
variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}"
else:
- variant_filename = (
- f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}")
+ variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}"
return variant_filename
for f in non_variant_filenames:
@@ -270,51 +271,46 @@ def convert_to_variant(filename):
return usable_filenames, variant_filenames
-def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token,
- variant, revision, model_filenames):
+def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token, variant, revision, model_filenames):
info = model_info(
pretrained_model_name_or_path,
use_auth_token=use_auth_token,
- revision=None, )
+ revision=None,
+ )
filenames = {sibling.rfilename for sibling in info.siblings}
- comp_model_filenames, _ = variant_compatible_siblings(
- filenames, variant=revision)
- comp_model_filenames = [
- ".".join(f.split(".")[:1] + f.split(".")[2:])
- for f in comp_model_filenames
- ]
+ comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision)
+ comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames]
if set(comp_model_filenames) == set(model_filenames):
warnings.warn(
f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. Please use `variant='{revision}'` instead.",
- FutureWarning, )
+ FutureWarning,
+ )
else:
warnings.warn(
f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.",
- FutureWarning, )
+ FutureWarning,
+ )
def maybe_raise_or_warn(
- library_name,
- library,
- class_name,
- importable_classes,
- passed_class_obj,
- name,
- is_pipeline_module, ):
+ library_name,
+ library,
+ class_name,
+ importable_classes,
+ passed_class_obj,
+ name,
+ is_pipeline_module,
+):
"""Simple helper method to raise or warn in case incorrect module has been passed"""
if not is_pipeline_module:
library = importlib.import_module(library_name)
class_obj = getattr(library, class_name)
- class_candidates = {
- c: getattr(library, c, None)
- for c in importable_classes.keys()
- }
+ class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
expected_class_obj = None
for class_name, class_candidate in class_candidates.items():
- if class_candidate is not None and issubclass(class_obj,
- class_candidate):
+ if class_candidate is not None and issubclass(class_obj, class_candidate):
expected_class_obj = class_candidate
# Dynamo wraps the original model in a private class.
@@ -325,15 +321,16 @@ def maybe_raise_or_warn(
if not issubclass(model_cls, expected_class_obj):
raise ValueError(
f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
- f" {expected_class_obj}")
+ f" {expected_class_obj}"
+ )
else:
logger.warning(
f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
- " has the correct type")
+ " has the correct type"
+ )
-def get_class_obj_and_candidates(library_name, class_name, importable_classes,
- pipelines, is_pipeline_module):
+def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines, is_pipeline_module):
"""Simple helper method to retrieve class object of module as well as potential parent class objects"""
if is_pipeline_module:
pipeline_module = getattr(pipelines, library_name)
@@ -344,19 +341,12 @@ def get_class_obj_and_candidates(library_name, class_name, importable_classes,
# else we just import it from the library.
library = importlib.import_module(library_name)
class_obj = getattr(library, class_name)
- class_candidates = {
- c: getattr(library, c, None)
- for c in importable_classes.keys()
- }
+ class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
return class_obj, class_candidates
-def _get_pipeline_class(class_obj,
- config,
- custom_pipeline=None,
- cache_dir=None,
- revision=None):
+def _get_pipeline_class(class_obj, config, custom_pipeline=None, cache_dir=None, revision=None):
if custom_pipeline is not None:
if custom_pipeline.endswith(".py"):
path = Path(custom_pipeline)
@@ -370,31 +360,32 @@ def _get_pipeline_class(class_obj,
custom_pipeline,
module_file=file_name,
cache_dir=cache_dir,
- revision=revision, )
+ revision=revision,
+ )
if class_obj != DiffusionPipeline:
return class_obj
- ppdiffusers_module = importlib.import_module(
- class_obj.__module__.split(".")[0])
+ ppdiffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
return getattr(ppdiffusers_module, config["_class_name"])
def load_sub_model(
- library_name: str,
- class_name: str,
- importable_classes: List[Any],
- pipelines: Any,
- is_pipeline_module: bool,
- pipeline_class: Any,
- paddle_dtype: paddle.dtype,
- runtime_options: Any,
- model_variants: Dict[str, str],
- name: str,
- from_diffusers: bool,
- low_cpu_mem_usage: bool=False,
- cached_folder: Union[str, os.PathLike]=None,
- **kwargs, ):
+ library_name: str,
+ class_name: str,
+ importable_classes: List[Any],
+ pipelines: Any,
+ is_pipeline_module: bool,
+ pipeline_class: Any,
+ paddle_dtype: paddle.dtype,
+ runtime_options: Any,
+ model_variants: Dict[str, str],
+ name: str,
+ from_diffusers: bool,
+ low_cpu_mem_usage: bool = False,
+ cached_folder: Union[str, os.PathLike] = None,
+ **kwargs,
+):
# support huggingface diffusers onnx model
is_onnx_model = False
if "Onnx" in class_name:
@@ -403,29 +394,29 @@ def load_sub_model(
"""Helper method to load the module `name` from `library_name` and `class_name`"""
# retrieve class candidates
class_obj, class_candidates = get_class_obj_and_candidates(
- library_name, class_name, importable_classes, pipelines,
- is_pipeline_module)
+ library_name, class_name, importable_classes, pipelines, is_pipeline_module
+ )
load_method_name = None
# retrive load method name
for class_name, class_candidate in class_candidates.items():
- if class_candidate is not None and issubclass(class_obj,
- class_candidate):
+ if class_candidate is not None and issubclass(class_obj, class_candidate):
load_method_name = importable_classes[class_name][1]
# if load method name is None, then we have a dummy module -> raise Error
if load_method_name is None:
none_module = class_obj.__module__
- is_dummy_path = none_module.startswith(
- DUMMY_MODULES_FOLDER) or none_module.startswith(
- PADDLENLP_DUMMY_MODULES_FOLDER)
+ is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith(
+ PADDLENLP_DUMMY_MODULES_FOLDER
+ )
if is_dummy_path and "dummy" in none_module:
# call class_obj for nice error message of missing requirements
class_obj()
raise ValueError(
f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have"
- f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}.")
+ f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}."
+ )
load_method = getattr(class_obj, load_method_name)
@@ -435,17 +426,17 @@ def load_sub_model(
# FastDeploy Model
if issubclass(class_obj, FastDeployRuntimeModel):
loading_kwargs["runtime_options"] = (
- runtime_options.get(name, None)
- if isinstance(runtime_options, dict) else runtime_options)
+ runtime_options.get(name, None) if isinstance(runtime_options, dict) else runtime_options
+ )
if not is_onnx_model:
if os.path.isdir(os.path.join(cached_folder, name)):
is_onnx_model = any(
- d.endswith(".onnx") or d.endswith(".pb")
- for d in os.listdir(os.path.join(cached_folder, name)))
+ d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder, name))
+ )
else:
is_onnx_model = any(
- d.endswith(".onnx") or d.endswith(".pb")
- for d in os.listdir(os.path.join(cached_folder)))
+ d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder))
+ )
loading_kwargs["is_onnx_model"] = is_onnx_model
from ppdiffusers import ModelMixin
@@ -461,8 +452,7 @@ def load_sub_model(
try:
# check if the module is in a subdirectory
if os.path.isdir(os.path.join(cached_folder, name)):
- loaded_sub_model = load_method(
- os.path.join(cached_folder, name), **loading_kwargs)
+ loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
else:
# else load from the root directory
loaded_sub_model = load_method(cached_folder, **loading_kwargs)
@@ -478,11 +468,10 @@ def load_sub_model(
loaded_sub_model = load_method(
pretrained_model_name_or_path + "/" + name,
cache_dir=cache_dir,
- **loading_kwargs, )
- if loaded_sub_model is None:
- raise ValueError(
- f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} "
+ **loading_kwargs,
)
+ if loaded_sub_model is None:
+        raise ValueError(f"We can't load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} ")
return loaded_sub_model
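The dispatch above is easy to lose in the formatting noise: `load_sub_model` checks which base class in the importable-classes table the component derives from, takes that entry's registered load-method name, and calls it on the component's sub-folder (falling back to the pipeline root). A minimal sketch of the same pattern follows; the table and class names are illustrative stand-ins, not the real LOADABLE_CLASSES/ALL_IMPORTABLE_CLASSES entries:

import os

# Illustrative table: {base class name: (save_method_name, load_method_name)}.
ILLUSTRATIVE_CLASSES = {
    "ModelMixin": ("save_pretrained", "from_pretrained"),
    "SchedulerMixin": ("save_pretrained", "from_pretrained"),
}


def pick_load_method(component_cls, class_candidates):
    # class_candidates maps base-class names to the classes themselves (or None if the
    # backing library is unavailable), mirroring get_class_obj_and_candidates above.
    for base_name, base_cls in class_candidates.items():
        if base_cls is not None and issubclass(component_cls, base_cls):
            return ILLUSTRATIVE_CLASSES[base_name][1]
    raise ValueError(f"{component_cls} has none of the known loading methods")


def load_component(component_cls, class_candidates, cached_folder, name, **loading_kwargs):
    load_method = getattr(component_cls, pick_load_method(component_cls, class_candidates))
    subdir = os.path.join(cached_folder, name)
    # prefer the component's own sub-directory, otherwise load from the pipeline root
    return load_method(subdir if os.path.isdir(subdir) else cached_folder, **loading_kwargs)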
@@ -517,19 +506,15 @@ def register_modules(self, **kwargs):
register_dict = {name: (None, None)}
else:
# TODO (junnyu) support paddlenlp.transformers
- if "paddlenlp" in module.__module__.split(
- ".") or "ppnlp_patch_utils" in module.__module__.split(
- "."):
+ if "paddlenlp" in module.__module__.split(".") or "ppnlp_patch_utils" in module.__module__.split("."):
library = "paddlenlp.transformers"
else:
library = module.__module__.split(".")[0]
# check if the module is a pipeline module
- pipeline_dir = (module.__module__.split(".")[-2] if
- len(module.__module__.split(".")) > 2 else None)
+ pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None
path = module.__module__.split(".")
- is_pipeline_module = pipeline_dir in path and hasattr(
- pipelines, pipeline_dir)
+ is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
# if library is not in LOADABLE_CLASSES, then it is a custom module.
# Or if it's a pipeline module, then the module is inside the pipeline
@@ -549,19 +534,20 @@ def register_modules(self, **kwargs):
setattr(self, name, module)
# TODO junnyu, before register model, we may need to keep some module in fp32
- if (isinstance(module, nn.Layer) and
- hasattr(module, "_keep_in_fp32_modules") and
- module.dtype == paddle.float16 and
- module._keep_in_fp32_modules is not None):
- for module_name, sub_module in module.named_sublayers(
- include_self=True):
- if any(n in module_name
- for n in module._keep_in_fp32_modules):
+ if (
+ isinstance(module, nn.Layer)
+ and hasattr(module, "_keep_in_fp32_modules")
+ and module.dtype == paddle.float16
+ and module._keep_in_fp32_modules is not None
+ ):
+ for module_name, sub_module in module.named_sublayers(include_self=True):
+ if any(n in module_name for n in module._keep_in_fp32_modules):
sub_module.to(dtype=paddle.float32)
if hasattr(sub_module, "pre_hook"):
sub_module.pre_hook.remove()
sub_module.pre_hook = sub_module.register_forward_pre_hook(
- lambda layer, input: input[0].cast("float32"))
+ lambda layer, input: input[0].cast("float32")
+ )
def __setattr__(self, name: str, value: Any):
if name in self.__dict__ and hasattr(self.config, name):
@@ -570,7 +556,8 @@ def __setattr__(self, name: str, value: Any):
if value is not None and self.config[name][0] is not None:
class_library_tuple = (
value.__module__.split(".")[0],
- value.__class__.__name__, )
+ value.__class__.__name__,
+ )
else:
class_library_tuple = (None, None)
@@ -581,11 +568,12 @@ def __setattr__(self, name: str, value: Any):
super().__setattr__(name, value)
def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- safe_serialization: bool=False,
- variant: Optional[str]=None,
- to_diffusers: bool=None, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ to_diffusers: bool = None,
+ ):
"""
Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to
a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading
@@ -619,10 +607,7 @@ def is_saveable_module(name, value):
return False
return True
- model_index_dict = {
- k: v
- for k, v in model_index_dict.items() if is_saveable_module(k, v)
- }
+ model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)}
for pipeline_component_name in model_index_dict.keys():
sub_model = getattr(self, pipeline_component_name)
@@ -639,8 +624,7 @@ def is_saveable_module(name, value):
)
for base_class, save_load_methods in library_classes.items():
class_candidate = getattr(library, base_class, None)
- if class_candidate is not None and issubclass(
- model_cls, class_candidate):
+ if class_candidate is not None and issubclass(model_cls, class_candidate):
# if we found a suitable base class in LOADABLE_CLASSES then grab its save method
save_method_name = save_load_methods[0]
break
@@ -648,23 +632,18 @@ def is_saveable_module(name, value):
break
if save_method_name is None:
- logger.warn(
- f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved."
- )
+ logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.")
            # make sure that unsaveable components will not be loaded afterward
- self.register_to_config(
- **{pipeline_component_name: (None, None)})
+ self.register_to_config(**{pipeline_component_name: (None, None)})
continue
save_method = getattr(sub_model, save_method_name)
# Call the save method with the argument safe_serialization only if it's supported
save_method_signature = inspect.signature(save_method)
- save_method_accept_safe = (
- "safe_serialization" in save_method_signature.parameters)
+ save_method_accept_safe = "safe_serialization" in save_method_signature.parameters
save_method_accept_variant = "variant" in save_method_signature.parameters
- save_method_accept_to_diffusers = (
- "to_diffusers" in save_method_signature.parameters)
+ save_method_accept_to_diffusers = "to_diffusers" in save_method_signature.parameters
save_kwargs = {}
            # maybe we do not have torch, so we use safe_serialization
@@ -678,20 +657,19 @@ def is_saveable_module(name, value):
if save_method_accept_to_diffusers:
save_kwargs["to_diffusers"] = to_diffusers
- save_method(
- os.path.join(save_directory, pipeline_component_name),
- **save_kwargs)
+ save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs)
# finally save the config
self.save_config(save_directory, to_diffusers=to_diffusers)
def save_to_hf_hub(
- self,
- repo_id: str,
- private: Optional[bool]=None,
- commit_message: Optional[str]=None,
- revision: Optional[str]=None,
- create_pr: bool=False, ):
+ self,
+ repo_id: str,
+ private: Optional[bool] = None,
+ commit_message: Optional[str] = None,
+ revision: Optional[str] = None,
+ create_pr: bool = False,
+ ):
"""
Uploads all elements of this pipeline to a new HuggingFace Hub repository.
Args:
@@ -715,9 +693,7 @@ def save_to_hf_hub(
# Check if README file already exist in repo
try:
- get_hf_file_metadata(
- hf_hub_url(
- repo_id=repo_id, filename="README.md", revision=revision))
+ get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
has_readme = True
except EntryNotFoundError:
has_readme = False
@@ -739,13 +715,15 @@ def save_to_hf_hub(
folder_path=tmp_dir,
commit_message=commit_message,
revision=revision,
- create_pr=create_pr, )
+ create_pr=create_pr,
+ )
def to(
- self,
- paddle_device: Optional[str]=None,
- paddle_dtype: Optional[paddle.dtype]=None,
- silence_dtype_warnings: bool=True, ):
+ self,
+ paddle_device: Optional[str] = None,
+ paddle_dtype: Optional[paddle.dtype] = None,
+ silence_dtype_warnings: bool = True,
+ ):
if paddle_device is None and paddle_dtype is None:
return self
@@ -753,9 +731,12 @@ def to(
modules = [getattr(self, n, None) for n in module_names]
modules = [m for m in modules if isinstance(m, nn.Layer)]
for module in modules:
- if (paddle_device is not None and module.dtype == paddle.float16 and
- str(paddle_device) in ["cpu"] and
- not silence_dtype_warnings):
+ if (
+ paddle_device is not None
+ and module.dtype == paddle.float16
+ and str(paddle_device) in ["cpu"]
+ and not silence_dtype_warnings
+ ):
logger.warning(
"Pipelines loaded with `paddle_dtype=paddle.float16` cannot run with `cpu` device. It"
" is not recommended to move them to `cpu` as running them will fail. Please make"
@@ -771,19 +752,20 @@ def to(
module.to(**kwargs)
# TODO junnyu, before register model, we may need to keep some module in fp32
- if (isinstance(module, nn.Layer) and
- hasattr(module, "_keep_in_fp32_modules") and
- module.dtype == paddle.float16 and
- module._keep_in_fp32_modules is not None):
- for module_name, sub_module in module.named_sublayers(
- include_self=True):
- if any(n in module_name
- for n in module._keep_in_fp32_modules):
+ if (
+ isinstance(module, nn.Layer)
+ and hasattr(module, "_keep_in_fp32_modules")
+ and module.dtype == paddle.float16
+ and module._keep_in_fp32_modules is not None
+ ):
+ for module_name, sub_module in module.named_sublayers(include_self=True):
+ if any(n in module_name for n in module._keep_in_fp32_modules):
sub_module.to(dtype=paddle.float32)
if hasattr(sub_module, "pre_hook"):
sub_module.pre_hook.remove()
sub_module.pre_hook = sub_module.register_forward_pre_hook(
- lambda layer, input: input[0].cast("float32"))
+ lambda layer, input: input[0].cast("float32")
+ )
return self
@property
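As the reformatted `to()` above shows, the method casts every nn.Layer component to the requested device/dtype and then re-pins any `_keep_in_fp32_modules` sub-layers to float32 through a forward pre-hook. A hedged usage sketch; the checkpoint id is illustrative and not part of this diff:

import paddle
from ppdiffusers import DiffusionPipeline

# Illustrative repo id; any pipeline with a model_index.json behaves the same way.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16)

# Moving a float16 pipeline to "cpu" triggers the warning emitted in the loop above,
# so half-precision pipelines should stay on GPU.
pipe = pipe.to(paddle_device="gpu")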
@@ -801,10 +783,7 @@ def device(self):
return "cpu"
@classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
- **kwargs):
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
r"""
Instantiate a Paddle diffusion pipeline from pre-trained pipeline weights.
@@ -964,18 +943,17 @@ def from_pretrained(
custom_pipeline = kwargs.pop("custom_pipeline", None)
custom_revision = kwargs.pop("custom_revision", None)
runtime_options = kwargs.pop("runtime_options", None)
- low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage",
- LOW_CPU_MEM_USAGE_DEFAULT)
- use_safetensors = kwargs.pop("use_safetensors", None
- if is_safetensors_available() else False)
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT)
+ use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
variant = kwargs.pop("variant", None)
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
    # deprecated
return_cached_folder = kwargs.pop("return_cached_folder", False)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
load_sub_model_kwargs = {
"pretrained_model_name_or_path": pretrained_model_name_or_path,
@@ -1003,7 +981,8 @@ def from_pretrained(
variant=variant,
from_hf_hub=from_hf_hub,
from_diffusers=from_diffusers,
- **kwargs, )
+ **kwargs,
+ )
else:
# is_local_dir
load_sub_model_kwargs["is_local_dir"] = True
@@ -1023,8 +1002,8 @@ def from_pretrained(
folder_path = os.path.join(cached_folder, folder)
is_folder = os.path.isdir(folder_path) and folder in config_dict
variant_exists = is_folder and any(
- p.split(".")[1].startswith(variant)
- for p in os.listdir(folder_path))
+ p.split(".")[1].startswith(variant) for p in os.listdir(folder_path)
+ )
if variant_exists:
model_variants[folder] = variant
@@ -1035,18 +1014,22 @@ def from_pretrained(
config_dict,
custom_pipeline=custom_pipeline,
cache_dir=cache_dir,
- revision=custom_revision, )
+ revision=custom_revision,
+ )
# DEPRECATED: To be removed in 1.0.0
- _ppdiffusers_version = (config_dict["_diffusers_paddle_version"]
- if "_diffusers_paddle_version" in config_dict
- else config_dict["_ppdiffusers_version"])
- if (pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and
- version.parse(
- version.parse(_ppdiffusers_version).base_version) <=
- version.parse("0.5.1")):
- from ppdiffusers import (StableDiffusionInpaintPipeline,
- StableDiffusionInpaintPipelineLegacy)
+ _ppdiffusers_version = (
+ config_dict["_diffusers_paddle_version"]
+ if "_diffusers_paddle_version" in config_dict
+ else config_dict["_ppdiffusers_version"]
+ )
+ if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
+ version.parse(_ppdiffusers_version).base_version
+ ) <= version.parse("0.5.1"):
+ from ppdiffusers import (
+ StableDiffusionInpaintPipeline,
+ StableDiffusionInpaintPipelineLegacy,
+ )
pipeline_class = StableDiffusionInpaintPipelineLegacy
@@ -1063,7 +1046,8 @@ def from_pretrained(
"StableDiffusionInpaintPipelineLegacy",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
# 4. Define expected modules given pipeline signature
# and define non-None initialized modules (=`init_kwargs`)
@@ -1071,26 +1055,15 @@ def from_pretrained(
# some modules can be passed directly to the init
# in this case they are already instantiated in `kwargs`
# extract them here
- expected_modules, optional_kwargs = cls._get_signature_keys(
- pipeline_class)
- passed_class_obj = {
- k: kwargs.pop(k)
- for k in expected_modules if k in kwargs
- }
- passed_pipe_kwargs = {
- k: kwargs.pop(k)
- for k in optional_kwargs if k in kwargs
- }
+ expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+ passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+ passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
- init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(
- config_dict, **kwargs)
+ init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
# define init kwargs
- init_kwargs = {
- k: init_dict.pop(k)
- for k in optional_kwargs if k in init_dict
- }
- init_kwargs = { ** init_kwargs, ** passed_pipe_kwargs}
+ init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict}
+ init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
# remove `null` components
def load_module(name, value):
@@ -1127,8 +1100,7 @@ def load_module(name, value):
# 6.2 Define all importable classes
is_pipeline_module = hasattr(pipelines, library_name)
- importable_classes = (ALL_IMPORTABLE_CLASSES if is_pipeline_module
- else LOADABLE_CLASSES[library_name])
+ importable_classes = ALL_IMPORTABLE_CLASSES if is_pipeline_module else LOADABLE_CLASSES[library_name]
loaded_sub_model = None
# 6.3 Use passed sub model or load class_name from library_name
@@ -1144,7 +1116,8 @@ def load_module(name, value):
importable_classes,
passed_class_obj,
name,
- is_pipeline_module, )
+ is_pipeline_module,
+ )
loaded_sub_model = passed_class_obj[name]
else:
@@ -1164,23 +1137,20 @@ def load_module(name, value):
variant=variant,
low_cpu_mem_usage=low_cpu_mem_usage,
cached_folder=cached_folder,
- **load_sub_model_kwargs, )
+ **load_sub_model_kwargs,
+ )
- init_kwargs[
- name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...)
+ init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...)
# 7. Potentially add passed objects if expected
missing_modules = set(expected_modules) - set(init_kwargs.keys())
passed_modules = list(passed_class_obj.keys())
optional_modules = pipeline_class._optional_components
- if len(missing_modules) > 0 and missing_modules <= set(
- passed_modules + optional_modules):
+ if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules):
for module in missing_modules:
init_kwargs[module] = passed_class_obj.get(module, None)
elif len(missing_modules) > 0:
- passed_modules = (
- set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) -
- optional_kwargs)
+ passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
raise ValueError(
f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
)
@@ -1195,8 +1165,7 @@ def load_module(name, value):
for _submodule in _module:
if isinstance(_submodule, nn.Layer):
_submodule.eval()
- if (paddle_dtype is not None and
- _submodule.dtype != paddle_dtype):
+ if paddle_dtype is not None and _submodule.dtype != paddle_dtype:
_submodule.to(dtype=paddle_dtype)
# 9. Instantiate the pipeline
@@ -1210,8 +1179,7 @@ def load_module(name, value):
return model
@classmethod
- def download(cls, pretrained_model_name,
- **kwargs) -> Union[str, os.PathLike]:
+ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
r"""
        Download and cache a diffusion pipeline from pre-trained pipeline weights.
Parameters:
@@ -1284,8 +1252,9 @@ def download(cls, pretrained_model_name,
"""
from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
- cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub
- else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE))
+ cache_dir = (
+ kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+ )
from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
resume_download = kwargs.pop("resume_download", False)
force_download = kwargs.pop("force_download", False)
@@ -1299,8 +1268,7 @@ def download(cls, pretrained_model_name,
use_safetensors = kwargs.pop("use_safetensors", None)
max_workers = int(kwargs.pop("max_workers", 1))
- if from_diffusers and use_safetensors and not is_safetensors_available(
- ):
+ if from_diffusers and use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
)
@@ -1324,14 +1292,14 @@ def download(cls, pretrained_model_name,
use_auth_token=use_auth_token,
revision=revision,
from_hf_hub=from_hf_hub,
- return_config_file=True, )
+ return_config_file=True,
+ )
ignore_filenames = config_dict.pop("_ignore_files", [])
        # for fastdeploy models we won't use safetensors
if cls == DiffusionPipeline:
- is_fastdeploy_model = (
- "fastdeploy" in config_dict.get("_class_name", "").lower())
+ is_fastdeploy_model = "fastdeploy" in config_dict.get("_class_name", "").lower()
else:
is_fastdeploy_model = "fastdeploy" in cls.__name__.lower()
if is_fastdeploy_model:
@@ -1354,46 +1322,38 @@ def download(cls, pretrained_model_name,
info = model_info(
pretrained_model_name,
use_auth_token=use_auth_token,
- revision=revision, )
+ revision=revision,
+ )
filenames = {sibling.rfilename for sibling in info.siblings}
- model_filenames, variant_filenames = variant_compatible_siblings(
- filenames, variant=variant)
+ model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)
# remove ignored filenames
model_filenames = set(model_filenames) - set(ignore_filenames)
- variant_filenames = set(variant_filenames) - set(
- ignore_filenames)
+ variant_filenames = set(variant_filenames) - set(ignore_filenames)
# if the whole pipeline is cached we don't have to ping the Hub
if revision in DEPRECATED_REVISION_ARGS and version.parse(
- version.parse(__version__)
- .base_version) >= version.parse("0.17.0"):
+ version.parse(__version__).base_version
+ ) >= version.parse("0.17.0"):
warn_deprecated_model_variant(
pretrained_model_name,
use_auth_token,
variant,
revision,
- model_filenames, )
+ model_filenames,
+ )
- model_folder_names = {
- os.path.split(f)[0]
- for f in model_filenames
- }
+ model_folder_names = {os.path.split(f)[0] for f in model_filenames}
# all filenames compatible with variant will be added
allow_patterns = list(model_filenames)
# allow all patterns from non-model folders
# this enables downloading schedulers, tokenizers, ...
- allow_patterns += [
- os.path.join(k, "*") for k in folder_names
- if k not in model_folder_names
- ]
+ allow_patterns += [os.path.join(k, "*") for k in folder_names if k not in model_folder_names]
# also allow downloading config.json files with the model
- allow_patterns += [
- os.path.join(k, "config.json") for k in model_folder_names
- ]
+ allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names]
allow_patterns += [
SCHEDULER_CONFIG_NAME,
@@ -1408,24 +1368,28 @@ def download(cls, pretrained_model_name,
config_dict,
custom_pipeline=custom_pipeline,
cache_dir=cache_dir,
- revision=custom_revision, )
+ revision=custom_revision,
+ )
expected_components, _ = cls._get_signature_keys(pipeline_class)
- passed_components = [
- k for k in expected_components if k in kwargs
- ]
+ passed_components = [k for k in expected_components if k in kwargs]
- if (use_safetensors and not allow_pickle and
- not is_safetensors_compatible(
- model_filenames,
- variant=variant,
- passed_components=passed_components, )):
+ if (
+ use_safetensors
+ and not allow_pickle
+ and not is_safetensors_compatible(
+ model_filenames,
+ variant=variant,
+ passed_components=passed_components,
+ )
+ ):
raise EnvironmentError(
f"Could not found the necessary `safetensors` weights in {model_filenames} (variant={variant})"
)
elif use_safetensors and is_safetensors_compatible(
- model_filenames,
- variant=variant,
- passed_components=passed_components, ):
+ model_filenames,
+ variant=variant,
+ passed_components=passed_components,
+ ):
ignore_patterns = [
"*.msgpack",
"*.bin",
@@ -1434,79 +1398,50 @@ def download(cls, pretrained_model_name,
"*.pdmodel",
]
- safetensors_variant_filenames = {
- f
- for f in variant_filenames if f.endswith(".safetensors")
- }
- safetensors_model_filenames = {
- f
- for f in model_filenames if f.endswith(".safetensors")
- }
- if (len(safetensors_variant_filenames) > 0 and
- safetensors_model_filenames !=
- safetensors_variant_filenames):
+ safetensors_variant_filenames = {f for f in variant_filenames if f.endswith(".safetensors")}
+ safetensors_model_filenames = {f for f in model_filenames if f.endswith(".safetensors")}
+ if (
+ len(safetensors_variant_filenames) > 0
+ and safetensors_model_filenames != safetensors_variant_filenames
+ ):
logger.warn(
f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(safetensors_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(safetensors_model_filenames - safetensors_variant_filenames)}\nIf this behavior is not expected, please check your folder structure."
)
else:
ignore_patterns = ["*.safetensors", "*.msgpack"]
if from_diffusers:
- ignore_patterns.extend(
- ["*.pdparams", "*.pdiparams", "*.pdmodel"])
+ ignore_patterns.extend(["*.pdparams", "*.pdiparams", "*.pdmodel"])
suffix = ".bin"
else:
if is_fastdeploy_model:
ignore_patterns.extend(["*.pdparams", "*.bin"])
suffix = ".pdmodel"
else:
- ignore_patterns.extend(
- ["*.pdiparams", "*.pdmodel", "*.bin"])
+ ignore_patterns.extend(["*.pdiparams", "*.pdmodel", "*.bin"])
suffix = ".pdparams"
- bin_variant_filenames = {
- f
- for f in variant_filenames if f.endswith(suffix)
- }
- bin_model_filenames = {
- f
- for f in model_filenames if f.endswith(suffix)
- }
- if (len(bin_variant_filenames) > 0 and
- bin_model_filenames != bin_variant_filenames):
+ bin_variant_filenames = {f for f in variant_filenames if f.endswith(suffix)}
+ bin_model_filenames = {f for f in model_filenames if f.endswith(suffix)}
+ if len(bin_variant_filenames) > 0 and bin_model_filenames != bin_variant_filenames:
logger.warn(
f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(bin_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(bin_model_filenames - bin_variant_filenames)}\nIf this behavior is not expected, please check your folder structure."
)
# Don't download any objects that are passed
allow_patterns = [
- p for p in allow_patterns
- if not (len(p.split("/")) == 2 and p.split("/")[0] in
- passed_components)
+ p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components)
]
# Don't download index files of forbidden patterns either
- ignore_patterns = ignore_patterns + [
- f"{i}.index.*json" for i in ignore_patterns
- ]
+ ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns]
- re_ignore_pattern = [
- re.compile(fnmatch.translate(p)) for p in ignore_patterns
- ]
- re_allow_pattern = [
- re.compile(fnmatch.translate(p)) for p in allow_patterns
- ]
+ re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns]
+ re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns]
- expected_files = [
- f for f in filenames
- if not any(p.match(f) for p in re_ignore_pattern)
- ]
- expected_files = [
- f for f in expected_files
- if any(p.match(f) for p in re_allow_pattern)
- ]
+ expected_files = [f for f in filenames if not any(p.match(f) for p in re_ignore_pattern)]
+ expected_files = [f for f in expected_files if any(p.match(f) for p in re_allow_pattern)]
snapshot_folder = Path(config_file).parent
- pipeline_is_cached = all((snapshot_folder / f).is_file()
- for f in expected_files)
+ pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)
if pipeline_is_cached:
# if the pipeline is cached, we can directly return it
@@ -1514,8 +1449,7 @@ def download(cls, pretrained_model_name,
return snapshot_folder
user_agent = {"pipeline_class": cls.__name__}
- if custom_pipeline is not None and not custom_pipeline.endswith(
- ".py"):
+ if custom_pipeline is not None and not custom_pipeline.endswith(".py"):
user_agent["custom_pipeline"] = custom_pipeline
# download all allow_patterns - ignore_patterns
@@ -1528,13 +1462,13 @@ def download(cls, pretrained_model_name,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
revision=revision,
- allow_patterns=list(
- set(allow_patterns) - set(ignore_filenames)),
+ allow_patterns=list(set(allow_patterns) - set(ignore_filenames)),
ignore_patterns=list(
set(ignore_patterns + ignore_filenames)
), # diffusers bug, so we must add this ignore_filenames!
user_agent=user_agent,
- max_workers=max_workers, )
+ max_workers=max_workers,
+ )
else:
            # only supports [PD] .pdparams and fastdeploy models
cached_folder = ppdiffusers_bos_dir_download(
@@ -1547,17 +1481,16 @@ def download(cls, pretrained_model_name,
variant=variant,
is_fastdeploy_model=is_fastdeploy_model,
local_files_only=local_files_only,
- max_workers=max_workers, )
+ max_workers=max_workers,
+ )
return cached_folder
@classmethod
- def from_pretrained_original_ckpt(
- cls,
- pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
- **kwargs):
- from .stable_diffusion.convert_from_ckpt_deprecated import \
- load_pipeline_from_original_stable_diffusion_ckpt
+ def from_pretrained_original_ckpt(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+ from .stable_diffusion.convert_from_ckpt_deprecated import (
+ load_pipeline_from_original_stable_diffusion_ckpt,
+ )
resume_download = kwargs.pop("resume_download", False)
force_download = kwargs.pop("force_download", False)
@@ -1568,37 +1501,33 @@ def from_pretrained_original_ckpt(
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if os.path.isfile(pretrained_model_name_or_path):
checkpoint_path = pretrained_model_name_or_path
- elif pretrained_model_name_or_path.startswith(
- "http://") or pretrained_model_name_or_path.startswith(
- "https://"):
+ elif pretrained_model_name_or_path.startswith("http://") or pretrained_model_name_or_path.startswith(
+ "https://"
+ ):
checkpoint_path = ppdiffusers_url_download(
pretrained_model_name_or_path,
cache_dir=cache_dir,
resume_download=resume_download,
- force_download=force_download, )
+ force_download=force_download,
+ )
else:
- raise EnvironmentError(
- f"Please check your {pretrained_model_name_or_path}.")
+ raise EnvironmentError(f"Please check your {pretrained_model_name_or_path}.")
pipeline = load_pipeline_from_original_stable_diffusion_ckpt(
checkpoint_path=checkpoint_path,
original_config_file=original_config_file,
paddle_dtype=paddle_dtype,
requires_safety_checker=requires_safety_checker,
cls=cls,
- **kwargs, )
+ **kwargs,
+ )
return pipeline
@staticmethod
def _get_signature_keys(obj):
parameters = inspect.signature(obj.__init__).parameters
- required_parameters = {
- k: v
- for k, v in parameters.items() if v.default == inspect._empty
- }
- optional_parameters = set(
- {k
- for k, v in parameters.items() if v.default != inspect._empty})
+ required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+ optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
expected_modules = set(required_parameters.keys()) - {"self"}
return expected_modules, optional_parameters
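`_get_signature_keys` is plain signature inspection: `__init__` parameters without a default are required components, the rest are optional. A self-contained toy illustration of the same split (ToyPipeline is made up for demonstration):

import inspect


class ToyPipeline:
    def __init__(self, unet, scheduler, safety_checker=None, requires_safety_checker=True):
        pass


params = inspect.signature(ToyPipeline.__init__).parameters
required = {k for k, v in params.items() if v.default is inspect._empty} - {"self"}
optional = {k for k, v in params.items() if v.default is not inspect._empty}
print(sorted(required))  # ['scheduler', 'unet']
print(sorted(optional))  # ['requires_safety_checker', 'safety_checker']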
@@ -1628,9 +1557,7 @@ def components(self) -> Dict[str, Any]:
"""
expected_modules, optional_parameters = self._get_signature_keys(self)
components = {
- k: getattr(self, k)
- for k in self.config.keys()
- if not k.startswith("_") and k not in optional_parameters
+ k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters
}
if set(components.keys()) != expected_modules:
@@ -1666,8 +1593,7 @@ def progress_bar(self, iterable=None, total=None):
def set_progress_bar_config(self, **kwargs):
self._progress_bar_config = kwargs
- def enable_xformers_memory_efficient_attention(
- self, attention_op: Optional[str]=None):
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None):
r"""
Enable memory efficient attention as implemented in xformers.
@@ -1701,15 +1627,13 @@ def disable_xformers_memory_efficient_attention(self):
"""
self.set_use_memory_efficient_attention_xformers(False)
- def set_use_memory_efficient_attention_xformers(
- self, valid: bool, attention_op: Optional[str]=None) -> None:
+ def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None:
# Recursively walk through all the children.
        # Any child that exposes the set_use_memory_efficient_attention_xformers method
        # gets the message
def fn_recursive_set_mem_eff(module: nn.Layer):
if hasattr(module, "set_use_memory_efficient_attention_xformers"):
- module.set_use_memory_efficient_attention_xformers(valid,
- attention_op)
+ module.set_use_memory_efficient_attention_xformers(valid, attention_op)
for child in module.children():
fn_recursive_set_mem_eff(child)
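The memory toggles below all rely on this recursive walk: every nn.Layer component that exposes the relevant setter receives the call. A hedged usage sketch (the repo id is illustrative, and memory-efficient attention needs a GPU build that supports it):

from ppdiffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")  # illustrative id

pipe.enable_xformers_memory_efficient_attention()  # propagated recursively to every sub-layer
pipe.enable_attention_slicing("auto")              # or an int slice size; trades speed for memory

# ... run inference ...

pipe.disable_attention_slicing()
pipe.disable_xformers_memory_efficient_attention()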
@@ -1721,8 +1645,7 @@ def fn_recursive_set_mem_eff(module: nn.Layer):
for module in modules:
fn_recursive_set_mem_eff(module)
- def enable_attention_slicing(self,
- slice_size: Optional[Union[str, int]]="auto"):
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
@@ -1749,10 +1672,7 @@ def disable_attention_slicing(self):
def set_attention_slice(self, slice_size: Optional[int]):
module_names, _ = self._get_signature_keys(self)
modules = [getattr(self, n, None) for n in module_names]
- modules = [
- m for m in modules
- if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice")
- ]
+ modules = [m for m in modules if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice")]
for module in modules:
module.set_attention_slice(slice_size)
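The pipeline_utils.py changes above are formatting-only but touch the whole load/save path, so a compact round trip helps confirm nothing behavioral moved. This is a hedged sketch: the repo id and output directory are illustrative, while the keyword names (paddle_dtype, from_hf_hub, from_diffusers, safe_serialization, to_diffusers) are the ones visible in the diff:

import paddle
from ppdiffusers import StableDiffusionPipeline

# from_pretrained resolves each entry of model_index.json through load_sub_model above.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # illustrative repo id
    paddle_dtype=paddle.float16,
    from_hf_hub=True,                  # fetch from the HuggingFace Hub instead of the BOS mirror
    from_diffusers=True,               # convert PyTorch/safetensors weights on the fly
)

# save_pretrained writes one sub-folder per registered component, then save_config
# emits model_index.json next to them.
pipe.save_pretrained("./sd15-paddle", safe_serialization=False, to_diffusers=False)

# Reload the locally saved Paddle copy.
pipe = StableDiffusionPipeline.from_pretrained("./sd15-paddle")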
diff --git a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py
index 975204896be93..c946ea77ac787 100644
--- a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py
+++ b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py
@@ -46,14 +46,14 @@ def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- num_inference_steps: int=50,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- **kwargs, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 50,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
r"""
Args:
batch_size (`int`, `optional`, defaults to 1): The number of images to generate.
@@ -80,8 +80,10 @@ def __call__(
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size,
- self.unet.config.sample_size, ),
- generator=generator, )
+ self.unet.config.sample_size,
+ ),
+ generator=generator,
+ )
self.scheduler.set_timesteps(num_inference_steps)
for t in self.progress_bar(self.scheduler.timesteps):
@@ -95,6 +97,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
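The reformatted __call__ above is the whole public surface of PNDMPipeline: unconditional image generation from a UNet2DModel plus a PNDMScheduler. A hedged usage sketch with an illustrative checkpoint id:

import paddle
from ppdiffusers import PNDMPipeline

pipe = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32")  # illustrative unconditional checkpoint

generator = paddle.Generator().manual_seed(0)
images = pipe(batch_size=2, num_inference_steps=50, generator=generator).images
images[0].save("pndm_sample.png")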
diff --git a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py
index a44fac86017af..b0d248fac49cc 100644
--- a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py
+++ b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py
@@ -38,11 +38,7 @@ def _preprocess_image(image: Union[List, PIL.Image.Image, paddle.Tensor]):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -62,12 +58,7 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, paddle.Tensor]):
if isinstance(mask[0], PIL.Image.Image):
w, h = mask[0].size
w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32
- mask = [
- np.array(
- m.convert("L").resize(
- (w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :]
- for m in mask
- ]
+ mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask]
mask = np.concatenate(mask, axis=0)
mask = mask.astype(np.float32) / 255.0
mask[mask < 0.5] = 0
@@ -88,17 +79,17 @@ def __init__(self, unet, scheduler):
@paddle.no_grad()
def __call__(
- self,
- image: Union[paddle.Tensor, PIL.Image.Image],
- mask_image: Union[paddle.Tensor, PIL.Image.Image],
- num_inference_steps: int=250,
- eta: float=0.0,
- jump_length: int=10,
- jump_n_sample: int=10,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ mask_image: Union[paddle.Tensor, PIL.Image.Image],
+ num_inference_steps: int = 250,
+ eta: float = 0.0,
+ jump_length: int = 10,
+ jump_n_sample: int = 10,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ) -> Union[ImagePipelineOutput, Tuple]:
r"""
Args:
image (`paddle.Tensor` or `PIL.Image.Image`):
@@ -146,12 +137,10 @@ def __call__(
)
image_shape = original_image.shape
- image = randn_tensor(
- image_shape, generator=generator, dtype=self.unet.dtype)
+ image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype)
# set step values
- self.scheduler.set_timesteps(num_inference_steps, jump_length,
- jump_n_sample)
+ self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample)
self.scheduler.eta = eta
t_last = self.scheduler.timesteps[0] + 1
@@ -161,9 +150,7 @@ def __call__(
# predict the noise residual
model_output = self.unet(image, t).sample
# compute previous image: x_t -> x_t-1
- image = self.scheduler.step(model_output, t, image,
- original_image, mask_image,
- generator).prev_sample
+ image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample
else:
# compute the reverse: x_t-1 -> x_t
@@ -176,6 +163,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
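RePaintPipeline needs two inputs: the image to restore and a mask image (binarized at 0.5 by _preprocess_mask, with sizes snapped to multiples of 8 and 32 as the helpers above show). A hedged sketch; the checkpoint id and file paths are illustrative:

import paddle
import PIL.Image
from ppdiffusers import RePaintPipeline

pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256")  # illustrative checkpoint

original = PIL.Image.open("face.png")  # image to restore
mask = PIL.Image.open("mask.png")      # mask image, thresholded at 0.5 as shown above

result = pipe(
    image=original,
    mask_image=mask,
    num_inference_steps=250,
    jump_length=10,     # resampling schedule parameters from the RePaint paper
    jump_n_sample=10,
    generator=paddle.Generator().manual_seed(0),
)
result.images[0].save("repainted.png")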
diff --git a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
index e3ce24a7eaf72..4e81855ba00f1 100644
--- a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
+++ b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
@@ -40,14 +40,14 @@ def __init__(self, unet: UNet2DModel, scheduler: DiffusionPipeline):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- num_inference_steps: int=2000,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- **kwargs, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 2000,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -70,25 +70,22 @@ def __call__(
model = self.unet
- sample = (randn_tensor(
- shape, generator=generator) * self.scheduler.init_noise_sigma)
+ sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma
self.scheduler.set_timesteps(num_inference_steps)
self.scheduler.set_sigmas(num_inference_steps)
for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
- sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0], ))
+ sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0],))
# correction step
for _ in range(self.scheduler.config.correct_steps):
model_output = self.unet(sample, sigma_t).sample
- sample = self.scheduler.step_correct(
- model_output, sample, generator=generator).prev_sample
+ sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample
# prediction step
model_output = model(sample, sigma_t).sample
- output = self.scheduler.step_pred(
- model_output, t, sample, generator=generator)
+ output = self.scheduler.step_pred(model_output, t, sample, generator=generator)
sample, sample_mean = output.prev_sample, output.prev_sample_mean
@@ -98,6 +95,6 @@ def __call__(
sample = self.numpy_to_pil(sample)
if not return_dict:
- return (sample, )
+ return (sample,)
return ImagePipelineOutput(images=sample)
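ScoreSdeVePipeline above runs predictor-corrector sampling: at each timestep it applies `correct_steps` Langevin corrections via step_correct, then one reverse-SDE prediction via step_pred. A hedged usage sketch with an illustrative checkpoint id:

from ppdiffusers import ScoreSdeVePipeline

pipe = ScoreSdeVePipeline.from_pretrained("google/ncsnpp-celebahq-256")  # illustrative checkpoint

# 2000 steps is the signature default shown above; fewer steps trade quality for speed.
images = pipe(batch_size=1, num_inference_steps=2000).images
images[0].save("sde_ve_sample.png")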
diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py
index e24cb5eee2eb1..9842e59ad078e 100644
--- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py
@@ -42,5 +42,4 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput):
if is_paddle_available() and is_paddlenlp_available():
- from .pipeline_semantic_stable_diffusion import \
- SemanticStableDiffusionPipeline
+ from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline
diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py
index b8778c74b1d86..7fd2b4f407754 100644
--- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py
+++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py
@@ -68,8 +68,7 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
if isinstance(axis, list):
axis_src, axis_dst = [], []
for axis_single in axis:
- if not isinstance(axis_single, int) or not (
- axis_single < dims and axis_single >= -dims):
+ if not isinstance(axis_single, int) or not (axis_single < dims and axis_single >= -dims):
raise ValueError(
"Axis should be None, int, or a list, element should in range [-rank(x), rank(x))."
)
@@ -88,17 +87,13 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
axis = axis_dst[0]
else:
if not isinstance(axis, int) or not (axis < dims and axis >= -dims):
- raise ValueError(
- "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))."
- )
+ raise ValueError("Axis should be None, int, or a list, element should in range [-rank(x), rank(x)).")
if axis < 0:
axis += dims
out_shape[axis] = 1
mask = x.isnan()
- valid_counts = mask.logical_not().sum(axis=axis,
- keepdim=True,
- dtype="float64")
+ valid_counts = mask.logical_not().sum(axis=axis, keepdim=True, dtype="float64")
indices = []
@@ -127,15 +122,14 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
for index in indices:
indices_below = paddle.floor(index).astype(paddle.int32)
indices_upper = paddle.ceil(index).astype(paddle.int32)
- tensor_upper = paddle.take_along_axis(
- sorted_tensor, indices_upper, axis=axis)
- tensor_below = paddle.take_along_axis(
- sorted_tensor, indices_below, axis=axis)
+ tensor_upper = paddle.take_along_axis(sorted_tensor, indices_upper, axis=axis)
+ tensor_below = paddle.take_along_axis(sorted_tensor, indices_below, axis=axis)
weights = index - indices_below.astype("float64")
out = paddle.lerp(
tensor_below.astype("float64"),
tensor_upper.astype("float64"),
- weights, )
+ weights,
+ )
if not keepdim:
out = paddle.squeeze(out, axis=axis)
else:
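The floor/ceil indexing plus paddle.lerp above is the standard linear-interpolation quantile rule. A tiny pure-Python illustration of the same interpolation step, independent of the Paddle helper:

import math


def linear_quantile(values, q):
    # Locate the fractional index into the sorted data, then interpolate between
    # the two neighbouring order statistics, which is the role paddle.lerp plays above.
    xs = sorted(values)
    index = q * (len(xs) - 1)
    below, upper = math.floor(index), math.ceil(index)
    weight = index - below  # plays the role of `weights` in _compute_quantile
    return xs[below] + (xs[upper] - xs[below]) * weight


print(linear_quantile([3.0, 1.0, 2.0, 4.0], 0.5))  # 2.5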
diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index df0e298fe252a..70eaa17e88188 100644
--- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -18,13 +18,11 @@
from typing import Callable, List, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...models import AutoencoderKL, UNet2DConditionModel
from ...pipeline_utils import DiffusionPipeline
-from ...pipelines.stable_diffusion.safety_checker import \
- StableDiffusionSafetyChecker
+from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging, randn_tensor
from . import SemanticStableDiffusionPipelineOutput
@@ -107,15 +105,16 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -141,8 +140,9 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
@@ -161,54 +161,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -221,23 +217,26 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -253,33 +252,33 @@ def prepare_latents(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: int=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- editing_prompt: Optional[Union[str, List[str]]]=None,
- editing_prompt_embeddings: Optional[paddle.Tensor]=None,
- reverse_editing_direction: Optional[Union[bool, List[bool]]]=False,
- edit_guidance_scale: Optional[Union[float, List[float]]]=5,
- edit_warmup_steps: Optional[Union[int, List[int]]]=10,
- edit_cooldown_steps: Optional[Union[int, List[int]]]=None,
- edit_threshold: Optional[Union[float, List[float]]]=0.9,
- edit_momentum_scale: Optional[float]=0.1,
- edit_mom_beta: Optional[float]=0.4,
- edit_weights: Optional[List[float]]=None,
- sem_guidance: Optional[List[paddle.Tensor]]=None, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: int = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ editing_prompt: Optional[Union[str, List[str]]] = None,
+ editing_prompt_embeddings: Optional[paddle.Tensor] = None,
+ reverse_editing_direction: Optional[Union[bool, List[bool]]] = False,
+ edit_guidance_scale: Optional[Union[float, List[float]]] = 5,
+ edit_warmup_steps: Optional[Union[int, List[int]]] = 10,
+ edit_cooldown_steps: Optional[Union[int, List[int]]] = None,
+ edit_threshold: Optional[Union[float, List[float]]] = 0.9,
+ edit_momentum_scale: Optional[float] = 0.1,
+ edit_mom_beta: Optional[float] = 0.4,
+ edit_weights: Optional[List[float]] = None,
+ sem_guidance: Optional[List[paddle.Tensor]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
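Before the detailed argument documentation, a hedged usage sketch of the semantic-guidance (SEGA) pipeline: the editing_prompt* arguments steer generation toward or away from extra concepts on top of the ordinary prompt. The checkpoint id is illustrative, and the import assumes the class is re-exported at the ppdiffusers top level as in diffusers; otherwise import it from ppdiffusers.pipelines.semantic_stable_diffusion:

import paddle
from ppdiffusers import SemanticStableDiffusionPipeline

pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")  # illustrative id

out = pipe(
    prompt="a photo of a castle in the mountains",
    num_inference_steps=50,
    guidance_scale=7.5,
    editing_prompt=["snowy weather", "golden sunset"],  # extra concepts to steer toward
    reverse_editing_direction=[False, False],           # True would steer away from a concept
    edit_guidance_scale=[5, 5],
    edit_warmup_steps=[10, 10],
    edit_threshold=[0.9, 0.9],
    edit_momentum_scale=0.1,
    generator=paddle.Generator().manual_seed(0),
)
out.images[0].save("sega_castle.png")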
@@ -399,61 +398,53 @@ def __call__(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
- removed_text = self.tokenizer.batch_decode(
- text_input_ids[:, self.tokenizer.model_max_length:])
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
text_embeddings = self.text_encoder(text_input_ids)[0]
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1])
- text_embeddings = text_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if enable_edit_guidance:
# get safety text embeddings
if editing_prompt_embeddings is None:
edit_concepts_input = self.tokenizer(
- [
- x
- for item in editing_prompt
- for x in repeat(item, batch_size)
- ],
+ [x for item in editing_prompt for x in repeat(item, batch_size)],
padding="max_length",
max_length=self.tokenizer.model_max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
edit_concepts_input_ids = edit_concepts_input.input_ids
- if edit_concepts_input_ids.shape[
- -1] > self.tokenizer.model_max_length:
+ if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length:
removed_text = self.tokenizer.batch_decode(
- edit_concepts_input_ids[:, self.tokenizer.
- model_max_length:])
+ edit_concepts_input_ids[:, self.tokenizer.model_max_length :]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
)
- edit_concepts_input_ids = edit_concepts_input_ids[:, :self.
- tokenizer.
- model_max_length]
+ edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
edit_concepts = self.text_encoder(edit_concepts_input_ids)[0]
else:
- edit_concepts = editing_prompt_embeddings.tile(
- [batch_size, 1, 1])
+ edit_concepts = editing_prompt_embeddings.tile([batch_size, 1, 1])
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed_edit, seq_len_edit, _ = edit_concepts.shape
edit_concepts = edit_concepts.tile([1, num_images_per_prompt, 1])
- edit_concepts = edit_concepts.reshape(
- [bs_embed_edit * num_images_per_prompt, seq_len_edit, -1])
+ edit_concepts = edit_concepts.reshape([bs_embed_edit * num_images_per_prompt, seq_len_edit, -1])
        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -468,14 +459,16 @@ def __call__(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -485,25 +478,22 @@ def __call__(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = uncond_embeddings.shape[1]
- uncond_embeddings = uncond_embeddings.tile(
- [batch_size, num_images_per_prompt, 1])
- uncond_embeddings = uncond_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1])
+ uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
if enable_edit_guidance:
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings, edit_concepts])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings, edit_concepts])
else:
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
# get the initial random noise unless the user supplied it
# 4. Prepare timesteps
@@ -519,7 +509,8 @@ def __call__(
width,
text_embeddings.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -534,41 +525,39 @@ def __call__(
for i, t in enumerate(self.progress_bar(timesteps)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] *
- (2 + enabled_editing_prompts))
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = (
+ paddle.concat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents
+ )
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
- noise_pred_out = noise_pred.chunk(
- 2 + enabled_editing_prompts) # [b,4, 64, 64]
+ noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts) # [b,4, 64, 64]
noise_pred_uncond, noise_pred_text = (
noise_pred_out[0],
- noise_pred_out[1], )
+ noise_pred_out[1],
+ )
noise_pred_edit_concepts = noise_pred_out[2:]
# default text guidance
- noise_guidance = guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_guidance = guidance_scale * (noise_pred_text - noise_pred_uncond)
# noise_guidance = (noise_pred_text - noise_pred_edit_concepts[0])
if self.uncond_estimates is None:
self.uncond_estimates = paddle.zeros(
(num_inference_steps + 1, *noise_pred_uncond.shape),
- dtype=noise_pred.dtype, )
+ dtype=noise_pred.dtype,
+ )
self.uncond_estimates[i] = noise_pred_uncond.detach()
if self.text_estimates is None:
self.text_estimates = paddle.zeros(
(num_inference_steps + 1, *noise_pred_text.shape),
- dtype=noise_pred.dtype, )
+ dtype=noise_pred.dtype,
+ )
self.text_estimates[i] = noise_pred_text.detach()
if self.edit_estimates is None and enable_edit_guidance:
@@ -576,29 +565,32 @@ def __call__(
(
num_inference_steps + 1,
len(noise_pred_edit_concepts),
- *noise_pred_edit_concepts[0].shape, ),
- dtype=noise_pred.dtype, )
+ *noise_pred_edit_concepts[0].shape,
+ ),
+ dtype=noise_pred.dtype,
+ )
if self.sem_guidance is None:
self.sem_guidance = paddle.zeros(
(num_inference_steps + 1, *noise_pred_text.shape),
- dtype=noise_pred.dtype, )
+ dtype=noise_pred.dtype,
+ )
if edit_momentum is None:
edit_momentum = paddle.zeros_like(noise_guidance)
if enable_edit_guidance:
concept_weights = paddle.zeros(
- (len(noise_pred_edit_concepts),
- noise_guidance.shape[0]),
- dtype=noise_guidance.dtype, )
+ (len(noise_pred_edit_concepts), noise_guidance.shape[0]),
+ dtype=noise_guidance.dtype,
+ )
noise_guidance_edit = paddle.zeros(
(len(noise_pred_edit_concepts), *noise_guidance.shape),
- dtype=noise_guidance.dtype, )
+ dtype=noise_guidance.dtype,
+ )
# noise_guidance_edit = torch.zeros_like(noise_guidance)
warmup_inds = []
- for c, noise_pred_edit_concept in enumerate(
- noise_pred_edit_concepts):
+ for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts):
self.edit_estimates[i, c] = noise_pred_edit_concept
if isinstance(edit_guidance_scale, list):
edit_guidance_scale_c = edit_guidance_scale[c]
@@ -610,8 +602,7 @@ def __call__(
else:
edit_threshold_c = edit_threshold
if isinstance(reverse_editing_direction, list):
- reverse_editing_direction_c = reverse_editing_direction[
- c]
+ reverse_editing_direction_c = reverse_editing_direction[c]
else:
reverse_editing_direction_c = reverse_editing_direction
if edit_weights:
@@ -632,27 +623,19 @@ def __call__(
if i >= edit_warmup_steps_c:
warmup_inds.append(c)
if i >= edit_cooldown_steps_c:
- noise_guidance_edit[
- c, :, :, :, :] = paddle.zeros_like(
- noise_pred_edit_concept)
+ noise_guidance_edit[c, :, :, :, :] = paddle.zeros_like(noise_pred_edit_concept)
continue
- noise_guidance_edit_tmp = (
- noise_pred_edit_concept - noise_pred_uncond)
+ noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond
# tmp_weights = (noise_pred_text - noise_pred_edit_concept).sum(dim=(1, 2, 3))
- tmp_weights = (
- noise_guidance - noise_pred_edit_concept).sum(
- (1, 2, 3))
+ tmp_weights = (noise_guidance - noise_pred_edit_concept).sum((1, 2, 3))
- tmp_weights = paddle.full_like(
- tmp_weights,
- edit_weight_c) # * (1 / enabled_editing_prompts)
+ tmp_weights = paddle.full_like(tmp_weights, edit_weight_c) # * (1 / enabled_editing_prompts)
if reverse_editing_direction_c:
noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1
concept_weights[c, :] = tmp_weights
- noise_guidance_edit_tmp = (noise_guidance_edit_tmp *
- edit_guidance_scale_c)
+ noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c
# quantile function expects float32
if noise_guidance_edit_tmp.dtype == paddle.float32:
@@ -660,23 +643,22 @@ def __call__(
paddle.abs(noise_guidance_edit_tmp).flatten(2),
edit_threshold_c,
axis=2,
- keepdim=False, )
+ keepdim=False,
+ )
else:
tmp = quantile(
- paddle.abs(noise_guidance_edit_tmp).flatten(2)
- .cast(paddle.float32),
+ paddle.abs(noise_guidance_edit_tmp).flatten(2).cast(paddle.float32),
edit_threshold_c,
axis=2,
keepdim=False,
).cast(noise_guidance_edit_tmp.dtype)
noise_guidance_edit_tmp = paddle.where(
- paddle.abs(noise_guidance_edit_tmp) >=
- tmp[:, :, None, None],
+ paddle.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None],
noise_guidance_edit_tmp,
- paddle.zeros_like(noise_guidance_edit_tmp), )
- noise_guidance_edit[
- c, :, :, :, :] = noise_guidance_edit_tmp
+ paddle.zeros_like(noise_guidance_edit_tmp),
+ )
+ noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp
# noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp
@@ -685,22 +667,21 @@ def __call__(
# concept_weights = concept_weights.to("cpu") # Offload to cpu
# noise_guidance_edit = noise_guidance_edit.to("cpu")
- concept_weights_tmp = paddle.index_select(
- concept_weights, warmup_inds, 0)
+ concept_weights_tmp = paddle.index_select(concept_weights, warmup_inds, 0)
concept_weights_tmp = paddle.where(
concept_weights_tmp < 0,
paddle.zeros_like(concept_weights_tmp),
- concept_weights_tmp, )
- concept_weights_tmp = (concept_weights_tmp /
- concept_weights_tmp.sum(0))
+ concept_weights_tmp,
+ )
+ concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(0)
# concept_weights_tmp = torch.nan_to_num(concept_weights_tmp)
- noise_guidance_edit_tmp = paddle.index_select(
- noise_guidance_edit, warmup_inds, 0)
+ noise_guidance_edit_tmp = paddle.index_select(noise_guidance_edit, warmup_inds, 0)
noise_guidance_edit_tmp = paddle.einsum(
"cb,cbijk->bijk",
concept_weights_tmp,
- noise_guidance_edit_tmp, )
+ noise_guidance_edit_tmp,
+ )
noise_guidance_edit_tmp = noise_guidance_edit_tmp
noise_guidance = noise_guidance + noise_guidance_edit_tmp
@@ -714,17 +695,15 @@ def __call__(
concept_weights = paddle.where(
concept_weights < 0,
paddle.zeros_like(concept_weights),
- concept_weights, )
+ concept_weights,
+ )
# concept_weights = paddle.nan_to_num(concept_weights)
- noise_guidance_edit = paddle.einsum(
- "cb,cbijk->bijk", concept_weights, noise_guidance_edit)
+ noise_guidance_edit = paddle.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit)
- noise_guidance_edit = (noise_guidance_edit +
- edit_momentum_scale * edit_momentum)
+ noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum
- edit_momentum = (edit_mom_beta * edit_momentum +
- (1 - edit_mom_beta) * noise_guidance_edit)
+ edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit
if warmup_inds.shape[0] == len(noise_pred_edit_concepts):
noise_guidance = noise_guidance + noise_guidance_edit
@@ -737,8 +716,7 @@ def __call__(
noise_pred = noise_pred_uncond + noise_guidance
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -748,12 +726,11 @@ def __call__(
image = self.decode_latents(latents)
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
images=image,
- clip_input=safety_checker_input.pixel_values.cast(
- text_embeddings.dtype), )
+ clip_input=safety_checker_input.pixel_values.cast(text_embeddings.dtype),
+ )
else:
has_nsfw_concept = None
@@ -763,5 +740,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return SemanticStableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
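Aside (not part of the patch): the hunks above reformat the semantic-guidance update of this SEGA-style pipeline, where per-concept noise predictions are quantile-thresholded, combined, and smoothed with momentum before being added to the classifier-free guidance term. The NumPy sketch below is a simplified illustration of that update, not the pipeline's exact method: it uses uniform concept weighting in place of the warm-up/concept-weight einsum, and the function name `sega_step` and all default hyper-parameter values are illustrative assumptions.

```python
import numpy as np

def sega_step(noise_uncond, noise_text, noise_edits, momentum,
              guidance_scale=7.5, edit_guidance_scale=5.0, edit_threshold=0.9,
              edit_momentum_scale=0.1, edit_mom_beta=0.4):
    """One simplified semantic-guidance update for (b, c, h, w) noise predictions."""
    # standard classifier-free guidance term
    noise_guidance = guidance_scale * (noise_text - noise_uncond)

    edit_terms = []
    for noise_edit in noise_edits:
        tmp = (noise_edit - noise_uncond) * edit_guidance_scale
        # per-sample quantile threshold: keep only the strongest edit directions
        thresh = np.quantile(np.abs(tmp).reshape(tmp.shape[0], -1), edit_threshold, axis=1)
        tmp = np.where(np.abs(tmp) >= thresh[:, None, None, None], tmp, 0.0)
        edit_terms.append(tmp)

    # uniform weighting over concepts (the pipeline combines them with normalized
    # per-concept weights via an einsum instead)
    noise_guidance_edit = np.mean(edit_terms, axis=0)
    noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * momentum
    momentum = edit_mom_beta * momentum + (1.0 - edit_mom_beta) * noise_guidance_edit
    return noise_guidance + noise_guidance_edit, momentum

# usage: momentum starts at zero and is carried across denoising steps
b, c, h, w = 1, 4, 8, 8
rng = np.random.default_rng(0)
uncond = rng.normal(size=(b, c, h, w))
text = rng.normal(size=(b, c, h, w))
edits = [rng.normal(size=(b, c, h, w))]
guidance, momentum = sega_step(uncond, text, edits, momentum=np.zeros((b, c, h, w)))
```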
diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py
index 44d2a3ed3c947..53dd30da98557 100644
--- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py
@@ -12,8 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from ...utils import (OptionalDependencyNotAvailable, is_note_seq_available,
- is_paddle_available, is_paddlenlp_available)
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_note_seq_available,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
try:
if not (is_paddlenlp_available() and is_paddle_available()):
@@ -23,10 +27,12 @@
else:
from .notes_encoder import SpectrogramNotesEncoder
from .pipeline_spectrogram_diffusion import (
- SpectrogramContEncoder, SpectrogramDiffusionPipeline, T5FilmDecoder)
+ SpectrogramContEncoder,
+ SpectrogramDiffusionPipeline,
+ T5FilmDecoder,
+ )
try:
- if not (is_paddlenlp_available() and is_paddle_available() and
- is_note_seq_available()):
+ if not (is_paddlenlp_available() and is_paddle_available() and is_note_seq_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import *
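Aside (not part of the patch): this `__init__.py` hunk only re-wraps the optional-dependency guards. For readers unfamiliar with the pattern, the hypothetical sketch below shows how such a guard typically behaves: when a backend such as `note_seq` is missing, a placeholder is exported that fails with an installation hint only when it is actually used. All names here are illustrative stand-ins, not the real dummy objects from `ppdiffusers.utils`.

```python
class OptionalDependencyNotAvailable(Exception):
    """Raised when an optional backend is not installed."""

def is_note_seq_available() -> bool:
    try:
        import note_seq  # noqa: F401
        return True
    except ImportError:
        return False

class _DummyMidiProcessor:
    """Placeholder that defers the failure until the class is instantiated."""
    def __init__(self, *args, **kwargs):
        raise ImportError("MidiProcessor requires `note_seq`; install it with `pip install note_seq`.")

try:
    if not is_note_seq_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    MidiProcessor = _DummyMidiProcessor  # fall back to the placeholder
else:
    class MidiProcessor:  # stand-in for `from .midi_utils import MidiProcessor`
        pass
```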
diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py
index 4378ce01e5784..d09306582dc21 100644
--- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py
+++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py
@@ -17,28 +17,27 @@
from paddlenlp.transformers.t5.configuration import T5Config
from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm
-from ...configuration_utils import (ConfigMixin, ModuleUtilsMixin,
- register_to_config)
+from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config
from ...models import ModelMixin
class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin):
@register_to_config
def __init__(
- self,
- input_dims: int,
- targets_context_length: int,
- d_model: int,
- dropout_rate: float,
- num_layers: int,
- num_heads: int,
- d_kv: int,
- d_ff: int,
- feed_forward_proj: str,
- is_decoder: bool=False, ):
+ self,
+ input_dims: int,
+ targets_context_length: int,
+ d_model: int,
+ dropout_rate: float,
+ num_layers: int,
+ num_heads: int,
+ d_kv: int,
+ d_ff: int,
+ feed_forward_proj: str,
+ is_decoder: bool = False,
+ ):
super().__init__()
- self.input_proj = nn.Linear(
- in_features=input_dims, out_features=d_model, bias_attr=False)
+ self.input_proj = nn.Linear(in_features=input_dims, out_features=d_model, bias_attr=False)
self.position_encoding = nn.Embedding(targets_context_length, d_model)
self.position_encoding.weight.stop_gradient = True
self.dropout_pre = nn.Dropout(p=dropout_rate)
@@ -50,7 +49,8 @@ def __init__(
feed_forward_proj=feed_forward_proj,
dropout_rate=dropout_rate,
is_decoder=is_decoder,
- is_encoder_decoder=False, )
+ is_encoder_decoder=False,
+ )
self.encoders = nn.LayerList()
for lyr_num in range(num_layers):
lyr = T5Block(t5config)
@@ -66,17 +66,13 @@ def forward(self, encoder_inputs, encoder_inputs_mask):
input_positions = paddle.arange(end=max_positions)
seq_lens = encoder_inputs_mask.sum(axis=-1)
- input_positions = paddle.roll(
- x=input_positions.unsqueeze(axis=0),
- shifts=tuple(seq_lens.tolist()),
- axis=0)
+ input_positions = paddle.roll(x=input_positions.unsqueeze(axis=0), shifts=tuple(seq_lens.tolist()), axis=0)
x += self.position_encoding(input_positions)
x = self.dropout_pre(x)
# inverted the attention mask
input_shape = encoder_inputs.shape
- extended_attention_mask = self.get_extended_attention_mask(
- encoder_inputs_mask, input_shape)
+ extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape)
for lyr in self.encoders:
x = lyr(x, extended_attention_mask)[0]
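Aside (not part of the patch): the encoder above feeds a 0/1 frame mask through `get_extended_attention_mask` before the T5 blocks. Assuming the standard Transformers-style implementation of that mixin method, the mask is broadcast to attention-score shape and turned into an additive bias, as the small illustration below shows; the concrete values and the `1e4` fill magnitude are assumptions for illustration only.

```python
import numpy as np

mask = np.array([[1, 1, 1, 0, 0]], dtype=np.float32)  # (batch, seq): 1 = real frame, 0 = padding
extended = mask[:, None, None, :]                      # (batch, 1, 1, seq), broadcastable over heads and queries
additive_bias = (extended - 1.0) * 1e4                 # 0 where attention is allowed, -10000 where masked
print(additive_bias[0, 0, 0])                          # [0. 0. 0. -10000. -10000.]
```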
diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py
index d8dcc8a98cf87..3997ce07f5845 100644
--- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py
+++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py
@@ -15,8 +15,17 @@
import dataclasses
import math
import os
-from typing import (Any, Callable, List, Mapping, MutableMapping, Optional,
- Sequence, Tuple, Union)
+from typing import (
+ Any,
+ Callable,
+ List,
+ Mapping,
+ MutableMapping,
+ Optional,
+ Sequence,
+ Tuple,
+ Union,
+)
import numpy as np
import paddle
@@ -96,8 +105,7 @@ class NoteEncodingState:
"""Encoding state for note transcription, keeping track of active pitches."""
# velocity bin for active pitches and programs
- active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(
- default_factory=dict)
+ active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict)
@dataclasses.dataclass
@@ -149,10 +157,11 @@ class Codec:
"""
def __init__(
- self,
- max_shift_steps: int,
- steps_per_second: float,
- event_ranges: List[EventRange], ):
+ self,
+ max_shift_steps: int,
+ steps_per_second: float,
+ event_ranges: List[EventRange],
+ ):
"""Define Codec.
Args:
@@ -162,14 +171,11 @@ def __init__(
event_ranges: Other supported event types and their ranges.
"""
self.steps_per_second = steps_per_second
- self._shift_range = EventRange(
- type="shift", min_value=0, max_value=max_shift_steps)
+ self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps)
self._event_ranges = [self._shift_range] + event_ranges
# Ensure all event types have unique names.
- assert len(self._event_ranges) == len(
- {er.type
- for er in self._event_ranges})
+ assert len(self._event_ranges) == len({er.type for er in self._event_ranges})
@property
def num_classes(self) -> int:
@@ -179,8 +185,7 @@ def num_classes(self) -> int:
# events that are intended to be used from within autograph functions.
def is_shift_event_index(self, index: int) -> bool:
- return (self._shift_range.min_value <= index and
- index <= self._shift_range.max_value)
+ return self._shift_range.min_value <= index and index <= self._shift_range.max_value
@property
def max_shift_steps(self) -> int:
@@ -235,31 +240,29 @@ def programs_to_midi_classes(tokens, codec):
"""Modifies program events to be the first program in the MIDI class."""
min_program_id, max_program_id = codec.event_type_range("program")
is_program = (tokens >= min_program_id) & (tokens <= max_program_id)
- return np.where(is_program, min_program_id + 8 * (
- (tokens - min_program_id) // 8), tokens)
+ return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens)
PROGRAM_GRANULARITIES = {
# "flat" granularity; drop program change tokens and set NoteSequence
# programs to zero
- "flat": ProgramGranularity(
- tokens_map_fn=drop_programs, program_map_fn=lambda program: 0),
+ "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0),
# map each program to the first program in its MIDI class
"midi_class": ProgramGranularity(
tokens_map_fn=programs_to_midi_classes,
- program_map_fn=lambda program: 8 * (program // 8), ),
+ program_map_fn=lambda program: 8 * (program // 8),
+ ),
# leave programs as is
"full": ProgramGranularity(
tokens_map_fn=lambda tokens, codec: tokens,
- program_map_fn=lambda program: program, ),
+ program_map_fn=lambda program: program,
+ ),
}
def unfold(tensor, dimension, size, step=1):
- assert dimension < len(
- tensor.shape), "dimension must be less than tensor dimensions"
- assert (tensor.shape[dimension] >= size
- ), "size should not be greater than the dimension of tensor"
+ assert dimension < len(tensor.shape), "dimension must be less than tensor dimensions"
+ assert tensor.shape[dimension] >= size, "size should not be greater than the dimension of tensor"
slices = []
for i in range(0, tensor.shape[dimension] - size + 1, step):
@@ -276,24 +279,19 @@ def unfold(tensor, dimension, size, step=1):
return unfolded_tensor
-def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0,
- axis=-1):
+def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
"""
equivalent of tf.signal.frame
"""
signal_length = signal.shape[axis]
if pad_end:
frames_overlap = frame_length - frame_step
- rest_samples = np.abs(signal_length - frames_overlap) % np.abs(
- frame_length - frames_overlap)
+ rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap)
pad_size = int(frame_length - rest_samples)
if pad_size != 0:
pad_axis = [0] * signal.ndim
pad_axis[axis] = pad_size
- signal = F.pad(x=signal,
- pad=pad_axis,
- mode="constant",
- value=pad_value)
+ signal = F.pad(x=signal, pad=pad_axis, mode="constant", value=pad_value)
frames = unfold(signal, axis, frame_length, frame_step)
return frames
@@ -305,28 +303,26 @@ def program_to_slakh_program(program):
return slakh_program
-def audio_to_frames(
- samples, hop_size: int,
- frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]:
+def audio_to_frames(samples, hop_size: int, frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]:
"""Convert audio samples to non-overlapping frames and frame times."""
frame_size = hop_size
- samples = np.pad(samples, [0, frame_size - len(samples) % frame_size],
- mode="constant")
+ samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant")
# Split audio into frames.
frames = frame(
paddle.to_tensor(data=samples).unsqueeze(axis=0),
frame_length=frame_size,
frame_step=frame_size,
- pad_end=False, )
+ pad_end=False,
+ )
num_frames = len(samples) // frame_size
times = np.arange(num_frames) / frame_rate
return frames, times
def note_sequence_to_onsets_and_offsets_and_programs(
- ns: note_seq.NoteSequence, ) -> Tuple[Sequence[float], Sequence[
- NoteEventData]]:
+ ns: note_seq.NoteSequence,
+) -> Tuple[Sequence[float], Sequence[NoteEventData]]:
"""Extract onset & offset times and pitches & programs from a NoteSequence.
The onset & offset times will not necessarily be in sorted order.
@@ -341,21 +337,20 @@ def note_sequence_to_onsets_and_offsets_and_programs(
"""
# Sort by program and pitch and put offsets before onsets as a tiebreaker for
# subsequent stable sort.
- notes = sorted(
- ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch))
- times = [note.end_time for note in notes if not note.is_drum] + [
- note.start_time for note in notes
- ]
+ notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch))
+ times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes]
values = [
- NoteEventData(
- pitch=note.pitch, velocity=0, program=note.program, is_drum=False)
- for note in notes if not note.is_drum
+ NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False)
+ for note in notes
+ if not note.is_drum
] + [
NoteEventData(
pitch=note.pitch,
velocity=note.velocity,
program=note.program,
- is_drum=note.is_drum, ) for note in notes
+ is_drum=note.is_drum,
+ )
+ for note in notes
]
return times, values
@@ -368,20 +363,19 @@ def num_velocity_bins_from_codec(codec: Codec):
# segment an array into segments of length n
def segment(a, n):
- return [a[i:i + n] for i in range(0, len(a), n)]
+ return [a[i : i + n] for i in range(0, len(a), n)]
def velocity_to_bin(velocity, num_velocity_bins):
if velocity == 0:
return 0
else:
- return math.ceil(num_velocity_bins * velocity /
- note_seq.MAX_MIDI_VELOCITY)
+ return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY)
-def note_event_data_to_events(state: Optional[NoteEncodingState],
- value: NoteEventData,
- codec: Codec) -> Sequence[Event]:
+def note_event_data_to_events(
+ state: Optional[NoteEncodingState], value: NoteEventData, codec: Codec
+) -> Sequence[Event]:
"""Convert note event data to a sequence of events."""
if value.velocity is None:
# onsets only, no program or velocity
@@ -393,9 +387,7 @@ def note_event_data_to_events(state: Optional[NoteEncodingState],
# onsets + offsets + velocities only, no programs
if state is not None:
state.active_pitches[value.pitch, 0] = velocity_bin
- return [
- Event("velocity", velocity_bin), Event("pitch", value.pitch)
- ]
+ return [Event("velocity", velocity_bin), Event("pitch", value.pitch)]
elif value.is_drum:
# drum events use a separate vocabulary
return [Event("velocity", velocity_bin), Event("drum", value.pitch)]
@@ -413,8 +405,7 @@ def note_event_data_to_events(state: Optional[NoteEncodingState],
def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]:
"""Output program and pitch events for active notes plus a final tie event."""
events = []
- for pitch, program in sorted(
- state.active_pitches.keys(), key=lambda k: k[::-1]):
+ for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]):
if state.active_pitches[pitch, program]:
events += [Event("program", program), Event("pitch", pitch)]
events.append(Event("tie", 0))
@@ -422,13 +413,14 @@ def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]:
def encode_and_index_events(
- state,
- event_times,
- event_values,
- codec,
- frame_times,
- encode_event_fn,
- encoding_state_to_events_fn=None, ):
+ state,
+ event_times,
+ event_values,
+ codec,
+ frame_times,
+ encode_event_fn,
+ encoding_state_to_events_fn=None,
+):
"""Encode a sequence of timed events and index to audio frame times.
Encodes time shifts as repeated single step shifts for later run length encoding.
@@ -460,9 +452,7 @@ def encode_and_index_events(
state_event_indices: Corresponding state event index for every audio frame.
"""
indices = np.argsort(event_times, kind="stable")
- event_steps = [
- round(event_times[i] * codec.steps_per_second) for i in indices
- ]
+ event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices]
event_values = [event_values[i] for i in indices]
events = []
state_events = []
@@ -473,9 +463,10 @@ def encode_and_index_events(
cur_state_event_idx = 0
def fill_event_start_indices_to_cur_step():
- while (len(event_start_indices) < len(frame_times) and
- frame_times[len(event_start_indices)] < cur_step /
- codec.steps_per_second):
+ while (
+ len(event_start_indices) < len(frame_times)
+ and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second
+ ):
event_start_indices.append(cur_event_idx)
state_event_indices.append(cur_state_event_idx)
@@ -511,28 +502,24 @@ def fill_event_start_indices_to_cur_step():
events = np.array(events).astype(np.int32)
state_events = np.array(state_events).astype(np.int32)
- event_start_indices = segment(
- np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
- event_end_indices = segment(
- np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
- state_event_indices = segment(
- np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
+ event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
+ event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
+ state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH)
outputs = []
- for start_indices, end_indices, event_indices in zip(
- event_start_indices, event_end_indices, state_event_indices):
- outputs.append({
- "inputs": events,
- "event_start_indices": start_indices,
- "event_end_indices": end_indices,
- "state_events": state_events,
- "state_event_indices": event_indices,
- })
+ for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices):
+ outputs.append(
+ {
+ "inputs": events,
+ "event_start_indices": start_indices,
+ "event_end_indices": end_indices,
+ "state_events": state_events,
+ "state_event_indices": event_indices,
+ }
+ )
return outputs
-def extract_sequence_with_indices(features,
- state_events_end_token=None,
- feature_key="inputs"):
+def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"):
"""Extract target sequence corresponding to audio token segment."""
features = features.copy()
start_idx = features["event_start_indices"][0]
@@ -543,36 +530,33 @@ def extract_sequence_with_indices(features,
# prepend them to the targets array.
state_event_start_idx = features["state_event_indices"][0]
state_event_end_idx = state_event_start_idx + 1
- while (features["state_events"][state_event_end_idx - 1] !=
- state_events_end_token):
+ while features["state_events"][state_event_end_idx - 1] != state_events_end_token:
state_event_end_idx += 1
features[feature_key] = np.concatenate(
[
- features["state_events"][state_event_start_idx:
- state_event_end_idx],
+ features["state_events"][state_event_start_idx:state_event_end_idx],
features[feature_key],
],
- axis=0, )
+ axis=0,
+ )
return features
-def map_midi_programs(feature,
- codec: Codec,
- granularity_type: str="full",
- feature_key: str="inputs") -> Mapping[str, Any]:
+def map_midi_programs(
+ feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs"
+) -> Mapping[str, Any]:
"""Apply MIDI program map to token sequences."""
granularity = PROGRAM_GRANULARITIES[granularity_type]
- feature[feature_key] = granularity.tokens_map_fn(feature[feature_key],
- codec)
+ feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec)
return feature
def run_length_encode_shifts_fn(
- features,
- codec: Codec,
- feature_key: str="inputs",
- state_change_event_types: Sequence[str]=(
- ), ) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]:
+ features,
+ codec: Codec,
+ feature_key: str = "inputs",
+ state_change_event_types: Sequence[str] = (),
+) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]:
"""Return a function that run-length encodes shifts for a given codec.
Args:
@@ -585,13 +569,9 @@ def run_length_encode_shifts_fn(
Returns:
A preprocessing function that run-length encodes single-step shifts.
"""
- state_change_event_ranges = [
- codec.event_type_range(event_type)
- for event_type in state_change_event_types
- ]
+ state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types]
- def run_length_encode_shifts(
- features: MutableMapping[str, Any]) -> Mapping[str, Any]:
+ def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]:
"""Combine leading/interior shifts, trim trailing shifts.
Args:
@@ -613,8 +593,7 @@ def run_length_encode_shifts(
# If this event is a state change and has the same value as the current
# state, we can skip it entirely.
is_redundant = False
- for i, (min_index,
- max_index) in enumerate(state_change_event_ranges):
+ for i, (min_index, max_index) in enumerate(state_change_event_ranges):
if min_index <= event and event <= max_index:
if current_state[i] == event:
is_redundant = True
@@ -627,10 +606,8 @@ def run_length_encode_shifts(
if shift_steps > 0:
shift_steps = total_shift_steps
while shift_steps > 0:
- output_steps = np.minimum(codec.max_shift_steps,
- shift_steps)
- output = np.concatenate(
- [output, [output_steps]], axis=0)
+ output_steps = np.minimum(codec.max_shift_steps, shift_steps)
+ output = np.concatenate([output, [output_steps]], axis=0)
shift_steps -= output_steps
output = np.concatenate([output, [event]], axis=0)
features[feature_key] = output
@@ -639,42 +616,32 @@ def run_length_encode_shifts(
return run_length_encode_shifts(features)
-def note_representation_processor_chain(
- features,
- codec: Codec,
- note_representation_config: NoteRepresentationConfig):
+def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig):
tie_token = codec.encode_event(Event("tie", 0))
- state_events_end_token = (tie_token if
- note_representation_config.include_ties else None)
+ state_events_end_token = tie_token if note_representation_config.include_ties else None
features = extract_sequence_with_indices(
- features,
- state_events_end_token=state_events_end_token,
- feature_key="inputs")
+ features, state_events_end_token=state_events_end_token, feature_key="inputs"
+ )
features = map_midi_programs(features, codec)
- features = run_length_encode_shifts_fn(
- features, codec, state_change_event_types=["velocity", "program"])
+ features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"])
return features
class MidiProcessor:
def __init__(self):
self.codec = Codec(
- max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS *
- DEFAULT_STEPS_PER_SECOND,
+ max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND,
steps_per_second=DEFAULT_STEPS_PER_SECOND,
event_ranges=[
- EventRange("pitch", note_seq.MIN_MIDI_PITCH,
- note_seq.MAX_MIDI_PITCH),
+ EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH),
EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS),
EventRange("tie", 0, 0),
- EventRange("program", note_seq.MIN_MIDI_PROGRAM,
- note_seq.MAX_MIDI_PROGRAM),
- EventRange("drum", note_seq.MIN_MIDI_PITCH,
- note_seq.MAX_MIDI_PITCH),
- ], )
+ EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM),
+ EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH),
+ ],
+ )
self.tokenizer = Tokenizer(self.codec.num_classes)
- self.note_representation_config = NoteRepresentationConfig(
- onsets_only=False, include_ties=True)
+ self.note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True)
def __call__(self, midi: Union[bytes, os.PathLike, str]):
if not isinstance(midi, bytes):
@@ -695,13 +662,10 @@ def __call__(self, midi: Union[bytes, os.PathLike, str]):
frame_times=frame_times,
codec=self.codec,
encode_event_fn=note_event_data_to_events,
- encoding_state_to_events_fn=note_encoding_state_to_events, )
+ encoding_state_to_events_fn=note_encoding_state_to_events,
+ )
events = [
- note_representation_processor_chain(event, self.codec,
- self.note_representation_config)
- for event in events
- ]
- input_tokens = [
- self.tokenizer.encode(event["inputs"]) for event in events
+ note_representation_processor_chain(event, self.codec, self.note_representation_config) for event in events
]
+ input_tokens = [self.tokenizer.encode(event["inputs"]) for event in events]
return input_tokens
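Aside (not part of the patch): among the helpers reformatted above, `segment` chunks an event array into fixed-length pieces and `velocity_to_bin` quantizes MIDI velocities into a small number of bins, with velocity 0 kept as the dedicated "note off" bin. The self-contained snippet below re-states both, inlining `note_seq.MAX_MIDI_VELOCITY` as 127; that value is an assumption about the constant rather than something shown in this diff.

```python
import math

MAX_MIDI_VELOCITY = 127  # assumed value of note_seq.MAX_MIDI_VELOCITY

def segment(a, n):
    """Split `a` into consecutive chunks of length at most `n`."""
    return [a[i : i + n] for i in range(0, len(a), n)]

def velocity_to_bin(velocity, num_velocity_bins):
    """Map a MIDI velocity in [0, 127] to a bin in [0, num_velocity_bins]."""
    if velocity == 0:
        return 0
    return math.ceil(num_velocity_bins * velocity / MAX_MIDI_VELOCITY)

print(segment(list(range(7)), 3))   # [[0, 1, 2], [3, 4, 5], [6]]
print(velocity_to_bin(0, 4))        # 0  (silence / note-off)
print(velocity_to_bin(64, 1))       # 1  (any non-zero velocity falls into the single bin)
print(velocity_to_bin(127, 4))      # 4  (loudest velocity maps to the top bin)
```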
diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py
index 73d0d48ee3f28..bcf4c659a6e5f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py
+++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py
@@ -17,25 +17,25 @@
from paddlenlp.transformers.t5.configuration import T5Config
from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm
-from ...configuration_utils import (ConfigMixin, ModuleUtilsMixin,
- register_to_config)
+from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config
from ...models import ModelMixin
class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin):
@register_to_config
def __init__(
- self,
- max_length: int,
- vocab_size: int,
- d_model: int,
- dropout_rate: float,
- num_layers: int,
- num_heads: int,
- d_kv: int,
- d_ff: int,
- feed_forward_proj: str,
- is_decoder: bool=False, ):
+ self,
+ max_length: int,
+ vocab_size: int,
+ d_model: int,
+ dropout_rate: float,
+ num_layers: int,
+ num_heads: int,
+ d_kv: int,
+ d_ff: int,
+ feed_forward_proj: str,
+ is_decoder: bool = False,
+ ):
super().__init__()
self.token_embedder = nn.Embedding(vocab_size, d_model)
self.position_encoding = nn.Embedding(max_length, d_model)
@@ -50,7 +50,8 @@ def __init__(
dropout_rate=dropout_rate,
feed_forward_proj=feed_forward_proj,
is_decoder=is_decoder,
- is_encoder_decoder=False, )
+ is_encoder_decoder=False,
+ )
self.encoders = nn.LayerList()
for lyr_num in range(num_layers):
lyr = T5Block(t5config)
@@ -67,8 +68,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask):
# inverted the attention mask
input_shape = encoder_input_tokens.shape
- extended_attention_mask = self.get_extended_attention_mask(
- encoder_inputs_mask, input_shape)
+ extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape)
for lyr in self.encoders:
x = lyr(x, extended_attention_mask)[0]
x = self.layer_norm(x)
diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
index a7c2673f560f3..000fc9a868b02 100644
--- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
@@ -33,12 +33,13 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
_optional_components = ["melgan"]
def __init__(
- self,
- notes_encoder: SpectrogramNotesEncoder,
- continuous_encoder: SpectrogramContEncoder,
- decoder: T5FilmDecoder,
- scheduler: DDPMScheduler,
- melgan: (Any), ) -> None:
+ self,
+ notes_encoder: SpectrogramNotesEncoder,
+ continuous_encoder: SpectrogramContEncoder,
+ decoder: T5FilmDecoder,
+ scheduler: DDPMScheduler,
+ melgan: (Any),
+ ) -> None:
super().__init__()
# From MELGAN
@@ -50,25 +51,23 @@ def __init__(
continuous_encoder=continuous_encoder,
decoder=decoder,
scheduler=scheduler,
- melgan=melgan, )
+ melgan=melgan,
+ )
def scale_features(self, features, output_range=(-1.0, 1.0), clip=False):
"""Linearly scale features to network outputs range."""
min_out, max_out = output_range
if clip:
- features = paddle.clip(
- x=features, min=self.min_value, max=self.max_value)
+ features = paddle.clip(x=features, min=self.min_value, max=self.max_value)
# Scale to [0, 1].
- zero_one = (features - self.min_value) / (
- self.max_value - self.min_value)
+ zero_one = (features - self.min_value) / (self.max_value - self.min_value)
# Scale to [min_out, max_out].
return zero_one * (max_out - min_out) + min_out
def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False):
"""Invert by linearly scaling network outputs to features range."""
min_out, max_out = input_range
- outputs = paddle.clip(
- x=outputs, min=min_out, max=max_out) if clip else outputs
+ outputs = paddle.clip(x=outputs, min=min_out, max=max_out) if clip else outputs
# Scale to [0, 1].
zero_one = (outputs - min_out) / (max_out - min_out)
# Scale to [self.min_value, self.max_value].
@@ -77,29 +76,27 @@ def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False):
def encode(self, input_tokens, continuous_inputs, continuous_mask):
tokens_mask = input_tokens > 0
tokens_encoded, tokens_mask = self.notes_encoder(
- encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask)
+ encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask
+ )
continuous_encoded, continuous_mask = self.continuous_encoder(
- encoder_inputs=continuous_inputs.cast(
- self.continuous_encoder.dtype),
- encoder_inputs_mask=continuous_mask, )
- return [(tokens_encoded, tokens_mask), (continuous_encoded,
- continuous_mask)]
+ encoder_inputs=continuous_inputs.cast(self.continuous_encoder.dtype),
+ encoder_inputs_mask=continuous_mask,
+ )
+ return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)]
def decode(self, encodings_and_masks, input_tokens, noise_time):
timesteps = noise_time
if not paddle.is_tensor(x=timesteps):
- timesteps = paddle.to_tensor(
- data=[timesteps], dtype="int64", place=input_tokens.place)
+ timesteps = paddle.to_tensor(data=[timesteps], dtype="int64", place=input_tokens.place)
elif paddle.is_tensor(x=timesteps) and len(timesteps.shape) == 0:
if isinstance(input_tokens.place, paddle.dtype):
dtype = input_tokens.place
- elif isinstance(input_tokens.place,
- str) and input_tokens.place not in [
- "cpu",
- "cuda",
- "ipu",
- "xpu",
- ]:
+ elif isinstance(input_tokens.place, str) and input_tokens.place not in [
+ "cpu",
+ "cuda",
+ "ipu",
+ "xpu",
+ ]:
dtype = input_tokens.place
elif isinstance(input_tokens.place, paddle.Tensor):
dtype = input_tokens.place.dtype
@@ -107,40 +104,41 @@ def decode(self, encodings_and_masks, input_tokens, noise_time):
dtype = timesteps[None].dtype
timesteps = timesteps[None].cast(dtype)
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps * paddle.ones(
- shape=input_tokens.shape[0], dtype=timesteps.dtype)
+ timesteps = timesteps * paddle.ones(shape=input_tokens.shape[0], dtype=timesteps.dtype)
logits = self.decoder(
encodings_and_masks=encodings_and_masks,
decoder_input_tokens=input_tokens,
- decoder_noise_time=timesteps, )
+ decoder_noise_time=timesteps,
+ )
return logits
@paddle.no_grad()
def __call__(
- self,
- input_tokens: List[List[int]],
- generator: Optional[paddle.Generator]=None,
- num_inference_steps: int=100,
- return_dict: bool=True,
- output_type: str="numpy",
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1, ) -> Union[AudioPipelineOutput, Tuple]:
- if (callback_steps is None or callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ self,
+ input_tokens: List[List[int]],
+ generator: Optional[paddle.Generator] = None,
+ num_inference_steps: int = 100,
+ return_dict: bool = True,
+ output_type: str = "numpy",
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ ) -> Union[AudioPipelineOutput, Tuple]:
+ if (
+ callback_steps is None
+ or callback_steps is not None
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
)
- pred_mel = np.zeros(
- [1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32)
+ pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32)
full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32)
ones = paddle.ones(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool)
for i, encoder_input_tokens in enumerate(input_tokens):
if i == 0:
- encoder_continuous_inputs = paddle.to_tensor(
- data=pred_mel[:1].copy()).cast(self.decoder.dtype)
+ encoder_continuous_inputs = paddle.to_tensor(data=pred_mel[:1].copy()).cast(self.decoder.dtype)
# The first chunk has no previous context.
- encoder_continuous_mask = paddle.zeros(
- shape=(1, TARGET_FEATURE_LENGTH), dtype=bool)
+ encoder_continuous_mask = paddle.zeros(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool)
else:
# The full song pipeline does not feed in a context feature, so the mask
# will be all 0s after the feature converter. Because we know we're
@@ -148,17 +146,19 @@ def __call__(
# to all 1s.
encoder_continuous_mask = ones
encoder_continuous_inputs = self.scale_features(
- encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True)
+ encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True
+ )
encodings_and_masks = self.encode(
- input_tokens=paddle.to_tensor(
- data=[encoder_input_tokens], dtype="int32"),
+ input_tokens=paddle.to_tensor(data=[encoder_input_tokens], dtype="int32"),
continuous_inputs=encoder_continuous_inputs,
- continuous_mask=encoder_continuous_mask, )
+ continuous_mask=encoder_continuous_mask,
+ )
# Sample encoder_continuous_inputs shaped gaussian noise to begin loop
x = randn_tensor(
shape=encoder_continuous_inputs.shape,
generator=generator,
- dtype=self.decoder.dtype, )
+ dtype=self.decoder.dtype,
+ )
# set step values
self.scheduler.set_timesteps(num_inference_steps)
# Denoising diffusion loop
@@ -166,26 +166,24 @@ def __call__(
output = self.decode(
encodings_and_masks=encodings_and_masks,
input_tokens=x,
- noise_time=t / self.scheduler.config.num_train_timesteps, )
+ noise_time=t / self.scheduler.config.num_train_timesteps,
+ )
# Compute previous output: x_t -> x_t-1
- x = self.scheduler.step(
- output, t, x, generator=generator).prev_sample
+ x = self.scheduler.step(output, t, x, generator=generator).prev_sample
mel = self.scale_to_features(x, input_range=[-1.0, 1.0])
encoder_continuous_inputs = mel[:1]
pred_mel = mel.cpu().astype(dtype="float32").numpy()
- full_pred_mel = np.concatenate(
- [full_pred_mel, pred_mel[:1]], axis=1)
+ full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1)
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
callback(i, full_pred_mel)
logger.info("Generated segment", i)
if output_type == "numpy":
- output = self.melgan(
- input_features=full_pred_mel.astype(np.float32))[0]
+ output = self.melgan(input_features=full_pred_mel.astype(np.float32))[0]
else:
output = full_pred_mel
if not return_dict:
- return (output, )
+ return (output,)
return AudioPipelineOutput(audios=output)
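Aside (not part of the patch): `scale_features` and `scale_to_features` in the pipeline above are plain linear maps between the MELGAN feature range `[min_value, max_value]` and the network range `[-1, 1]`. The quick numeric check below uses made-up stand-in values for `min_value` and `max_value` (the real ones come from the MELGAN statistics) to show that the two functions invert each other.

```python
import numpy as np

min_value, max_value = -100.0, 10.0  # illustrative stand-ins for the MELGAN feature range

def scale_features(features, output_range=(-1.0, 1.0)):
    min_out, max_out = output_range
    zero_one = (features - min_value) / (max_value - min_value)   # to [0, 1]
    return zero_one * (max_out - min_out) + min_out               # to [min_out, max_out]

def scale_to_features(outputs, input_range=(-1.0, 1.0)):
    min_out, max_out = input_range
    zero_one = (outputs - min_out) / (max_out - min_out)          # back to [0, 1]
    return zero_one * (max_value - min_value) + min_value         # back to the feature range

x = np.array([-100.0, -45.0, 10.0])
print(scale_features(x))                     # [-1.  0.  1.]
print(scale_to_features(scale_features(x)))  # recovers [-100.  -45.   10.]
```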
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py
index 5bcf303c00772..fa4dcc515380f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py
@@ -19,10 +19,15 @@
import numpy as np
import PIL.Image
-from ...utils import (BaseOutput, OptionalDependencyNotAvailable,
- is_fastdeploy_available, is_k_diffusion_available,
- is_k_diffusion_version, is_paddle_available,
- is_paddlenlp_available)
+from ...utils import (
+ BaseOutput,
+ OptionalDependencyNotAvailable,
+ is_fastdeploy_available,
+ is_k_diffusion_available,
+ is_k_diffusion_version,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
@dataclass
@@ -51,44 +56,46 @@ class StableDiffusionPipelineOutput(BaseOutput):
else:
# new added
from .hf_clip_model import (
- HFCLIPModel, HFCLIPTextModel, HFCLIPTextModelWithProjection,
- HFCLIPVisionModel, HFCLIPVisionModelWithProjection)
+ HFCLIPModel,
+ HFCLIPTextModel,
+ HFCLIPTextModelWithProjection,
+ HFCLIPVisionModel,
+ HFCLIPVisionModelWithProjection,
+ )
from .pipeline_cycle_diffusion import CycleDiffusionPipeline
from .pipeline_stable_diffusion import StableDiffusionPipeline
- from .pipeline_stable_diffusion_adapter import \
- StableDiffusionAdapterPipeline
- from .pipeline_stable_diffusion_all_in_one import \
- StableDiffusionPipelineAllinOne
- from .pipeline_stable_diffusion_attend_and_excite import \
- StableDiffusionAttendAndExcitePipeline
- from .pipeline_stable_diffusion_controlnet import \
- StableDiffusionControlNetPipeline
- from .pipeline_stable_diffusion_depth2img import \
- StableDiffusionDepth2ImgPipeline
- from .pipeline_stable_diffusion_image_variation import \
- StableDiffusionImageVariationPipeline
- from .pipeline_stable_diffusion_img2img import \
- StableDiffusionImg2ImgPipeline
- from .pipeline_stable_diffusion_inpaint import \
- StableDiffusionInpaintPipeline
- from .pipeline_stable_diffusion_inpaint_legacy import \
- StableDiffusionInpaintPipelineLegacy
- from .pipeline_stable_diffusion_instruct_pix2pix import \
- StableDiffusionInstructPix2PixPipeline
- from .pipeline_stable_diffusion_k_diffusion import \
- StableDiffusionKDiffusionPipeline
- from .pipeline_stable_diffusion_latent_upscale import \
- StableDiffusionLatentUpscalePipeline
+ from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline
+ from .pipeline_stable_diffusion_all_in_one import StableDiffusionPipelineAllinOne
+ from .pipeline_stable_diffusion_attend_and_excite import (
+ StableDiffusionAttendAndExcitePipeline,
+ )
+ from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline
+ from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline
+ from .pipeline_stable_diffusion_image_variation import (
+ StableDiffusionImageVariationPipeline,
+ )
+ from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
+ from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
+ from .pipeline_stable_diffusion_inpaint_legacy import (
+ StableDiffusionInpaintPipelineLegacy,
+ )
+ from .pipeline_stable_diffusion_instruct_pix2pix import (
+ StableDiffusionInstructPix2PixPipeline,
+ )
+ from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline
+ from .pipeline_stable_diffusion_latent_upscale import (
+ StableDiffusionLatentUpscalePipeline,
+ )
from .pipeline_stable_diffusion_mega import StableDiffusionMegaPipeline
- from .pipeline_stable_diffusion_model_editing import \
- StableDiffusionModelEditingPipeline
- from .pipeline_stable_diffusion_panorama import \
- StableDiffusionPanoramaPipeline
- from .pipeline_stable_diffusion_pix2pix_zero import \
- StableDiffusionPix2PixZeroPipeline
+ from .pipeline_stable_diffusion_model_editing import (
+ StableDiffusionModelEditingPipeline,
+ )
+ from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline
+ from .pipeline_stable_diffusion_pix2pix_zero import (
+ StableDiffusionPix2PixZeroPipeline,
+ )
from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline
- from .pipeline_stable_diffusion_upscale import \
- StableDiffusionUpscalePipeline
+ from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
from .pipeline_stable_unclip import StableUnCLIPPipeline
from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline
from .safety_checker import StableDiffusionSafetyChecker
@@ -100,21 +107,26 @@ class StableDiffusionPipelineOutput(BaseOutput):
except OptionalDependencyNotAvailable:
from ...utils.dummy_fastdeploy_objects import * # noqa F403
else:
- from .pipeline_fastdeploy_cycle_diffusion import \
- FastDeployCycleDiffusionPipeline
- from .pipeline_fastdeploy_stable_diffusion import \
- FastDeployStableDiffusionPipeline
- from .pipeline_fastdeploy_stable_diffusion_controlnet import \
- FastDeployStableDiffusionControlNetPipeline
- from .pipeline_fastdeploy_stable_diffusion_image_variation import \
- FastDeployStableDiffusionImageVariationPipeline
- from .pipeline_fastdeploy_stable_diffusion_img2img import \
- FastDeployStableDiffusionImg2ImgPipeline
- from .pipeline_fastdeploy_stable_diffusion_inpaint import \
- FastDeployStableDiffusionInpaintPipeline
- from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import \
- FastDeployStableDiffusionInpaintPipelineLegacy
- from .pipeline_fastdeploy_stable_diffusion_mega import \
- FastDeployStableDiffusionMegaPipeline
- from .pipeline_fastdeploy_stable_diffusion_upscale import \
- FastDeployStableDiffusionUpscalePipeline
+ from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline
+ from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline
+ from .pipeline_fastdeploy_stable_diffusion_controlnet import (
+ FastDeployStableDiffusionControlNetPipeline,
+ )
+ from .pipeline_fastdeploy_stable_diffusion_image_variation import (
+ FastDeployStableDiffusionImageVariationPipeline,
+ )
+ from .pipeline_fastdeploy_stable_diffusion_img2img import (
+ FastDeployStableDiffusionImg2ImgPipeline,
+ )
+ from .pipeline_fastdeploy_stable_diffusion_inpaint import (
+ FastDeployStableDiffusionInpaintPipeline,
+ )
+ from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import (
+ FastDeployStableDiffusionInpaintPipelineLegacy,
+ )
+ from .pipeline_fastdeploy_stable_diffusion_mega import (
+ FastDeployStableDiffusionMegaPipeline,
+ )
+ from .pipeline_fastdeploy_stable_diffusion_upscale import (
+ FastDeployStableDiffusionUpscalePipeline,
+ )
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index 1b9ac762bae8a..3f1cbee1f4454 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -22,21 +22,37 @@
import numpy as np
import requests
from paddlenlp.transformers import (
- BertTokenizer, CLIPFeatureExtractor, CLIPImageProcessor, CLIPTextModel,
- CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig,
- CLIPVisionModelWithProjection)
-
-from ...models import (AutoencoderKL, ControlNetModel, PriorTransformer,
- UNet2DConditionModel)
+ BertTokenizer,
+ CLIPFeatureExtractor,
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionConfig,
+ CLIPVisionModelWithProjection,
+)
+
+from ...models import (
+ AutoencoderKL,
+ ControlNetModel,
+ PriorTransformer,
+ UNet2DConditionModel,
+)
from ...schedulers import (
- DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, UnCLIPScheduler)
+ DDIMScheduler,
+ DDPMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UnCLIPScheduler,
+)
from ...utils import is_omegaconf_available, logging
from ...utils.import_utils import BACKENDS_MAPPING
from ...utils.load_utils import smart_load
-from ..latent_diffusion.pipeline_latent_diffusion import (LDMBertConfig,
- LDMBertModel)
+from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
from ..paint_by_example import PaintByExampleImageEncoder
from ..pipeline_utils import DiffusionPipeline
from .safety_checker import StableDiffusionSafetyChecker
@@ -70,8 +86,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("emb_layers.1", "time_emb_proj")
new_item = new_item.replace("skip_connection", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -87,8 +102,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -131,8 +145,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -140,21 +153,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
attention layers, and takes into account additional replacements that may arise.
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -162,13 +174,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = np.split(old_tensor, 3, axis=1)
checkpoint[path_map["query"]] = query.reshape(target_shape)
@@ -179,8 +189,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -190,8 +199,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -212,9 +220,7 @@ def conv_attn_to_linear(checkpoint):
checkpoint[key] = checkpoint[key][:, :, 0]
-def create_unet_diffusers_config(original_config,
- image_size: int,
- controlnet=False):
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
"""
Creates a config for the diffusers based on the config of the LDM model.
"""
@@ -225,34 +231,28 @@ def create_unet_diffusers_config(original_config,
vae_params = original_config.model.params.first_stage_config.params.ddconfig
- block_out_channels = [
- unet_params.model_channels * mult for mult in unet_params.channel_mult
- ]
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
down_block_types = []
resolution = 1
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnDownBlock2D"
- if resolution in unet_params.attention_resolutions else
- "DownBlock2D")
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
down_block_types.append(block_type)
if i != len(block_out_channels) - 1:
resolution *= 2
up_block_types = []
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnUpBlock2D"
- if resolution in unet_params.attention_resolutions else
- "UpBlock2D")
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
up_block_types.append(block_type)
resolution //= 2
- vae_scale_factor = 2**(len(vae_params.ch_mult) - 1)
+ vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
head_dim = unet_params.num_heads if "num_heads" in unet_params else None
- use_linear_projection = (unet_params.use_linear_in_transformer
- if "use_linear_in_transformer" in unet_params else
- False)
+ use_linear_projection = (
+ unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+ )
if use_linear_projection:
# stable diffusion 2-base-512 and 2-768
if head_dim is None:
@@ -267,9 +267,7 @@ def create_unet_diffusers_config(original_config,
assert "adm_in_channels" in unet_params
projection_class_embeddings_input_dim = unet_params.adm_in_channels
else:
- raise NotImplementedError(
- f"Unknown conditional unet num_classes config: {unet_params.num_classes}"
- )
+ raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}")
config = {
"sample_size": image_size // vae_scale_factor,
@@ -281,8 +279,7 @@ def create_unet_diffusers_config(original_config,
"attention_head_dim": head_dim,
"use_linear_projection": use_linear_projection,
"class_embed_type": class_embed_type,
- "projection_class_embeddings_input_dim":
- projection_class_embeddings_input_dim,
+ "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
}
if not controlnet:
@@ -328,7 +325,8 @@ def create_diffusers_schedular(original_config):
num_train_timesteps=original_config.model.params.timesteps,
beta_start=original_config.model.params.linear_start,
beta_end=original_config.model.params.linear_end,
- beta_schedule="scaled_linear", )
+ beta_schedule="scaled_linear",
+ )
return schedular
@@ -347,17 +345,19 @@ def create_ldm_bert_config(original_config):
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
- pad_token_id=0, )
+ pad_token_id=0,
+ )
return LDMBertConfig(**config)
def convert_ldm_unet_checkpoint(
- checkpoint,
- config,
- path=None,
- extract_ema=False,
- controlnet=False,
- no_unet_key=False, ):
+ checkpoint,
+ config,
+ path=None,
+ extract_ema=False,
+ controlnet=False,
+ no_unet_key=False,
+):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -384,8 +384,7 @@ def convert_ldm_unet_checkpoint(
for key in keys:
if key.startswith("model.diffusion_model"):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
if sum(k.startswith("model_ema") for k in keys) > 100:
print(
@@ -399,34 +398,23 @@ def convert_ldm_unet_checkpoint(
new_checkpoint = {}
- new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
- "time_embed.0.weight"]
- new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
- "time_embed.0.bias"]
- new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
- "time_embed.2.weight"]
- new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
- "time_embed.2.bias"]
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
if config["class_embed_type"] is None:
# No parameters to port
...
- elif (config["class_embed_type"] == "timestep" or
- config["class_embed_type"] == "projection"):
- new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict[
- "label_emb.0.0.weight"]
- new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict[
- "label_emb.0.0.bias"]
- new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict[
- "label_emb.0.2.weight"]
- new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict[
- "label_emb.0.2.bias"]
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+ new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+ new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+ new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+ new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
else:
- raise NotImplementedError(
- f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+ raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
- new_checkpoint["conv_in.weight"] = unet_state_dict[
- "input_blocks.0.0.weight"]
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
if not controlnet:
@@ -436,35 +424,23 @@ def convert_ldm_unet_checkpoint(
new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
# Retrieves the keys for the input blocks only
- num_input_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "input_blocks" in layer
- })
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
input_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
for layer_id in range(num_input_blocks)
}
# Retrieves the keys for the middle blocks only
- num_middle_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "middle_block" in layer
- })
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
middle_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
for layer_id in range(num_middle_blocks)
}
# Retrieves the keys for the output blocks only
- num_output_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "output_blocks" in layer
- })
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
output_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
for layer_id in range(num_output_blocks)
}
@@ -473,21 +449,17 @@ def convert_ldm_unet_checkpoint(
layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
resnets = [
- key for key in input_blocks[i]
- if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in
- key
- ]
- attentions = [
- key for key in input_blocks[i] if f"input_blocks.{i}.1" in key
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
]
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.weight")
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.bias")
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.weight"
+ )
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.bias"
+ )
paths = renew_resnet_paths(resnets)
meta_path = {
@@ -499,7 +471,8 @@ def convert_ldm_unet_checkpoint(
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if len(attentions):
paths = renew_attention_paths(attentions)
@@ -512,19 +485,18 @@ def convert_ldm_unet_checkpoint(
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
resnet_0 = middle_blocks[0]
attentions = middle_blocks[1]
resnet_1 = middle_blocks[2]
resnet_0_paths = renew_resnet_paths(resnet_0)
- assign_to_checkpoint(
- resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
resnet_1_paths = renew_resnet_paths(resnet_1)
- assign_to_checkpoint(
- resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
attentions_paths = renew_attention_paths(attentions)
meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
@@ -533,14 +505,13 @@ def convert_ldm_unet_checkpoint(
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
for i in range(num_output_blocks):
block_id = i // (config["layers_per_block"] + 1)
layer_in_block_id = i % (config["layers_per_block"] + 1)
- output_block_layers = [
- shave_segments(name, 2) for name in output_blocks[i]
- ]
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
output_block_list = {}
for layer in output_block_layers:
@@ -551,12 +522,8 @@ def convert_ldm_unet_checkpoint(
output_block_list[layer_id] = [layer_name]
if len(output_block_list) > 1:
- resnets = [
- key for key in output_blocks[i] if f"output_blocks.{i}.0" in key
- ]
- attentions = [
- key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
- ]
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
resnet_0_paths = renew_resnet_paths(resnets)
paths = renew_resnet_paths(resnets)
@@ -570,22 +537,19 @@ def convert_ldm_unet_checkpoint(
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- output_block_list = {
- k: sorted(v)
- for k, v in output_block_list.items()
- }
+ output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
if ["conv.bias", "conv.weight"] in output_block_list.values():
- index = list(output_block_list.values()).index(
- ["conv.bias", "conv.weight"])
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.weight"]
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.bias"]
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.weight"
+ ]
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.bias"
+ ]
# Clear attentions as they have been attributed above.
if len(attentions) == 2:
@@ -595,27 +559,28 @@ def convert_ldm_unet_checkpoint(
paths = renew_attention_paths(attentions)
meta_path = {
"old": f"output_blocks.{i}.1",
- "new":
- f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
}
assign_to_checkpoint(
paths,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
else:
- resnet_0_paths = renew_resnet_paths(
- output_block_layers, n_shave_prefix_segments=1)
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
for path in resnet_0_paths:
old_path = ".".join(["output_blocks", str(i), path["old"]])
- new_path = ".".join([
- "up_blocks",
- str(block_id),
- "resnets",
- str(layer_in_block_id),
- path["new"],
- ])
+ new_path = ".".join(
+ [
+ "up_blocks",
+ str(block_id),
+ "resnets",
+ str(layer_in_block_id),
+ path["new"],
+ ]
+ )
new_checkpoint[new_path] = unet_state_dict[old_path]
@@ -624,48 +589,42 @@ def convert_ldm_unet_checkpoint(
orig_index = 0
- new_checkpoint[
- "controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.weight")
- new_checkpoint[
- "controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.bias")
+ new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.weight"
+ )
+ new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.bias"
+ )
orig_index += 2
diffusers_index = 0
while diffusers_index < 6:
- new_checkpoint[
- f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.weight")
- new_checkpoint[
- f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.bias")
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.weight"
+ )
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.bias"
+ )
diffusers_index += 1
orig_index += 2
- new_checkpoint[
- "controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.weight")
- new_checkpoint[
- "controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.bias")
+ new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.weight"
+ )
+ new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.bias"
+ )
# down blocks
for i in range(num_input_blocks):
- new_checkpoint[
- f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(
- f"zero_convs.{i}.0.weight")
- new_checkpoint[
- f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(
- f"zero_convs.{i}.0.bias")
+ new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+ new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
# mid block
- new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop(
- "middle_block_out.0.weight")
- new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop(
- "middle_block_out.0.bias")
+ new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+ new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
return new_checkpoint
@@ -681,107 +640,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -789,58 +715,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -848,13 +766,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint):
import paddle.nn as nn
need_transpose = []
@@ -880,52 +798,56 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key)
new_checkpoint = {}
- new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[
- "transformer.token_emb.weight"]
- new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[
- "transformer.pos_emb.emb.weight"]
+ new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"]
+ new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"]
for i in range(config.encoder_layers):
double_i = 2 * i
double_i_plus1 = 2 * i + 1
# convert norm
new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.0.weight"]
+ f"transformer.attn_layers.layers.{double_i}.0.weight"
+ ]
new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.0.bias"]
-
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"]
+ f"transformer.attn_layers.layers.{double_i}.0.bias"
+ ]
+
+ new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"
+ ]
new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"]
+ f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"
+ ]
new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"]
+ f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"
+ ]
new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].T
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"
+ ].T
new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"]
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"
+ ]
new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].T
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"
+ ].T
new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].T
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"
+ ].T
- new_checkpoint["final_layer_norm.weight"] = bert_state_dict[
- "transformer.norm.weight"]
- new_checkpoint["final_layer_norm.bias"] = bert_state_dict[
- "transformer.norm.bias"]
+ new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"]
+ new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"]
ldmbert = LDMBertModel(config)
ldmbert.eval()
ldmbert.load_dict(new_checkpoint)
@@ -942,12 +864,10 @@ def convert_ldm_clip_checkpoint(checkpoint):
for key in keys:
if key.startswith("cond_stage_model.transformer"):
- text_model_dict[key[len(
- "cond_stage_model.transformer."):]] = checkpoint[key]
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
if len(text_model_dict) > 0:
- text_model.load_dict(
- CLIPTextModel.smart_convert(text_model_dict, text_model))
+ text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model))
return text_model
@@ -955,14 +875,14 @@ def convert_ldm_clip_checkpoint(checkpoint):
textenc_conversion_lst = [
(
"cond_stage_model.model.positional_embedding",
- "text_model.embeddings.position_embedding.weight", ),
+ "text_model.embeddings.position_embedding.weight",
+ ),
(
"cond_stage_model.model.token_embedding.weight",
- "text_model.embeddings.token_embedding.weight", ),
- ("cond_stage_model.model.ln_final.weight",
- "text_model.final_layer_norm.weight"),
- ("cond_stage_model.model.ln_final.bias",
- "text_model.final_layer_norm.bias"),
+ "text_model.embeddings.token_embedding.weight",
+ ),
+ ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
+ ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
]
textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
@@ -977,10 +897,12 @@ def convert_ldm_clip_checkpoint(checkpoint):
("ln_final.", "transformer.text_model.final_layer_norm."),
(
"token_embedding.weight",
- "transformer.text_model.embeddings.token_embedding.weight", ),
+ "transformer.text_model.embeddings.token_embedding.weight",
+ ),
(
"positional_embedding",
- "transformer.text_model.embeddings.position_embedding.weight", ),
+ "transformer.text_model.embeddings.position_embedding.weight",
+ ),
]
protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
textenc_pattern = re.compile("|".join(protected.keys()))
@@ -997,12 +919,11 @@ def convert_paint_by_example_checkpoint(checkpoint):
for key in keys:
if key.startswith("cond_stage_model.transformer"):
- model_dict[key[len("cond_stage_model.transformer."):]] = checkpoint[
- key]
+ model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
# load mapper
keys_mapper = {
- k[len("cond_stage_model.mapper.res"):]: v
+ k[len("cond_stage_model.mapper.res") :]: v
for k, v in checkpoint.items()
if k.startswith("cond_stage_model.mapper")
}
@@ -1017,7 +938,7 @@ def convert_paint_by_example_checkpoint(checkpoint):
}
for key, value in keys_mapper.items():
- prefix = key[:len("blocks.i")]
+ prefix = key[: len("blocks.i")]
suffix = key.split(prefix)[-1].split(".")[-1]
name = key.split(prefix)[-1].split(suffix)[0][1:-1]
mapped_names = MAPPING[name]
@@ -1026,13 +947,11 @@ def convert_paint_by_example_checkpoint(checkpoint):
for i, mapped_name in enumerate(mapped_names):
new_name = ".".join([prefix, mapped_name, suffix])
shape = value.shape[0] // num_splits
- model_dict[new_name] = value[i * shape:(i + 1) * shape]
+ model_dict[new_name] = value[i * shape : (i + 1) * shape]
# load final layer norm
- model_dict["final_layer_norm.bias"] = checkpoint[
- "cond_stage_model.final_ln.bias"]
- model_dict["final_layer_norm.weight"] = checkpoint[
- "cond_stage_model.final_ln.bias"]
+ model_dict["final_layer_norm.bias"] = checkpoint["cond_stage_model.final_ln.bias"]
+ model_dict["final_layer_norm.weight"] = checkpoint["cond_stage_model.final_ln.bias"]
# load proj_out
model_dict["proj_out.bias"] = checkpoint["proj_out.bias"]
@@ -1042,64 +961,50 @@ def convert_paint_by_example_checkpoint(checkpoint):
model_dict["uncond_vector"] = checkpoint["learnable_vector"]
if len(model_dict) > 0:
- model.load_dict(
- PaintByExampleImageEncoder.smart_convert(model_dict, model))
+ model.load_dict(PaintByExampleImageEncoder.smart_convert(model_dict, model))
return model
def convert_open_clip_checkpoint(checkpoint):
- text_model = CLIPTextModel.from_pretrained(
- "stabilityai/stable-diffusion-2", subfolder="text_encoder")
+ text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
text_model.eval()
keys = list(checkpoint.keys())
text_model_dict = {}
if "cond_stage_model.model.text_projection" in checkpoint:
- d_model = int(checkpoint["cond_stage_model.model.text_projection"]
- .shape[0])
+ d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
else:
d_model = 1024
# text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
for key in keys:
- if ("resblocks.23" in
- key): # Diffusers drops the final layer and only uses the penultimate layer
+ if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer
continue
if key in textenc_conversion_map:
text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
if key.startswith("cond_stage_model.model.transformer."):
- new_key = key[len("cond_stage_model.model.transformer."):]
+ new_key = key[len("cond_stage_model.model.transformer.") :]
if new_key.endswith(".in_proj_weight"):
- new_key = new_key[:-len(".in_proj_weight")]
- new_key = textenc_pattern.sub(
- lambda m: protected[re.escape(m.group(0))], new_key)
- text_model_dict[new_key + ".q_proj.weight"] = checkpoint[
- key][:d_model, :]
- text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][
- d_model:d_model * 2, :]
- text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][
- d_model * 2:, :]
+ new_key = new_key[: -len(".in_proj_weight")]
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+ text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+ text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+ text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
elif new_key.endswith(".in_proj_bias"):
- new_key = new_key[:-len(".in_proj_bias")]
- new_key = textenc_pattern.sub(
- lambda m: protected[re.escape(m.group(0))], new_key)
- text_model_dict[new_key + ".q_proj.bias"] = checkpoint[
- key][:d_model]
- text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][
- d_model:d_model * 2]
- text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][
- d_model * 2:]
+ new_key = new_key[: -len(".in_proj_bias")]
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+ text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+ text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+ text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
else:
- new_key = textenc_pattern.sub(
- lambda m: protected[re.escape(m.group(0))], new_key)
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
text_model_dict[new_key] = checkpoint[key]
if len(text_model_dict) > 0:
- text_model.load_dict(
- CLIPTextModel.smart_convert(text_model_dict, text_model))
+ text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model))
return text_model
@@ -1121,17 +1026,13 @@ def stable_unclip_image_encoder(original_config):
if clip_model_name == "ViT-L/14":
feature_extractor = CLIPImageProcessor()
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(
- "openai/clip-vit-large-patch14")
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
else:
- raise NotImplementedError(
- f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}"
- )
+ raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
feature_extractor = CLIPImageProcessor()
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(
- "laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
else:
raise NotImplementedError(
f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
@@ -1141,8 +1042,9 @@ def stable_unclip_image_encoder(original_config):
def stable_unclip_image_noising_components(
- original_config,
- clip_stats_path: Optional[str]=None, ):
+ original_config,
+ clip_stats_path: Optional[str] = None,
+):
"""
Returns the noising components for the img2img and txt2img unclip pipelines.
@@ -1162,15 +1064,12 @@ def stable_unclip_image_noising_components(
max_noise_level = noise_aug_config.noise_schedule_config.timesteps
beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
- image_normalizer = StableUnCLIPImageNormalizer(
- embedding_dim=embedding_dim)
- image_noising_scheduler = DDPMScheduler(
- num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
+ image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
+ image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
if "clip_stats_path" in noise_aug_config:
if clip_stats_path is None:
- raise ValueError(
- "This stable unclip config requires a `clip_stats_path`")
+ raise ValueError("This stable unclip config requires a `clip_stats_path`")
from ...utils import torch_load
@@ -1189,22 +1088,21 @@ def stable_unclip_image_noising_components(
image_normalizer.load_dict(clip_stats_state_dict)
else:
- raise NotImplementedError(
- f"Unknown noise augmentor class: {noise_aug_class}")
+ raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
image_normalizer.eval()
return image_normalizer, image_noising_scheduler
def convert_controlnet_checkpoint(
- checkpoint,
- original_config,
- checkpoint_path,
- image_size,
- upcast_attention,
- extract_ema,
- no_unet_key=False, ):
- ctrlnet_config = create_unet_diffusers_config(
- original_config, image_size=image_size, controlnet=True)
+ checkpoint,
+ original_config,
+ checkpoint_path,
+ image_size,
+ upcast_attention,
+ extract_ema,
+ no_unet_key=False,
+):
+ ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
ctrlnet_config["upcast_attention"] = upcast_attention
ctrlnet_config.pop("sample_size")
@@ -1217,33 +1115,33 @@ def convert_controlnet_checkpoint(
path=checkpoint_path,
extract_ema=extract_ema,
controlnet=True,
- no_unet_key=no_unet_key, )
+ no_unet_key=no_unet_key,
+ )
- controlnet_model.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model,
- converted_ctrl_checkpoint))
+ controlnet_model.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint))
controlnet_model.eval()
return controlnet_model
def download_from_original_stable_diffusion_ckpt(
- checkpoint_path: str,
- original_config_file: str=None,
- image_size: int=512,
- prediction_type: str=None,
- model_type: str=None,
- extract_ema: bool=False,
- scheduler_type: str="pndm",
- num_in_channels: Optional[int]=None,
- upcast_attention: Optional[bool]=None,
- stable_unclip: Optional[str]=None,
- stable_unclip_prior: Optional[str]=None,
- clip_stats_path: Optional[str]=None,
- controlnet: Optional[bool]=None,
- load_safety_checker: bool=True,
- pipeline_class: DiffusionPipeline=None,
- paddle_dtype=None,
- **kwargs, ) -> DiffusionPipeline:
+ checkpoint_path: str,
+ original_config_file: str = None,
+ image_size: int = 512,
+ prediction_type: str = None,
+ model_type: str = None,
+ extract_ema: bool = False,
+ scheduler_type: str = "pndm",
+ num_in_channels: Optional[int] = None,
+ upcast_attention: Optional[bool] = None,
+ stable_unclip: Optional[str] = None,
+ stable_unclip_prior: Optional[str] = None,
+ clip_stats_path: Optional[str] = None,
+ controlnet: Optional[bool] = None,
+ load_safety_checker: bool = True,
+ pipeline_class: DiffusionPipeline = None,
+ paddle_dtype=None,
+ **kwargs,
+) -> DiffusionPipeline:
"""
Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
config file.
@@ -1288,10 +1186,14 @@ def download_from_original_stable_diffusion_ckpt(
"""
# import pipelines here to avoid circular import error when using from_ckpt method
- from ppdiffusers import (LDMTextToImagePipeline, PaintByExamplePipeline,
- StableDiffusionControlNetPipeline,
- StableDiffusionPipeline,
- StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline)
+ from ppdiffusers import (
+ LDMTextToImagePipeline,
+ PaintByExamplePipeline,
+ StableDiffusionControlNetPipeline,
+ StableDiffusionPipeline,
+ StableUnCLIPImg2ImgPipeline,
+ StableUnCLIPPipeline,
+ )
if pipeline_class is None or pipeline_class.__name__ == "DiffusionPipeline":
pipeline_class = StableDiffusionPipeline
@@ -1304,8 +1206,7 @@ def download_from_original_stable_diffusion_ckpt(
from omegaconf import OmegaConf
- checkpoint = smart_load(
- checkpoint_path, return_numpy=True, return_global_step=True)
+ checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True)
# NOTE: this while loop isn't great but this controlnet checkpoint has one additional
# "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
@@ -1347,11 +1248,12 @@ def download_from_original_stable_diffusion_ckpt(
original_config = OmegaConf.load(original_config_file)
if num_in_channels is not None:
- original_config["model"]["params"]["unet_config"]["params"][
- "in_channels"] = num_in_channels
+ original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
- if ("parameterization" in original_config["model"]["params"] and
- original_config["model"]["params"]["parameterization"] == "v"):
+ if (
+ "parameterization" in original_config["model"]["params"]
+ and original_config["model"]["params"]["parameterization"] == "v"
+ ):
if prediction_type is None:
# NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
# as it relies on a brittle global step parameter here
@@ -1376,7 +1278,8 @@ def download_from_original_stable_diffusion_ckpt(
checkpoint_path,
image_size,
upcast_attention,
- extract_ema, )
+ extract_ema,
+ )
num_train_timesteps = original_config.model.params.timesteps
beta_start = original_config.model.params.linear_start
beta_end = original_config.model.params.linear_end
@@ -1389,7 +1292,8 @@ def download_from_original_stable_diffusion_ckpt(
steps_offset=1,
clip_sample=False,
set_alpha_to_one=False,
- prediction_type=prediction_type, )
+ prediction_type=prediction_type,
+ )
# make sure scheduler works correctly with DDIM
scheduler.register_to_config(clip_sample=False)
@@ -1404,8 +1308,7 @@ def download_from_original_stable_diffusion_ckpt(
elif scheduler_type == "euler":
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
elif scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- scheduler.config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
elif scheduler_type == "dpm":
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
elif scheduler_type == "ddim":
@@ -1414,40 +1317,31 @@ def download_from_original_stable_diffusion_ckpt(
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
# Convert the UNet2DConditionModel model.
- unet_config = create_unet_diffusers_config(
- original_config, image_size=image_size)
+ unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
unet_config["upcast_attention"] = upcast_attention
unet = UNet2DConditionModel(**unet_config)
unet.eval()
converted_unet_checkpoint = convert_ldm_unet_checkpoint(
- checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema)
- unet.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(unet,
- converted_unet_checkpoint))
+ checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
+ )
+ unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint))
# Convert the VAE model.
- vae_config = create_vae_diffusers_config(
- original_config, image_size=image_size)
- converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint,
- vae_config)
+ vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL(**vae_config)
vae.eval()
- vae.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(vae,
- converted_vae_checkpoint))
+ vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint))
# Convert the text model.
if model_type is None:
- model_type = original_config.model.params.cond_stage_config.target.split(
- ".")[-1]
- logger.debug(
- f"no `model_type` given, `model_type` inferred as: {model_type}")
+ model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+ logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
if model_type == "FrozenOpenCLIPEmbedder":
text_model = convert_open_clip_checkpoint(checkpoint)
- tokenizer = CLIPTokenizer.from_pretrained(
- "stabilityai/stable-diffusion-2/tokenizer")
+ tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer")
if stable_unclip is None:
if controlnet:
@@ -1460,7 +1354,8 @@ def download_from_original_stable_diffusion_ckpt(
controlnet=controlnet_model,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
else:
pipe = pipeline_class(
vae=vae,
@@ -1470,18 +1365,16 @@ def download_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
else:
- (
- image_normalizer,
- image_noising_scheduler,
- ) = stable_unclip_image_noising_components(
+ (image_normalizer, image_noising_scheduler,) = stable_unclip_image_noising_components(
original_config,
- clip_stats_path=clip_stats_path, )
+ clip_stats_path=clip_stats_path,
+ )
if stable_unclip == "img2img":
- feature_extractor, image_encoder = stable_unclip_image_encoder(
- original_config)
+ feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
pipe = StableUnCLIPImg2ImgPipeline(
# image encoding components
@@ -1496,26 +1389,20 @@ def download_from_original_stable_diffusion_ckpt(
unet=unet,
scheduler=scheduler,
# vae
- vae=vae, )
+ vae=vae,
+ )
elif stable_unclip == "txt2img":
if stable_unclip_prior is None or stable_unclip_prior == "karlo":
karlo_model = "kakaobrain/karlo-v1-alpha"
- prior = PriorTransformer.from_pretrained(
- karlo_model, subfolder="prior")
-
- prior_tokenizer = CLIPTokenizer.from_pretrained(
- "openai/clip-vit-large-patch14")
- prior_text_model = CLIPTextModelWithProjection.from_pretrained(
- "openai/clip-vit-large-patch14")
-
- prior_scheduler = UnCLIPScheduler.from_pretrained(
- karlo_model, subfolder="prior_scheduler")
- prior_scheduler = DDPMScheduler.from_config(
- prior_scheduler.config)
+ prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior")
+
+ prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+ prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+
+ prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler")
+ prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
else:
- raise NotImplementedError(
- f"unknown prior for stable unclip model: {stable_unclip_prior}"
- )
+ raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
pipe = StableUnCLIPPipeline(
# prior components
@@ -1532,33 +1419,29 @@ def download_from_original_stable_diffusion_ckpt(
unet=unet,
scheduler=scheduler,
# vae
- vae=vae, )
+ vae=vae,
+ )
else:
- raise NotImplementedError(
- f"unknown `stable_unclip` type: {stable_unclip}")
+ raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
elif model_type == "PaintByExample":
vision_model = convert_paint_by_example_checkpoint(checkpoint)
- tokenizer = CLIPTokenizer.from_pretrained(
- "openai/clip-vit-large-patch14")
- feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-safety-checker")
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+ feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
pipe = PaintByExamplePipeline(
vae=vae,
image_encoder=vision_model,
unet=unet,
scheduler=scheduler,
safety_checker=None,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
elif model_type == "FrozenCLIPEmbedder":
text_model = convert_ldm_clip_checkpoint(checkpoint)
- tokenizer = CLIPTokenizer.from_pretrained(
- "openai/clip-vit-large-patch14")
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
if load_safety_checker:
- safety_checker = StableDiffusionSafetyChecker.from_pretrained(
- "CompVis/stable-diffusion-safety-checker")
- feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-safety-checker")
+ safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+ feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
else:
safety_checker = None
feature_extractor = None
@@ -1573,7 +1456,8 @@ def download_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- requires_safety_checker=load_safety_checker, )
+ requires_safety_checker=load_safety_checker,
+ )
else:
pipe = pipeline_class(
vae=vae,
@@ -1583,19 +1467,20 @@ def download_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- requires_safety_checker=load_safety_checker, )
+ requires_safety_checker=load_safety_checker,
+ )
else:
text_config = create_ldm_bert_config(original_config)
text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
- tokenizer = BertTokenizer.from_pretrained(
- "bert-base-uncased", model_max_length=77)
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77)
pipe = LDMTextToImagePipeline(
vqvae=vae,
bert=text_model,
tokenizer=tokenizer,
unet=unet,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
if paddle_dtype is not None:
pipe.to(paddle_dtype=paddle_dtype)
@@ -1603,13 +1488,14 @@ def download_from_original_stable_diffusion_ckpt(
def download_controlnet_from_original_ckpt(
- checkpoint_path: str,
- original_config_file: str,
- image_size: int=512,
- extract_ema: bool=False,
- num_in_channels: Optional[int]=None,
- upcast_attention: Optional[bool]=None,
- no_unet_key: Optional[bool]=False, ) -> DiffusionPipeline:
+ checkpoint_path: str,
+ original_config_file: str,
+ image_size: int = 512,
+ extract_ema: bool = False,
+ num_in_channels: Optional[int] = None,
+ upcast_attention: Optional[bool] = None,
+ no_unet_key: Optional[bool] = False,
+) -> DiffusionPipeline:
if not is_omegaconf_available():
raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
@@ -1636,12 +1522,10 @@ def download_controlnet_from_original_ckpt(
original_config = OmegaConf.load(original_config_file)
if num_in_channels is not None:
- original_config["model"]["params"]["unet_config"]["params"][
- "in_channels"] = num_in_channels
+ original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
if "control_stage_config" not in original_config.model.params:
- raise ValueError(
- "`control_stage_config` not present in original config")
+ raise ValueError("`control_stage_config` not present in original config")
controlnet_model = convert_controlnet_checkpoint(
checkpoint,
@@ -1650,6 +1534,7 @@ def download_controlnet_from_original_ckpt(
image_size,
upcast_attention,
extract_ema,
- no_unet_key, )
+ no_unet_key,
+ )
return controlnet_model
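
Note on the hunks above: they only reflow `convert_from_ckpt.py`, so the public conversion entry points keep their signatures and behavior. As a quick orientation, the sketch below shows how the reformatted `download_from_original_stable_diffusion_ckpt` would typically be called. It is an illustrative example only; the import path is inferred from the file paths in this diff, and the checkpoint filename, YAML config name, and output directory are placeholder assumptions, not part of this PR.

    # Assumed module path, matching ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py
    from ppdiffusers.pipelines.stable_diffusion.convert_from_ckpt import (
        download_from_original_stable_diffusion_ckpt,
    )

    # Placeholder inputs: point these at a real CompVis-style checkpoint and its matching LDM yaml config.
    pipe = download_from_original_stable_diffusion_ckpt(
        checkpoint_path="v1-5-pruned-emaonly.safetensors",
        original_config_file="v1-inference.yaml",
        scheduler_type="ddim",
        extract_ema=True,
        load_safety_checker=False,
    )

    # Save the converted pipeline in the ppdiffusers layout for later from_pretrained loading.
    pipe.save_pretrained("./sd15-ppdiffusers")
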
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py
index fc8dfda8a0781..4a8c6336fd55d 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py
@@ -20,17 +20,32 @@
import numpy as np
import requests
-from paddlenlp.transformers import (BertTokenizer, CLIPFeatureExtractor,
- CLIPTextModel, CLIPTokenizer)
+from paddlenlp.transformers import (
+ BertTokenizer,
+ CLIPFeatureExtractor,
+ CLIPTextModel,
+ CLIPTokenizer,
+)
from ppdiffusers import (
- AutoencoderKL, ControlNetModel, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, LDMTextToImagePipeline, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionControlNetPipeline, StableDiffusionPipeline,
- UNet2DConditionModel)
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ LDMTextToImagePipeline,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionControlNetPipeline,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import (
- LDMBertConfig, LDMBertModel)
+ LDMBertConfig,
+ LDMBertModel,
+)
from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from ...utils import is_omegaconf_available, logging
@@ -65,8 +80,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("emb_layers.1", "time_emb_proj")
new_item = new_item.replace("skip_connection", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -82,8 +96,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -126,8 +139,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -135,21 +147,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
attention layers, and takes into account additional replacements that may arise.
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -157,13 +168,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = np.split(old_tensor, 3, axis=1)
checkpoint[path_map["query"]] = query.reshape(target_shape)
@@ -174,8 +183,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -185,8 +193,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -207,9 +214,7 @@ def conv_attn_to_linear(checkpoint):
checkpoint[key] = checkpoint[key][:, :, 0]
-def create_unet_diffusers_config(original_config,
- image_size: int,
- controlnet=False):
+def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
"""
Creates a config for the diffusers based on the config of the LDM model.
"""
@@ -220,34 +225,28 @@ def create_unet_diffusers_config(original_config,
vae_params = original_config.model.params.first_stage_config.params.ddconfig
- block_out_channels = [
- unet_params.model_channels * mult for mult in unet_params.channel_mult
- ]
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
down_block_types = []
resolution = 1
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnDownBlock2D"
- if resolution in unet_params.attention_resolutions else
- "DownBlock2D")
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
down_block_types.append(block_type)
if i != len(block_out_channels) - 1:
resolution *= 2
up_block_types = []
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnUpBlock2D"
- if resolution in unet_params.attention_resolutions else
- "UpBlock2D")
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
up_block_types.append(block_type)
resolution //= 2
- vae_scale_factor = 2**(len(vae_params.ch_mult) - 1)
+ vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
head_dim = unet_params.num_heads if "num_heads" in unet_params else None
- use_linear_projection = (unet_params.use_linear_in_transformer
- if "use_linear_in_transformer" in unet_params else
- False)
+ use_linear_projection = (
+ unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+ )
if use_linear_projection:
# stable diffusion 2-base-512 and 2-768
if head_dim is None:
@@ -262,9 +261,7 @@ def create_unet_diffusers_config(original_config,
assert "adm_in_channels" in unet_params
projection_class_embeddings_input_dim = unet_params.adm_in_channels
else:
- raise NotImplementedError(
- f"Unknown conditional unet num_classes config: {unet_params.num_classes}"
- )
+ raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}")
config = dict(
sample_size=image_size // vae_scale_factor,
@@ -304,7 +301,8 @@ def create_vae_diffusers_config(original_config, image_size: int):
up_block_types=tuple(up_block_types),
block_out_channels=tuple(block_out_channels),
latent_channels=vae_params.z_channels,
- layers_per_block=vae_params.num_res_blocks, )
+ layers_per_block=vae_params.num_res_blocks,
+ )
return config
@@ -330,15 +328,12 @@ def create_ldm_bert_config(original_config):
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
- pad_token_id=0, )
+ pad_token_id=0,
+ )
return LDMBertConfig(**config)
-def convert_ldm_unet_checkpoint(checkpoint,
- config,
- path=None,
- extract_ema=False,
- controlnet=False):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -362,8 +357,7 @@ def convert_ldm_unet_checkpoint(checkpoint,
for key in keys:
if key.startswith(unet_key[:-1]):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
if sum(k.startswith("model_ema") for k in keys) > 100:
print(
@@ -377,34 +371,23 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint = {}
- new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
- "time_embed.0.weight"]
- new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
- "time_embed.0.bias"]
- new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
- "time_embed.2.weight"]
- new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
- "time_embed.2.bias"]
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
if config["class_embed_type"] is None:
# No parameters to port
...
- elif (config["class_embed_type"] == "timestep" or
- config["class_embed_type"] == "projection"):
- new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict[
- "label_emb.0.0.weight"]
- new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict[
- "label_emb.0.0.bias"]
- new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict[
- "label_emb.0.2.weight"]
- new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict[
- "label_emb.0.2.bias"]
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+ new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+ new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+ new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+ new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
else:
- raise NotImplementedError(
- f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+ raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
- new_checkpoint["conv_in.weight"] = unet_state_dict[
- "input_blocks.0.0.weight"]
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
if not controlnet:
@@ -414,35 +397,23 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
# Retrieves the keys for the input blocks only
- num_input_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "input_blocks" in layer
- })
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
input_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
for layer_id in range(num_input_blocks)
}
# Retrieves the keys for the middle blocks only
- num_middle_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "middle_block" in layer
- })
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
middle_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
for layer_id in range(num_middle_blocks)
}
# Retrieves the keys for the output blocks only
- num_output_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "output_blocks" in layer
- })
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
output_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
for layer_id in range(num_output_blocks)
}
@@ -451,21 +422,17 @@ def convert_ldm_unet_checkpoint(checkpoint,
layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
resnets = [
- key for key in input_blocks[i]
- if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in
- key
- ]
- attentions = [
- key for key in input_blocks[i] if f"input_blocks.{i}.1" in key
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
]
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.weight")
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.bias")
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.weight"
+ )
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.bias"
+ )
paths = renew_resnet_paths(resnets)
meta_path = {
@@ -477,7 +444,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if len(attentions):
paths = renew_attention_paths(attentions)
@@ -490,19 +458,18 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
resnet_0 = middle_blocks[0]
attentions = middle_blocks[1]
resnet_1 = middle_blocks[2]
resnet_0_paths = renew_resnet_paths(resnet_0)
- assign_to_checkpoint(
- resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
resnet_1_paths = renew_resnet_paths(resnet_1)
- assign_to_checkpoint(
- resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
attentions_paths = renew_attention_paths(attentions)
meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
@@ -511,14 +478,13 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
for i in range(num_output_blocks):
block_id = i // (config["layers_per_block"] + 1)
layer_in_block_id = i % (config["layers_per_block"] + 1)
- output_block_layers = [
- shave_segments(name, 2) for name in output_blocks[i]
- ]
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
output_block_list = {}
for layer in output_block_layers:
@@ -529,12 +495,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
output_block_list[layer_id] = [layer_name]
if len(output_block_list) > 1:
- resnets = [
- key for key in output_blocks[i] if f"output_blocks.{i}.0" in key
- ]
- attentions = [
- key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
- ]
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
resnet_0_paths = renew_resnet_paths(resnets)
paths = renew_resnet_paths(resnets)
@@ -548,22 +510,19 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- output_block_list = {
- k: sorted(v)
- for k, v in output_block_list.items()
- }
+ output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
if ["conv.bias", "conv.weight"] in output_block_list.values():
- index = list(output_block_list.values()).index(
- ["conv.bias", "conv.weight"])
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.weight"]
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.bias"]
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.weight"
+ ]
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.bias"
+ ]
# Clear attentions as they have been attributed above.
if len(attentions) == 2:
@@ -573,27 +532,28 @@ def convert_ldm_unet_checkpoint(checkpoint,
paths = renew_attention_paths(attentions)
meta_path = {
"old": f"output_blocks.{i}.1",
- "new":
- f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
}
assign_to_checkpoint(
paths,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
else:
- resnet_0_paths = renew_resnet_paths(
- output_block_layers, n_shave_prefix_segments=1)
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
for path in resnet_0_paths:
old_path = ".".join(["output_blocks", str(i), path["old"]])
- new_path = ".".join([
- "up_blocks",
- str(block_id),
- "resnets",
- str(layer_in_block_id),
- path["new"],
- ])
+ new_path = ".".join(
+ [
+ "up_blocks",
+ str(block_id),
+ "resnets",
+ str(layer_in_block_id),
+ path["new"],
+ ]
+ )
new_checkpoint[new_path] = unet_state_dict[old_path]
@@ -602,48 +562,42 @@ def convert_ldm_unet_checkpoint(checkpoint,
orig_index = 0
- new_checkpoint[
- "controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.weight")
- new_checkpoint[
- "controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.bias")
+ new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.weight"
+ )
+ new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.bias"
+ )
orig_index += 2
diffusers_index = 0
while diffusers_index < 6:
- new_checkpoint[
- f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.weight")
- new_checkpoint[
- f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.bias")
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.weight"
+ )
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.bias"
+ )
diffusers_index += 1
orig_index += 2
- new_checkpoint[
- "controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.weight")
- new_checkpoint[
- "controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
- f"input_hint_block.{orig_index}.bias")
+ new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.weight"
+ )
+ new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+ f"input_hint_block.{orig_index}.bias"
+ )
# down blocks
for i in range(num_input_blocks):
- new_checkpoint[
- f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(
- f"zero_convs.{i}.0.weight")
- new_checkpoint[
- f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(
- f"zero_convs.{i}.0.bias")
+ new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+ new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
# mid block
- new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop(
- "middle_block_out.0.weight")
- new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop(
- "middle_block_out.0.bias")
+ new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+ new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
return new_checkpoint
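The ControlNet-specific renaming above is essentially positional: each zero_convs.{i}.0.* tensor becomes controlnet_down_blocks.{i}.*, and middle_block_out.0.* becomes controlnet_mid_block.*. A toy rename with hypothetical keys (not part of the patch):

# Toy illustration of the positional ControlNet key renaming above.
old_keys = ["zero_convs.0.0.weight", "zero_convs.1.0.weight", "middle_block_out.0.weight"]
new_keys = []
for k in old_keys:
    if k.startswith("zero_convs."):
        idx = k.split(".")[1]
        new_keys.append(f"controlnet_down_blocks.{idx}.weight")
    else:
        new_keys.append("controlnet_mid_block.weight")
# new_keys == ["controlnet_down_blocks.0.weight", "controlnet_down_blocks.1.weight", "controlnet_mid_block.weight"]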
@@ -659,107 +613,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -767,58 +688,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -826,13 +739,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
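A note on the next function: Paddle's nn.Linear stores its weight as [in_features, out_features], while the converted (PyTorch-style) checkpoints store [out_features, in_features], so every linear weight has to be transposed on the way in; that is what the need_transpose list collects. A minimal sketch with assumed shapes:

# Minimal sketch of the transpose applied to linear weights for Paddle.
import numpy as np

torch_style_weight = np.zeros((768, 320))   # [out_features, in_features]
paddle_style_weight = torch_style_weight.T  # [in_features, out_features], as paddle.nn.Linear expects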
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint):
import paddle.nn as nn
need_transpose = []
@@ -858,52 +771,56 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key)
new_checkpoint = {}
- new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[
- "transformer.token_emb.weight"]
- new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[
- "transformer.pos_emb.emb.weight"]
+ new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"]
+ new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"]
for i in range(config.encoder_layers):
double_i = 2 * i
double_i_plus1 = 2 * i + 1
# convert norm
new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.0.weight"]
+ f"transformer.attn_layers.layers.{double_i}.0.weight"
+ ]
new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.0.bias"]
-
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].T
- new_checkpoint[
- f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"]
+ f"transformer.attn_layers.layers.{double_i}.0.bias"
+ ]
+
+ new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"
+ ].T
+ new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[
+ f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"
+ ]
new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"]
+ f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"
+ ]
new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"]
+ f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"
+ ]
new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].T
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"
+ ].T
new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"]
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"
+ ]
new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].T
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"
+ ].T
new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[
- f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].T
+ f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"
+ ].T
- new_checkpoint["final_layer_norm.weight"] = bert_state_dict[
- "transformer.norm.weight"]
- new_checkpoint["final_layer_norm.bias"] = bert_state_dict[
- "transformer.norm.bias"]
+ new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"]
+ new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"]
ldmbert = LDMBertModel(config)
ldmbert.eval()
ldmbert.load_dict(new_checkpoint)
@@ -911,8 +828,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
def convert_ldm_clip_checkpoint(checkpoint):
- text_model = CLIPTextModel.from_pretrained(
- "CompVis/stable-diffusion-v1-4", subfolder="text_encoder")
+ text_model = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder")
text_model.eval()
keys = list(checkpoint.keys())
@@ -921,12 +837,10 @@ def convert_ldm_clip_checkpoint(checkpoint):
for key in keys:
if key.startswith("cond_stage_model.transformer"):
- text_model_dict[key[len(
- "cond_stage_model.transformer."):]] = checkpoint[key]
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
if len(text_model_dict) > 0:
- text_model.load_dict(
- CLIPTextModel.smart_convert(text_model_dict, text_model))
+ text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model))
return text_model
@@ -934,14 +848,14 @@ def convert_ldm_clip_checkpoint(checkpoint):
textenc_conversion_lst = [
(
"cond_stage_model.model.positional_embedding",
- "text_model.embeddings.position_embedding.weight", ),
+ "text_model.embeddings.position_embedding.weight",
+ ),
(
"cond_stage_model.model.token_embedding.weight",
- "text_model.embeddings.token_embedding.weight", ),
- ("cond_stage_model.model.ln_final.weight",
- "text_model.final_layer_norm.weight"),
- ("cond_stage_model.model.ln_final.bias",
- "text_model.final_layer_norm.bias"),
+ "text_model.embeddings.token_embedding.weight",
+ ),
+ ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
+ ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
]
textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
@@ -956,83 +870,73 @@ def convert_ldm_clip_checkpoint(checkpoint):
("ln_final.", "transformer.text_model.final_layer_norm."),
(
"token_embedding.weight",
- "transformer.text_model.embeddings.token_embedding.weight", ),
+ "transformer.text_model.embeddings.token_embedding.weight",
+ ),
(
"positional_embedding",
- "transformer.text_model.embeddings.position_embedding.weight", ),
+ "transformer.text_model.embeddings.position_embedding.weight",
+ ),
]
protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
textenc_pattern = re.compile("|".join(protected.keys()))
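The protected/textenc_pattern pair above performs all prefix renamings in a single regex pass; roughly, with an illustrative key that is not read from a checkpoint:

# Illustrative single-pass renaming using the protected/textenc_pattern idiom above.
import re

protected = {re.escape("ln_final."): "transformer.text_model.final_layer_norm."}
pattern = re.compile("|".join(protected.keys()))
new_key = pattern.sub(lambda m: protected[re.escape(m.group(0))], "ln_final.weight")
# new_key == "transformer.text_model.final_layer_norm.weight"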
def convert_open_clip_checkpoint(checkpoint):
- text_model = CLIPTextModel.from_pretrained(
- "stabilityai/stable-diffusion-2", subfolder="text_encoder")
+ text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
text_model.eval()
keys = list(checkpoint.keys())
text_model_dict = {}
if "cond_stage_model.model.text_projection" in checkpoint:
- d_model = int(checkpoint["cond_stage_model.model.text_projection"]
- .shape[0])
+ d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
else:
d_model = 1024
for key in keys:
- if ("resblocks.23" in
- key): # Diffusers drops the final layer and only uses the penultimate layer
+ if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer
continue
if key in textenc_conversion_map:
text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
if key.startswith("cond_stage_model.model.transformer."):
- new_key = key[len("cond_stage_model.model.transformer."):]
+ new_key = key[len("cond_stage_model.model.transformer.") :]
if new_key.endswith(".in_proj_weight"):
- new_key = new_key[:-len(".in_proj_weight")]
- new_key = textenc_pattern.sub(
- lambda m: protected[re.escape(m.group(0))], new_key)
- text_model_dict[new_key + ".q_proj.weight"] = checkpoint[
- key][:d_model, :]
- text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][
- d_model:d_model * 2, :]
- text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][
- d_model * 2:, :]
+ new_key = new_key[: -len(".in_proj_weight")]
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+ text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+ text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+ text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
elif new_key.endswith(".in_proj_bias"):
- new_key = new_key[:-len(".in_proj_bias")]
- new_key = textenc_pattern.sub(
- lambda m: protected[re.escape(m.group(0))], new_key)
- text_model_dict[new_key + ".q_proj.bias"] = checkpoint[
- key][:d_model]
- text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][
- d_model:d_model * 2]
- text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][
- d_model * 2:]
+ new_key = new_key[: -len(".in_proj_bias")]
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+ text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+ text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+ text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
else:
- new_key = textenc_pattern.sub(
- lambda m: protected[re.escape(m.group(0))], new_key)
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
text_model_dict[new_key] = checkpoint[key]
if len(text_model_dict) > 0:
- text_model.load_dict(
- CLIPTextModel.smart_convert(text_model_dict, text_model))
+ text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model))
return text_model
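For orientation: OpenCLIP packs q/k/v into a single in_proj_weight of shape [3 * d_model, d_model], and the loop above slices it back into three [d_model, d_model] projections. A toy version with an assumed d_model (not part of the patch):

# Toy illustration of splitting a packed in_proj weight into q/k/v.
import numpy as np

d_model = 4
in_proj_weight = np.arange(3 * d_model * d_model).reshape(3 * d_model, d_model)

q_proj = in_proj_weight[:d_model, :]
k_proj = in_proj_weight[d_model : d_model * 2, :]
v_proj = in_proj_weight[d_model * 2 :, :]
assert q_proj.shape == k_proj.shape == v_proj.shape == (d_model, d_model)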
def load_pipeline_from_original_stable_diffusion_ckpt(
- checkpoint_path: str,
- original_config_file: str=None,
- image_size: int=512,
- prediction_type: str=None,
- model_type: str=None,
- extract_ema: bool=False,
- scheduler_type: str="pndm",
- num_in_channels: Optional[int]=None,
- upcast_attention: Optional[bool]=None,
- paddle_dtype: Optional[bool]=None,
- requires_safety_checker: bool=False,
- controlnet: Optional[bool]=None,
- cls=None,
- **kwargs, ) -> StableDiffusionPipeline:
+ checkpoint_path: str,
+ original_config_file: str = None,
+ image_size: int = 512,
+ prediction_type: str = None,
+ model_type: str = None,
+ extract_ema: bool = False,
+ scheduler_type: str = "pndm",
+ num_in_channels: Optional[int] = None,
+ upcast_attention: Optional[bool] = None,
+ paddle_dtype: Optional[bool] = None,
+ requires_safety_checker: bool = False,
+ controlnet: Optional[bool] = None,
+ cls=None,
+ **kwargs,
+) -> StableDiffusionPipeline:
"""
Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
config file.
@@ -1079,8 +983,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
from omegaconf import OmegaConf
- checkpoint = smart_load(
- checkpoint_path, return_numpy=True, return_global_step=True)
+ checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True)
global_step = int(checkpoint.pop("global_step", -1))
@@ -1106,8 +1009,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
original_config_file = os.path.join(tmpdir, "inference.yaml")
- if key_name in checkpoint and checkpoint[key_name].shape[
- -1] == 1024:
+ if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
if not os.path.isfile("v2-inference-v.yaml"):
# model_type = "v2"
r = requests.get(
@@ -1129,11 +1031,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
original_config = OmegaConf.load(original_config_file)
if num_in_channels is not None:
- original_config["model"]["params"]["unet_config"]["params"][
- "in_channels"] = num_in_channels
+ original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
- if ("parameterization" in original_config["model"]["params"] and
- original_config["model"]["params"]["parameterization"] == "v"):
+ if (
+ "parameterization" in original_config["model"]["params"]
+ and original_config["model"]["params"]["parameterization"] == "v"
+ ):
if prediction_type is None:
# NOTE: For Stable Diffusion 2 base it is recommended to pass `prediction_type="epsilon"`
# as it relies on a brittle global step parameter here
@@ -1160,7 +1063,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
steps_offset=1,
clip_sample=False,
set_alpha_to_one=False,
- prediction_type=prediction_type, )
+ prediction_type=prediction_type,
+ )
# make sure scheduler works correctly with DDIM
scheduler.register_to_config(clip_sample=False)
@@ -1175,8 +1079,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
elif scheduler_type == "euler":
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
elif scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- scheduler.config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
elif scheduler_type == "dpm":
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
elif scheduler_type == "ddim":
@@ -1185,44 +1088,35 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
# Convert the UNet2DConditionModel model.
- unet_config = create_unet_diffusers_config(
- original_config, image_size=image_size)
+ unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
unet_config["upcast_attention"] = upcast_attention
unet = UNet2DConditionModel(**unet_config)
unet.eval()
converted_unet_checkpoint = convert_ldm_unet_checkpoint(
- checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema)
- unet.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(unet,
- converted_unet_checkpoint))
+ checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
+ )
+ unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint))
# Convert the VAE model.
- vae_config = create_vae_diffusers_config(
- original_config, image_size=image_size)
- converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint,
- vae_config)
+ vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL(**vae_config)
vae.eval()
- vae.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(vae,
- converted_vae_checkpoint))
+ vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint))
# Convert the text model.
if model_type is None:
- model_type = original_config.model.params.cond_stage_config.target.split(
- ".")[-1]
- logger.debug(
- f"no `model_type` given, `model_type` inferred as: {model_type}")
+ model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+ logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
if controlnet is None:
controlnet = "control_stage_config" in original_config.model.params
if model_type == "FrozenOpenCLIPEmbedder":
text_model = convert_open_clip_checkpoint(checkpoint)
- tokenizer = CLIPTokenizer.from_pretrained(
- "stabilityai/stable-diffusion-2/tokenizer")
+ tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer")
if paddle_dtype is not None:
vae.to(dtype=paddle_dtype)
@@ -1231,8 +1125,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
if controlnet:
# Convert the ControlNetModel model.
- ctrlnet_config = create_unet_diffusers_config(
- original_config, image_size=image_size, controlnet=True)
+ ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
ctrlnet_config["upcast_attention"] = upcast_attention
ctrlnet_config.pop("sample_size")
@@ -1245,10 +1138,11 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
ctrlnet_config,
path=checkpoint_path,
extract_ema=extract_ema,
- controlnet=True, )
+ controlnet=True,
+ )
controlnet_model.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(
- controlnet_model, converted_ctrl_checkpoint))
+ convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint)
+ )
if paddle_dtype is not None:
controlnet_model.to(dtype=paddle_dtype)
@@ -1262,7 +1156,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
else:
pipe = cls(
vae=vae,
@@ -1272,17 +1167,19 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
elif model_type == "FrozenCLIPEmbedder":
text_model = convert_ldm_clip_checkpoint(checkpoint)
- tokenizer = CLIPTokenizer.from_pretrained(
- "CompVis/stable-diffusion-v1-4/tokenizer")
+ tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4/tokenizer")
if requires_safety_checker:
safety_checker = StableDiffusionSafetyChecker.from_pretrained(
- "CompVis/stable-diffusion-v1-4", subfolder="safety_checker")
+ "CompVis/stable-diffusion-v1-4", subfolder="safety_checker"
+ )
feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor")
+ "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor"
+ )
else:
safety_checker = feature_extractor = None
@@ -1295,8 +1192,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
if controlnet:
# Convert the ControlNetModel model.
- ctrlnet_config = create_unet_diffusers_config(
- original_config, image_size=image_size, controlnet=True)
+ ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
ctrlnet_config["upcast_attention"] = upcast_attention
ctrlnet_config.pop("sample_size")
@@ -1309,10 +1205,11 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
ctrlnet_config,
path=checkpoint_path,
extract_ema=extract_ema,
- controlnet=True, )
+ controlnet=True,
+ )
controlnet_model.load_dict(
- convert_diffusers_vae_unet_to_ppdiffusers(
- controlnet_model, converted_ctrl_checkpoint))
+ convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint)
+ )
if paddle_dtype is not None:
controlnet_model.to(dtype=paddle_dtype)
@@ -1326,7 +1223,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- requires_safety_checker=requires_safety_checker, )
+ requires_safety_checker=requires_safety_checker,
+ )
else:
pipe = cls(
vae=vae,
@@ -1336,12 +1234,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
- requires_safety_checker=requires_safety_checker, )
+ requires_safety_checker=requires_safety_checker,
+ )
else:
text_config = create_ldm_bert_config(original_config)
text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
- tokenizer = BertTokenizer.from_pretrained(
- "bert-base-uncased", model_max_length=77)
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77)
if paddle_dtype is not None:
vae.to(dtype=paddle_dtype)
text_model.to(dtype=paddle_dtype)
@@ -1351,6 +1249,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
bert=text_model,
tokenizer=tokenizer,
unet=unet,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
return pipe
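A hypothetical call to the loader defined above might look like the following; the checkpoint path, dtype and keyword values are placeholders rather than files or settings shipped with this example:

# Hypothetical usage sketch of load_pipeline_from_original_stable_diffusion_ckpt; the .ckpt path is a placeholder.
import paddle

pipe = load_pipeline_from_original_stable_diffusion_ckpt(
    checkpoint_path="./v1-5-pruned-emaonly.ckpt",
    scheduler_type="ddim",
    extract_ema=True,
    paddle_dtype=paddle.float16,
)
image = pipe(prompt="an astronaut riding a horse").images[0]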
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py
index 29d9afb9eef79..5b406410e76aa 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py
@@ -26,9 +26,15 @@
from paddle.distributed.fleet.utils import recompute
from paddlenlp.transformers.activations import ACT2FN
from paddlenlp.transformers.clip.configuration import (
- CLIPConfig, CLIPTextConfig, CLIPVisionConfig)
+ CLIPConfig,
+ CLIPTextConfig,
+ CLIPVisionConfig,
+)
from paddlenlp.transformers.model_outputs import (
- BaseModelOutput, BaseModelOutputWithPooling, ModelOutput)
+ BaseModelOutput,
+ BaseModelOutputWithPooling,
+ ModelOutput,
+)
from paddlenlp.transformers.model_utils import PretrainedModel
from ppdiffusers.initializer import normal_, ones_
@@ -39,7 +45,7 @@
]
-def finfo(dtype: paddle.dtype=None):
+def finfo(dtype: paddle.dtype = None):
if dtype is None:
dtype = paddle.get_default_dtype()
@@ -58,10 +64,7 @@ class BFloatFInfo:
def Parameter(data: paddle.Tensor, requires_grad=True):
- tensor = paddle.create_parameter(
- data.shape,
- dtype=data.dtype,
- default_initializer=nn.initializer.Assign(data))
+ tensor = paddle.create_parameter(data.shape, dtype=data.dtype, default_initializer=nn.initializer.Assign(data))
if not requires_grad:
tensor.stop_gradient = True
return tensor
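The Parameter helper above plays the role of torch.nn.Parameter on top of paddle.create_parameter; a hypothetical use:

# Hypothetical usage of the Parameter helper defined above.
import paddle

data = paddle.randn([768])
class_embedding = Parameter(data)                         # trainable, initialized from data
frozen_embedding = Parameter(data, requires_grad=False)   # stop_gradient=True, excluded from training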
@@ -74,13 +77,14 @@ class TorchLinear(nn.Layer):
"""
def __init__(
- self,
- in_features,
- out_features,
- weight_attr=None,
- bias_attr=None,
- name=None,
- bias=None, ):
+ self,
+ in_features,
+ out_features,
+ weight_attr=None,
+ bias_attr=None,
+ name=None,
+ bias=None,
+ ):
super().__init__()
self._dtype = self._helper.get_default_dtype()
self._weight_attr = weight_attr
@@ -96,23 +100,25 @@ def __init__(
], # regular linear has shape [in_features, out_features]
attr=self._weight_attr,
dtype=self._dtype,
- is_bias=False, )
+ is_bias=False,
+ )
self.bias = self.create_parameter(
shape=[out_features],
attr=self._bias_attr,
dtype=self._dtype,
- is_bias=True, )
+ is_bias=True,
+ )
self.name = name
def forward(self, input):
- out = F.linear(
- x=input, weight=self.weight.T, bias=self.bias, name=self.name)
+ out = F.linear(x=input, weight=self.weight.T, bias=self.bias, name=self.name)
return out
def extra_repr(self):
name_str = ", name={}".format(self.name) if self.name else ""
return "in_features={}, out_features={}, dtype={}{}".format(
- self.weight.shape[1], self.weight.shape[0], self._dtype, name_str)
+ self.weight.shape[1], self.weight.shape[0], self._dtype, name_str
+ )
def str2bool(v):
@@ -139,20 +145,18 @@ def masked_fill(x, mask, value):
return paddle.where(mask, y, x)
-def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int]=None):
+def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.shape
tgt_len = tgt_len if tgt_len is not None else src_len
- expanded_mask = (
- mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype))
+ expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype)
inverted_mask = 1.0 - expanded_mask
- return masked_fill(inverted_mask,
- inverted_mask.cast(paddle.bool), finfo(dtype).min)
+ return masked_fill(inverted_mask, inverted_mask.cast(paddle.bool), finfo(dtype).min)
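As a quick shape check (toy values, not part of the patch): _expand_mask broadcasts a [bsz, seq_len] padding mask to [bsz, 1, tgt_len, src_len] and, via masked_fill, turns padded positions into the dtype's most negative value so that softmax effectively ignores them:

# Toy shape check for the mask expansion above; values are illustrative.
import paddle

mask = paddle.to_tensor([[1, 1, 0]], dtype="float32")   # [bsz=1, src_len=3]; 0 marks padding
expanded = mask[:, None, None, :].expand([1, 1, 3, 3])  # [bsz, 1, tgt_len, src_len]
inverted = 1.0 - expanded                               # 1.0 exactly where the token is padding
print(inverted.shape)  # [1, 1, 3, 3]; masked_fill then writes finfo(dtype).min into those slots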
# contrastive loss function, adapted from
@@ -256,9 +260,10 @@ class HFCLIPOutput(ModelOutput):
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
- return tuple(self[k]
- if k not in ["text_model_output", "vision_model_output"]
- else getattr(self, k).to_tuple() for k in self.keys())
+ return tuple(
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+ for k in self.keys()
+ )
class HFCLIPVisionEmbeddings(nn.Layer):
@@ -269,30 +274,29 @@ def __init__(self, config: CLIPVisionConfig):
self.image_size = config.image_size
self.patch_size = config.patch_size
- self.class_embedding = Parameter(paddle.randn((self.embed_dim, )))
+ self.class_embedding = Parameter(paddle.randn((self.embed_dim,)))
self.patch_embedding = nn.Conv2D(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
- bias_attr=False, )
+ bias_attr=False,
+ )
- self.num_patches = (self.image_size // self.patch_size)**2
+ self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
- self.position_embedding = nn.Embedding(self.num_positions,
- self.embed_dim)
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer(
"position_ids",
- paddle.arange(self.num_positions).expand(
- (1, -1), dtype="int64"),
- persistable=False, )
+ paddle.arange(self.num_positions).expand((1, -1), dtype="int64"),
+ persistable=False,
+ )
def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
- patch_embeds = self.patch_embedding(
- pixel_values.cast(target_dtype)) # shape = [*, width, grid, grid]
+ patch_embeds = self.patch_embedding(pixel_values.cast(target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1])
class_embeds = self.class_embedding.expand([batch_size, 1, -1])
@@ -307,23 +311,22 @@ def __init__(self, config: CLIPTextConfig):
embed_dim = config.hidden_size
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
- self.position_embedding = nn.Embedding(config.max_position_embeddings,
- embed_dim)
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids",
- paddle.arange(
- config.max_position_embeddings, dtype="int64").expand((1, -1)),
- persistable=False, )
+ paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)),
+ persistable=False,
+ )
def forward(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- inputs_embeds: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
- seq_length = (input_ids.shape[-1]
- if input_ids is not None else inputs_embeds.shape[-2])
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ inputs_embeds: Optional[paddle.Tensor] = None,
+ ) -> paddle.Tensor:
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
@@ -349,7 +352,8 @@ def __init__(self, config):
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
- f" {self.num_heads}).")
+ f" {self.num_heads})."
+ )
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
@@ -359,18 +363,15 @@ def __init__(self, config):
self.out_proj = LinearClass(self.embed_dim, self.embed_dim)
def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
- return tensor.reshape(
- [bsz, seq_len, self.num_heads, self.head_dim]).transpose(
- [0, 2, 1, 3])
+ return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
def forward(
- self,
- hidden_states: paddle.Tensor,
- attention_mask: Optional[paddle.Tensor]=None,
- causal_attention_mask: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=False, ) -> Tuple[
- paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[
- paddle.Tensor]]]:
+ self,
+ hidden_states: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ causal_attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = hidden_states.shape
@@ -381,8 +382,7 @@ def forward(
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
- query_states = self._shape(query_states, tgt_len,
- bsz).reshape(proj_shape)
+ query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape)
key_states = key_states.reshape(proj_shape)
value_states = value_states.reshape(proj_shape)
@@ -392,29 +392,26 @@ def forward(
if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]:
raise ValueError(
f"Attention weights should be of size {[bsz * self.num_heads, tgt_len, src_len]}, but is"
- f" {attn_weights.shape}")
+ f" {attn_weights.shape}"
+ )
# apply the causal_attention_mask first
if causal_attention_mask is not None:
if causal_attention_mask.shape != [bsz, 1, tgt_len, src_len]:
raise ValueError(
f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is"
- f" {causal_attention_mask.shape}")
- attn_weights = (
- attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) +
- causal_attention_mask)
- attn_weights = attn_weights.reshape(
- [bsz * self.num_heads, tgt_len, src_len])
+ f" {causal_attention_mask.shape}"
+ )
+ attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + causal_attention_mask
+ attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len])
if attention_mask is not None:
if attention_mask.shape != [bsz, 1, tgt_len, src_len]:
raise ValueError(
f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}"
)
- attn_weights = (attn_weights.reshape(
- [bsz, self.num_heads, tgt_len, src_len]) + attention_mask)
- attn_weights = attn_weights.reshape(
- [bsz * self.num_heads, tgt_len, src_len])
+ attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask
+ attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len])
attn_weights = F.softmax(attn_weights, axis=-1)
@@ -423,25 +420,22 @@ def forward(
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
- attn_weights_reshaped = attn_weights.reshape(
- [bsz, self.num_heads, tgt_len, src_len])
- attn_weights = attn_weights_reshaped.reshape(
- [bsz * self.num_heads, tgt_len, src_len])
+ attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
+ attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len])
else:
attn_weights_reshaped = None
- attn_probs = F.dropout(
- attn_weights, p=self.dropout, training=self.training)
+ attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = paddle.matmul(attn_probs, value_states)
if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]:
raise ValueError(
f"`attn_output` should be of size {[bsz, self.num_heads, tgt_len, self.head_dim]}, but is"
- f" {attn_output.shape}")
+ f" {attn_output.shape}"
+ )
- attn_output = attn_output.reshape(
- [bsz, self.num_heads, tgt_len, self.head_dim])
+ attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim])
attn_output = attn_output.transpose([0, 2, 1, 3])
attn_output = attn_output.reshape([bsz, tgt_len, embed_dim])
@@ -470,18 +464,17 @@ def __init__(self, config: CLIPTextConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = HFCLIPAttention(config)
- self.layer_norm1 = nn.LayerNorm(
- self.embed_dim, epsilon=config.layer_norm_eps)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
self.mlp = HFCLIPMLP(config)
- self.layer_norm2 = nn.LayerNorm(
- self.embed_dim, epsilon=config.layer_norm_eps)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
def forward(
- self,
- hidden_states: paddle.Tensor,
- attention_mask: paddle.Tensor,
- causal_attention_mask: paddle.Tensor,
- output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]:
+ self,
+ hidden_states: paddle.Tensor,
+ attention_mask: paddle.Tensor,
+ causal_attention_mask: paddle.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[paddle.Tensor]:
"""
Args:
hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -499,7 +492,8 @@ def forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
- output_attentions=output_attentions, )
+ output_attentions=output_attentions,
+ )
hidden_states = residual + hidden_states
residual = hidden_states
@@ -507,10 +501,10 @@ def forward(
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
- outputs = (hidden_states, )
+ outputs = (hidden_states,)
if output_attentions:
- outputs += (attn_weights, )
+ outputs += (attn_weights,)
return outputs
@@ -531,24 +525,21 @@ def _init_weights(self, module):
factor = self.config.initializer_factor
if isinstance(module, HFCLIPTextEmbeddings):
normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
- normal_(
- module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+ normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
elif isinstance(module, HFCLIPVisionEmbeddings):
factor = self.config.initializer_factor
- normal_(
- module.class_embedding,
- mean=0.0,
- std=module.embed_dim**-0.5 * factor)
+ normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
normal_(
module.patch_embedding.weight,
- std=module.config.initializer_range * factor, )
+ std=module.config.initializer_range * factor,
+ )
normal_(
module.position_embedding.weight,
- std=module.config.initializer_range * factor, )
+ std=module.config.initializer_range * factor,
+ )
elif isinstance(module, HFCLIPAttention):
factor = self.config.initializer_factor
- in_proj_std = ((module.embed_dim**-0.5) * (
- (2 * module.config.num_hidden_layers)**-0.5) * factor)
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
normal_(module.q_proj.weight, std=in_proj_std)
normal_(module.k_proj.weight, std=in_proj_std)
@@ -556,30 +547,31 @@ def _init_weights(self, module):
normal_(module.out_proj.weight, std=out_proj_std)
elif isinstance(module, HFCLIPMLP):
factor = self.config.initializer_factor
- in_proj_std = ((module.config.hidden_size**-0.5) * (
- (2 * module.config.num_hidden_layers)**-0.5) * factor)
- fc_std = (2 * module.config.hidden_size)**-0.5 * factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
normal_(module.fc1.weight, std=fc_std)
normal_(module.fc2.weight, std=in_proj_std)
elif isinstance(module, HFCLIPModel):
normal_(
module.text_projection.weight,
- std=module.text_embed_dim
- **-0.5 * self.config.initializer_factor, )
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
normal_(
module.visual_projection.weight,
- std=module.vision_embed_dim
- **-0.5 * self.config.initializer_factor, )
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
elif isinstance(module, HFCLIPVisionModelWithProjection):
normal_(
module.visual_projection.weight,
- std=self.config.hidden_size
- **-0.5 * self.config.initializer_factor, )
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
elif isinstance(module, HFCLIPTextModelWithProjection):
normal_(
module.text_projection.weight,
- std=self.config.hidden_size
- **-0.5 * self.config.initializer_factor, )
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
if isinstance(module, nn.LayerNorm):
module.bias.zero_()
@@ -599,9 +591,7 @@ def gradient_checkpointing_enable(self):
activations".
"""
if not self.supports_gradient_checkpointing:
- raise ValueError(
- f"{self.__class__.__name__} does not support gradient checkpointing."
- )
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
self.apply(partial(self._set_gradient_checkpointing, value=True))
def gradient_checkpointing_disable(self):
@@ -627,8 +617,7 @@ def register_load_torch_hook(self, function=None):
def map_from(module, state_dict, *args, **kwargs):
if state_dict.pop("is_torch_weight", False):
need_transposed = []
- for name, layer in module.named_sublayers(
- include_self=True):
+ for name, layer in module.named_sublayers(include_self=True):
if isinstance(layer, nn.Linear):
need_transposed.append(name + ".weight")
module.need_transposed = need_transposed
@@ -637,8 +626,7 @@ def map_from(module, state_dict, *args, **kwargs):
else:
map_from = function
- self.load_torch_hook = self.register_load_state_dict_pre_hook(
- map_from, with_module=True)
+ self.load_torch_hook = self.register_load_state_dict_pre_hook(map_from, with_module=True)
return self.load_torch_hook
def remove_load_torch_hook(self):
@@ -651,7 +639,8 @@ def to(self=None, device=None, dtype=None, blocking=None):
dtype=dtype,
blocking=blocking,
include_sublayers=True,
- floating_only=True, )
+ floating_only=True,
+ )
class HFCLIPEncoder(nn.Layer):
@@ -666,20 +655,18 @@ class HFCLIPEncoder(nn.Layer):
def __init__(self, config: CLIPConfig):
super().__init__()
self.config = config
- self.layers = nn.LayerList([
- HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)
- ])
+ self.layers = nn.LayerList([HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
- self,
- inputs_embeds,
- attention_mask: Optional[paddle.Tensor]=None,
- causal_attention_mask: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[Tuple,
- BaseModelOutput]:
+ self,
+ inputs_embeds,
+ attention_mask: Optional[paddle.Tensor] = None,
+ causal_attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -709,13 +696,11 @@ def forward(
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
- output_attentions = (output_attentions if output_attentions is not None
- else self.config.output_attentions)
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@@ -723,7 +708,7 @@ def forward(
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
- encoder_states = encoder_states + (hidden_states, )
+ encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
@@ -736,30 +721,31 @@ def custom_forward(*inputs):
create_custom_forward(encoder_layer),
hidden_states,
attention_mask,
- causal_attention_mask, )
+ causal_attention_mask,
+ )
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
- output_attentions=output_attentions, )
+ output_attentions=output_attentions,
+ )
hidden_states = layer_outputs[0]
if output_attentions:
- all_attentions = all_attentions + (layer_outputs[1], )
+ all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
- encoder_states = encoder_states + (hidden_states, )
+ encoder_states = encoder_states + (hidden_states,)
if not return_dict:
- return tuple(
- v for v in [hidden_states, encoder_states, all_attentions]
- if v is not None)
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_states,
- attentions=all_attentions, )
+ attentions=all_attentions,
+ )
# def _make_causal_mask(
@@ -786,31 +772,28 @@ def __init__(self, config: CLIPTextConfig):
embed_dim = config.hidden_size
self.embeddings = HFCLIPTextEmbeddings(config)
self.encoder = HFCLIPEncoder(config)
- self.final_layer_norm = nn.LayerNorm(
- embed_dim, epsilon=config.layer_norm_eps)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
# For `pooled_output` computation
self.eos_token_id = config.eos_token_id
def forward(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[
- Tuple, BaseModelOutputWithPooling]:
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
- output_attentions = (output_attentions if output_attentions is not None
- else self.config.output_attentions)
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is None:
raise ValueError("You have to specify either input_ids")
@@ -818,8 +801,7 @@ def forward(
input_shape = input_ids.shape
input_ids = input_ids.reshape([-1, input_shape[-1]])
- hidden_states = self.embeddings(
- input_ids=input_ids, position_ids=position_ids)
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
bsz, seq_len = input_shape
# CLIP's text model uses causal mask, prepare it here.
@@ -828,7 +810,8 @@ def forward(
causal_attention_mask = self._build_causal_attention_mask(
bsz,
seq_len,
- hidden_states.dtype, )
+ hidden_states.dtype,
+ )
# expand attention_mask
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -840,7 +823,8 @@ def forward(
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.final_layer_norm(last_hidden_state)
@@ -855,24 +839,24 @@ def forward(
pooled_output = last_hidden_state.gather_nd(
paddle.stack(
[
- paddle.arange(
- last_hidden_state.shape[0], dtype="int32"),
- input_ids.argmax(
- -1, dtype="int32"),
+ paddle.arange(last_hidden_state.shape[0], dtype="int32"),
+ input_ids.argmax(-1, dtype="int32"),
],
- axis=-1, ))
+ axis=-1,
+ )
+ )
else:
# The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
# We need to get the first position of `eos_token_id` value (`pad_token_ids` might be equal to `eos_token_id`)
pooled_output = last_hidden_state.gather_nd(
paddle.stack(
[
- paddle.arange(
- last_hidden_state.shape[0], dtype="int32"),
- (input_ids == self.eos_token_id).cast("int32").argmax(
- axis=-1, dtype="int32"),
+ paddle.arange(last_hidden_state.shape[0], dtype="int32"),
+ (input_ids == self.eos_token_id).cast("int32").argmax(axis=-1, dtype="int32"),
],
- axis=-1, ))
+ axis=-1,
+ )
+ )
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
@@ -881,12 +865,14 @@ def forward(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions, )
+ attentions=encoder_outputs.attentions,
+ )
def _build_causal_attention_mask(self, bsz, seq_len, dtype):
mask = paddle.triu(
paddle.full((bsz, 1, seq_len, seq_len), finfo(dtype).min),
- diagonal=1, )
+ diagonal=1,
+ )
return mask
@@ -908,14 +894,14 @@ def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
def forward(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[
- Tuple, BaseModelOutputWithPooling]:
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
@@ -933,8 +919,7 @@ def forward(
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```"""
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.text_model(
input_ids=input_ids,
@@ -942,7 +927,8 @@ def forward(
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
class HFCLIPVisionTransformer(nn.Layer):
@@ -952,30 +938,26 @@ def __init__(self, config: CLIPVisionConfig):
embed_dim = config.hidden_size
self.embeddings = HFCLIPVisionEmbeddings(config)
- self.pre_layrnorm = nn.LayerNorm(
- embed_dim, epsilon=config.layer_norm_eps)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
self.encoder = HFCLIPEncoder(config)
- self.post_layernorm = nn.LayerNorm(
- embed_dim, epsilon=config.layer_norm_eps)
+ self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
def forward(
- self,
- pixel_values: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[
- Tuple, BaseModelOutputWithPooling]:
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
- output_attentions = (output_attentions if output_attentions is not None
- else self.config.output_attentions)
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
@@ -987,7 +969,8 @@ def forward(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
@@ -1000,7 +983,8 @@ def forward(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions, )
+ attentions=encoder_outputs.attentions,
+ )
class HFCLIPVisionModel(HFCLIPPretrainedModel):
@@ -1017,12 +1001,12 @@ def get_input_embeddings(self) -> nn.Layer:
return self.vision_model.embeddings.patch_embedding
def forward(
- self,
- pixel_values: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[
- Tuple, BaseModelOutputWithPooling]:
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
@@ -1045,14 +1029,14 @@ def forward(
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled CLS states
```"""
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
class HFCLIPModel(HFCLIPPretrainedModel):
@@ -1064,12 +1048,14 @@ def __init__(self, config: CLIPConfig):
if not isinstance(config.text_config, CLIPTextConfig):
raise ValueError(
"config.text_config is expected to be of type CLIPTextConfig but is of type"
- f" {type(config.text_config)}.")
+ f" {type(config.text_config)}."
+ )
if not isinstance(config.vision_config, CLIPVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type CLIPVisionConfig but is of type"
- f" {type(config.vision_config)}.")
+ f" {type(config.vision_config)}."
+ )
text_config = config.text_config
vision_config = config.vision_config
@@ -1081,24 +1067,22 @@ def __init__(self, config: CLIPConfig):
self.text_model = HFCLIPTextTransformer(text_config)
self.vision_model = HFCLIPVisionTransformer(vision_config)
- self.visual_projection = LinearClass(
- self.vision_embed_dim, self.projection_dim, bias_attr=False)
- self.text_projection = LinearClass(
- self.text_embed_dim, self.projection_dim, bias_attr=False)
- self.logit_scale = Parameter(
- paddle.to_tensor(self.config.logit_scale_init_value))
+ self.visual_projection = LinearClass(self.vision_embed_dim, self.projection_dim, bias_attr=False)
+ self.text_projection = LinearClass(self.text_embed_dim, self.projection_dim, bias_attr=False)
+ self.logit_scale = Parameter(paddle.to_tensor(self.config.logit_scale_init_value))
# Initialize weights and apply final processing
self.post_init()
def get_text_features(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> paddle.Tensor:
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> paddle.Tensor:
r"""
Returns:
text_features (`paddle.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
@@ -1116,13 +1100,11 @@ def get_text_features(
>>> text_features = model.get_text_features(**inputs)
```"""
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
- output_attentions = (output_attentions if output_attentions is not None
- else self.config.output_attentions)
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.text_model(
input_ids=input_ids,
@@ -1130,7 +1112,8 @@ def get_text_features(
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
pooled_output = text_outputs[1]
text_features = self.text_projection(pooled_output)
@@ -1138,11 +1121,12 @@ def get_text_features(
return text_features
def get_image_features(
- self,
- pixel_values: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> paddle.Tensor:
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> paddle.Tensor:
r"""
Returns:
image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
@@ -1166,19 +1150,18 @@ def get_image_features(
>>> image_features = model.get_image_features(**inputs)
```"""
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
- output_attentions = (output_attentions if output_attentions is not None
- else self.config.output_attentions)
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
pooled_output = vision_outputs[1] # pooled_output
image_features = self.visual_projection(pooled_output)
@@ -1186,15 +1169,16 @@ def get_image_features(
return image_features
def forward(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- pixel_values: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- return_loss: Optional[bool]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[Tuple, HFCLIPOutput]:
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ pixel_values: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, HFCLIPOutput]:
r"""
Returns:
@@ -1221,19 +1205,18 @@ def forward(
>>> probs = F.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
```"""
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
- output_attentions = (output_attentions if output_attentions is not None
- else self.config.output_attentions)
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
text_outputs = self.text_model(
input_ids=input_ids,
@@ -1241,7 +1224,8 @@ def forward(
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
image_embeds = vision_outputs[1]
image_embeds = self.visual_projection(image_embeds)
@@ -1250,14 +1234,12 @@ def forward(
text_embeds = self.text_projection(text_embeds)
# normalized features
- image_embeds = image_embeds / image_embeds.norm(
- p=2, axis=-1, keepdim=True)
+ image_embeds = image_embeds / image_embeds.norm(p=2, axis=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
- logits_per_text = paddle.matmul(text_embeds,
- image_embeds.t()) * logit_scale
+ logits_per_text = paddle.matmul(text_embeds, image_embeds.t()) * logit_scale
logits_per_image = logits_per_text.t()
loss = None
@@ -1271,8 +1253,9 @@ def forward(
text_embeds,
image_embeds,
text_outputs,
- vision_outputs, )
- return ((loss, ) + output) if loss is not None else output
+ vision_outputs,
+ )
+ return ((loss,) + output) if loss is not None else output
return HFCLIPOutput(
loss=loss,
@@ -1281,7 +1264,8 @@ def forward(
text_embeds=text_embeds,
image_embeds=image_embeds,
text_model_output=text_outputs,
- vision_model_output=vision_outputs, )
+ vision_model_output=vision_outputs,
+ )
class HFCLIPTextModelWithProjection(HFCLIPPretrainedModel):
@@ -1294,8 +1278,7 @@ def __init__(self, config: CLIPTextConfig):
self.text_model = HFCLIPTextTransformer(config)
- self.text_projection = LinearClass(
- config.hidden_size, config.projection_dim, bias_attr=False)
+ self.text_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False)
# Initialize weights and apply final processing
self.post_init()
@@ -1307,14 +1290,14 @@ def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
def forward(
- self,
- input_ids: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- position_ids: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[Tuple,
- HFCLIPTextModelOutput]:
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, HFCLIPTextModelOutput]:
r"""
Returns:
@@ -1331,8 +1314,7 @@ def forward(
>>> outputs = model(**inputs)
>>> text_embeds = outputs.text_embeds
```"""
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.text_model(
input_ids=input_ids,
@@ -1340,7 +1322,8 @@ def forward(
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
pooled_output = text_outputs[1]
@@ -1354,7 +1337,8 @@ def forward(
text_embeds=text_embeds,
last_hidden_state=text_outputs.last_hidden_state,
hidden_states=text_outputs.hidden_states,
- attentions=text_outputs.attentions, )
+ attentions=text_outputs.attentions,
+ )
class HFCLIPVisionModelWithProjection(HFCLIPPretrainedModel):
@@ -1366,8 +1350,7 @@ def __init__(self, config: CLIPVisionConfig):
self.vision_model = HFCLIPVisionTransformer(config)
- self.visual_projection = LinearClass(
- config.hidden_size, config.projection_dim, bias_attr=False)
+ self.visual_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False)
# Initialize weights and apply final processing
self.post_init()
@@ -1376,12 +1359,12 @@ def get_input_embeddings(self) -> nn.Layer:
return self.vision_model.embeddings.patch_embedding
def forward(
- self,
- pixel_values: Optional[paddle.Tensor]=None,
- output_attentions: Optional[bool]=None,
- output_hidden_states: Optional[bool]=None,
- return_dict: Optional[bool]=None, ) -> Union[
- Tuple, HFCLIPVisionModelOutput]:
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, HFCLIPVisionModelOutput]:
r"""
Returns:
@@ -1403,14 +1386,14 @@ def forward(
>>> outputs = model(**inputs)
>>> image_embeds = outputs.image_embeds
```"""
- return_dict = (return_dict if return_dict is not None else
- self.config.use_return_dict)
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
pooled_output = vision_outputs[1] # pooled_output
@@ -1424,4 +1407,5 @@ def forward(
image_embeds=image_embeds,
last_hidden_state=vision_outputs.last_hidden_state,
hidden_states=vision_outputs.hidden_states,
- attentions=vision_outputs.attentions, )
+ attentions=vision_outputs.attentions,
+ )
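
As an aside before the next file: the hunk above in HFCLIPModel.forward normalizes the image and text embeddings and scales their dot product by exp(logit_scale). A minimal NumPy sketch of that computation follows; the shapes and the logit_scale initial value are illustrative assumptions, not values taken from this diff.

    import numpy as np

    rng = np.random.default_rng(0)
    batch, dim = 2, 512  # assumed toy sizes
    image_embeds = rng.standard_normal((batch, dim)).astype(np.float32)
    text_embeds = rng.standard_normal((batch, dim)).astype(np.float32)

    # normalized features (same normalization as in the hunk above)
    image_embeds /= np.linalg.norm(image_embeds, axis=-1, keepdims=True)
    text_embeds /= np.linalg.norm(text_embeds, axis=-1, keepdims=True)

    # cosine similarity as logits, scaled by exp(logit_scale)
    logit_scale = np.exp(2.6592)  # CLIP's common init value, assumed here
    logits_per_text = text_embeds @ image_embeds.T * logit_scale
    logits_per_image = logits_per_text.T
    print(logits_per_image.shape)  # (2, 2)
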
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index c74cfb57a53b3..80cf9f98c1082 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -20,8 +20,7 @@
import paddle
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...loaders import TextualInversionLoaderMixin
@@ -46,11 +45,7 @@ def preprocess(image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -61,50 +56,46 @@ def preprocess(image):
return image
-def posterior_sample(scheduler, latents, timestep, clean_latents, generator,
- eta):
+def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta):
# 1. get previous step value (=t-1)
- prev_timestep = (timestep - scheduler.config.num_train_timesteps //
- scheduler.num_inference_steps)
+ prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
if prev_timestep <= 0:
return clean_latents
# 2. compute alphas, betas
alpha_prod_t = scheduler.alphas_cumprod[timestep]
- alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else scheduler.final_alpha_cumprod)
+ alpha_prod_t_prev = (
+ scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+ )
variance = scheduler._get_variance(timestep, prev_timestep)
- std_dev_t = eta * variance**(0.5)
+ std_dev_t = eta * variance ** (0.5)
# direction pointing to x_t
- e_t = (latents - alpha_prod_t**
- (0.5) * clean_latents) / (1 - alpha_prod_t)**(0.5)
- dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2)**(0.5) * e_t
- noise = std_dev_t * randn_tensor(
- clean_latents.shape, dtype=clean_latents.dtype, generator=generator)
- prev_latents = alpha_prod_t_prev**(0.5) * clean_latents + dir_xt + noise
+ e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5)
+ dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t
+ noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator)
+ prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise
return prev_latents
def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
# 1. get previous step value (=t-1)
- prev_timestep = (timestep - scheduler.config.num_train_timesteps //
- scheduler.num_inference_steps)
+ prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
# 2. compute alphas, betas
alpha_prod_t = scheduler.alphas_cumprod[timestep]
- alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else scheduler.final_alpha_cumprod)
+ alpha_prod_t_prev = (
+ scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+ )
beta_prod_t = 1 - alpha_prod_t
# 3. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_original_sample = (latents - beta_prod_t**
- (0.5) * noise_pred) / alpha_prod_t**(0.5)
+ pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
# 4. Clip "predicted x_0"
if scheduler.config.clip_sample:
@@ -113,16 +104,14 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
# 5. compute variance: "sigma_t(η)" -> see formula (16)
# σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
variance = scheduler._get_variance(timestep, prev_timestep)
- std_dev_t = eta * variance**(0.5)
+ std_dev_t = eta * variance ** (0.5)
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**(
- 0.5) * noise_pred
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
- noise = (prev_latents -
- (alpha_prod_t_prev**
- (0.5) * pred_original_sample + pred_sample_direction)) / (
- variance**(0.5) * eta)
+ noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / (
+ variance ** (0.5) * eta
+ )
return noise
@@ -156,31 +145,28 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: DDIMScheduler,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: DDIMScheduler,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
@@ -200,12 +186,10 @@ def __init__(
f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -216,12 +200,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -233,18 +214,20 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -284,29 +267,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -314,8 +299,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -325,21 +309,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -347,71 +332,67 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- strength,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+ raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -424,7 +405,8 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
@@ -433,15 +415,13 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@@ -449,11 +429,10 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -470,21 +449,14 @@ def decode_latents(self, latents):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
image = image.cast(dtype)
batch_size = image.shape[0]
@@ -496,8 +468,7 @@ def prepare_latents(self,
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
@@ -505,8 +476,7 @@ def prepare_latents(self,
init_latents = self.vae.config.scaling_factor * init_latents
- if (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] == 0):
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
@@ -518,20 +488,19 @@ def prepare_latents(self,
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_image_per_prompt = batch_size // init_latents.shape[0]
init_latents = paddle.concat(
- [init_latents] * additional_image_per_prompt *
- num_images_per_prompt,
- axis=0, )
- elif (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] != 0):
+ [init_latents] * additional_image_per_prompt * num_images_per_prompt,
+ axis=0,
+ )
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
else:
- init_latents = paddle.concat(
- [init_latents] * num_images_per_prompt, axis=0)
+ init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0)
# add noise to latents using the timestep
shape = init_latents.shape
@@ -546,25 +515,25 @@ def prepare_latents(self,
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- source_prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[paddle.Tensor]=None,
- source_guidance_scale: Optional[float]=1,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.1,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]],
+ source_prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[paddle.Tensor] = None,
+ source_guidance_scale: Optional[float] = 1,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.1,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -639,7 +608,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -661,20 +631,19 @@ def __call__(
do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
source_prompt_embeds = self._encode_prompt(
- source_prompt, num_images_per_prompt, do_classifier_free_guidance,
- None)
+ source_prompt, num_images_per_prompt, do_classifier_free_guidance, None
+ )
# 4. Preprocess image
image = preprocess(image)
# 5. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
latents, clean_latents = self.prepare_latents(
@@ -683,7 +652,8 @@ def __call__(
batch_size,
num_images_per_prompt,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
source_latents = latents
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -691,17 +661,14 @@ def __call__(
generator = extra_step_kwargs.pop("generator", None)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
latent_model_input = paddle.concat([latents] * 2)
source_latent_model_input = paddle.concat([source_latents] * 2)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- source_latent_model_input = self.scheduler.scale_model_input(
- source_latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
# predict the noise residual
concat_latent_model_input = paddle.stack(
@@ -711,7 +678,8 @@ def __call__(
source_latent_model_input[1],
latent_model_input[1],
],
- axis=0, )
+ axis=0,
+ )
concat_prompt_embeds = paddle.stack(
[
source_prompt_embeds[0],
@@ -719,23 +687,25 @@ def __call__(
source_prompt_embeds[1],
prompt_embeds[1],
],
- axis=0, )
+ axis=0,
+ )
concat_noise_pred = self.unet(
concat_latent_model_input,
t,
- encoder_hidden_states=concat_prompt_embeds, ).sample
+ encoder_hidden_states=concat_prompt_embeds,
+ ).sample
# perform guidance
(
source_noise_pred_uncond,
noise_pred_uncond,
source_noise_pred_text,
- noise_pred_text, ) = concat_noise_pred.chunk(
- 4, axis=0)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_text,
+ ) = concat_noise_pred.chunk(4, axis=0)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
- source_noise_pred_text - source_noise_pred_uncond)
+ source_noise_pred_text - source_noise_pred_uncond
+ )
# Sample source_latents from the posterior distribution.
prev_source_latents = posterior_sample(
@@ -744,7 +714,8 @@ def __call__(
t,
clean_latents,
generator=generator,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
# Compute noise.
noise = compute_noise(
self.scheduler,
@@ -752,21 +723,17 @@ def __call__(
source_latents,
t,
source_noise_pred,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
source_latents = prev_source_latents
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- variance_noise=noise,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
+ ).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -775,8 +742,7 @@ def __call__(
image = self.decode_latents(latents)
# 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 11. Convert to PIL
if output_type == "pil":
@@ -785,5 +751,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
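
Before the FastDeploy variant below (which duplicates posterior_sample and compute_noise verbatim), here is a minimal NumPy sketch of the DDIM posterior sampling step those helpers implement; the alphas_cumprod schedule, timesteps, and eta below are toy assumptions for illustration, not values used by the pipeline.

    import numpy as np

    rng = np.random.default_rng(0)
    alphas_cumprod = np.linspace(0.999, 0.01, 1000)  # assumed toy schedule
    t, t_prev, eta = 500, 480, 0.1

    alpha_t = alphas_cumprod[t]
    alpha_prev = alphas_cumprod[t_prev]
    # sigma_t(eta)^2, formula (16) of https://arxiv.org/abs/2010.02502
    variance = (1 - alpha_prev) / (1 - alpha_t) * (1 - alpha_t / alpha_prev)
    std_dev_t = eta * variance ** 0.5

    latents = rng.standard_normal((1, 4, 8, 8)).astype(np.float32)
    clean_latents = rng.standard_normal((1, 4, 8, 8)).astype(np.float32)

    # direction pointing to x_t, then the posterior sample x_{t-1}
    e_t = (latents - alpha_t ** 0.5 * clean_latents) / (1 - alpha_t) ** 0.5
    dir_xt = (1 - alpha_prev - std_dev_t ** 2) ** 0.5 * e_t
    noise = std_dev_t * rng.standard_normal(clean_latents.shape).astype(np.float32)
    prev_latents = alpha_prev ** 0.5 * clean_latents + dir_xt + noise
    print(prev_latents.shape)  # (1, 4, 8, 8)
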
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index 33a4cd8838fe2..31fc2eb7d9db6 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -22,57 +22,52 @@
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import DDIMScheduler
from ...utils import logging, randn_tensor
-from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel
from . import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__)
-def posterior_sample(scheduler, latents, timestep, clean_latents, generator,
- eta):
+def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta):
# 1. get previous step value (=t-1)
- prev_timestep = (timestep - scheduler.config.num_train_timesteps //
- scheduler.num_inference_steps)
+ prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
if prev_timestep <= 0:
return clean_latents
# 2. compute alphas, betas
alpha_prod_t = scheduler.alphas_cumprod[timestep]
- alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else scheduler.final_alpha_cumprod)
+ alpha_prod_t_prev = (
+ scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+ )
variance = scheduler._get_variance(timestep, prev_timestep)
- std_dev_t = eta * variance**(0.5)
+ std_dev_t = eta * variance ** (0.5)
# direction pointing to x_t
- e_t = (latents - alpha_prod_t**
- (0.5) * clean_latents) / (1 - alpha_prod_t)**(0.5)
- dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2)**(0.5) * e_t
- noise = std_dev_t * randn_tensor(
- clean_latents.shape, dtype=clean_latents.dtype, generator=generator)
- prev_latents = alpha_prod_t_prev**(0.5) * clean_latents + dir_xt + noise
+ e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5)
+ dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t
+ noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator)
+ prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise
return prev_latents
def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
# 1. get previous step value (=t-1)
- prev_timestep = (timestep - scheduler.config.num_train_timesteps //
- scheduler.num_inference_steps)
+ prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
# 2. compute alphas, betas
alpha_prod_t = scheduler.alphas_cumprod[timestep]
- alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else scheduler.final_alpha_cumprod)
+ alpha_prod_t_prev = (
+ scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+ )
beta_prod_t = 1 - alpha_prod_t
# 3. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_original_sample = (latents - beta_prod_t**
- (0.5) * noise_pred) / alpha_prod_t**(0.5)
+ pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
# 4. Clip "predicted x_0"
if scheduler.config.clip_sample:
@@ -81,21 +76,18 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
# 5. compute variance: "sigma_t(η)" -> see formula (16)
# σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
variance = scheduler._get_variance(timestep, prev_timestep)
- std_dev_t = eta * variance**(0.5)
+ std_dev_t = eta * variance ** (0.5)
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**(
- 0.5) * noise_pred
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
- noise = (prev_latents -
- (alpha_prod_t_prev**
- (0.5) * pred_original_sample + pred_sample_direction)) / (
- variance**(0.5) * eta)
+ noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / (
+ variance ** (0.5) * eta
+ )
return noise
-class FastDeployCycleDiffusionPipeline(DiffusionPipeline,
- FastDeployDiffusionPipelineMixin):
+class FastDeployCycleDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline for text-guided image to image generation using Stable Diffusion.
@@ -125,16 +117,17 @@ class FastDeployCycleDiffusionPipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: DDIMScheduler,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: DDIMScheduler,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -159,37 +152,38 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
self.change_scheduler("ddim")
def __call__(
- self,
- prompt: Union[str, List[str]],
- source_prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[paddle.Tensor]=None,
- source_guidance_scale: Optional[float]=1,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.1,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]],
+ source_prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[paddle.Tensor] = None,
+ source_guidance_scale: Optional[float] = 1,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.1,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -266,8 +260,7 @@ def __call__(
(nsfw) content, according to the `safety_checker`.
"""
# 0. Preprocess image
- init_image = self.image_processor.preprocess(
- image, height=height, width=width)
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
height, width = init_image.shape[-2:]
# 1. Check inputs
@@ -279,7 +272,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
# 2. Define call parameters
@@ -305,23 +299,23 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
source_prompt_embeds = self._encode_prompt(
source_prompt,
num_images_per_prompt,
do_classifier_free_guidance,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 5. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 6. Prepare latent variables
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
is_strength_max = strength == 1.0
latents, clean_latents = self.prepare_latents(
batch_size * num_images_per_prompt,
@@ -333,7 +327,8 @@ def __call__(
timestep=latent_timestep,
is_strength_max=is_strength_max,
return_image_latents=True,
- infer_op=infer_op_dict.get("vae_encoder", None), )
+ infer_op=infer_op_dict.get("vae_encoder", None),
+ )
source_latents = latents
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -341,18 +336,15 @@ def __call__(
generator = extra_step_kwargs.pop("generator", None)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
latent_model_input = paddle.concat([latents] * 2)
source_latent_model_input = paddle.concat([source_latents] * 2)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- source_latent_model_input = self.scheduler.scale_model_input(
- source_latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
# predict the noise residual
concat_latent_model_input = paddle.stack(
@@ -362,7 +354,8 @@ def __call__(
source_latent_model_input[1],
latent_model_input[1],
],
- axis=0, )
+ axis=0,
+ )
concat_prompt_embeds = paddle.stack(
[
source_prompt_embeds[0],
@@ -370,14 +363,16 @@ def __call__(
source_prompt_embeds[1],
prompt_embeds[1],
],
- axis=0, )
+ axis=0,
+ )
unet_inputs = dict(
sample=concat_latent_model_input,
timestep=t,
encoder_hidden_states=concat_prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=concat_latent_model_input.shape, )
+ output_shape=concat_latent_model_input.shape,
+ )
# predict the noise residual
concat_noise_pred = self.unet(**unet_inputs)[0]
@@ -386,12 +381,12 @@ def __call__(
source_noise_pred_uncond,
noise_pred_uncond,
source_noise_pred_text,
- noise_pred_text, ) = concat_noise_pred.chunk(
- 4, axis=0)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_text,
+ ) = concat_noise_pred.chunk(4, axis=0)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
- source_noise_pred_text - source_noise_pred_uncond)
+ source_noise_pred_text - source_noise_pred_uncond
+ )
# Sample source_latents from the posterior distribution.
prev_source_latents = posterior_sample(
@@ -400,7 +395,8 @@ def __call__(
t,
clean_latents,
generator=generator,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
# Compute noise.
noise = compute_noise(
self.scheduler,
@@ -408,20 +404,16 @@ def __call__(
source_latents,
t,
source_noise_pred,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
source_latents = prev_source_latents
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(
- noise_pred,
- t,
- latents,
- variance_noise=noise,
- **extra_step_kwargs).prev_sample
+ noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
+ ).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -432,7 +424,8 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
image, has_nsfw_concept = self.run_safety_checker(image)
else:
image = latents
@@ -443,11 +436,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
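
posterior_sample and compute_noise above implement the stochastic DDIM update from Eq. (12) of https://arxiv.org/pdf/2010.02502.pdf and its inversion: given x_t, x_{t-1} and the predicted noise, solve for the variance noise the step must have used. A self-contained NumPy sketch of that algebra follows; it uses scalar alphas and made-up values purely for illustration and is not taken from the patch.

import numpy as np

rng = np.random.default_rng(0)
alpha_t, alpha_prev, eta = 0.7, 0.9, 0.1
variance = (1 - alpha_prev) / (1 - alpha_t) * (1 - alpha_t / alpha_prev)  # sigma_t^2, Eq. (16)
sigma = eta * np.sqrt(variance)

x_t = rng.standard_normal(4)
eps = rng.standard_normal(4)                                    # predicted noise
x0 = (x_t - np.sqrt(1 - alpha_t) * eps) / np.sqrt(alpha_t)      # "predicted x_0"
direction = np.sqrt(1 - alpha_prev - sigma**2) * eps            # direction pointing to x_t
z = rng.standard_normal(4)                                      # variance noise
x_prev = np.sqrt(alpha_prev) * x0 + direction + sigma * z       # stochastic DDIM step, Eq. (12)

# compute_noise inverts the step: recover z from (x_prev, x_t, eps)
z_rec = (x_prev - (np.sqrt(alpha_prev) * x0 + direction)) / (np.sqrt(variance) * eta)
assert np.allclose(z, z_rec)
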
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py
index 61110d7638d0f..8de1b7b464dfb 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py
@@ -22,15 +22,13 @@
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging
-from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel
from . import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class FastDeployStableDiffusionPipeline(DiffusionPipeline,
- FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -60,21 +58,20 @@ class FastDeployStableDiffusionPipeline(DiffusionPipeline,
feature_extractor ([`CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
- _optional_components = [
- "vae_encoder", "safety_checker", "feature_extractor"
- ]
+ _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -99,34 +96,35 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -200,7 +198,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
# 2. Define call parameters
@@ -226,7 +225,8 @@ def __call__(
height=height,
batch_size=batch_size,
num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
@@ -238,7 +238,8 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -250,47 +251,42 @@ def __call__(
height,
width,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
if do_controlnet:
unet_inputs["controlnet_cond"] = control_image
- unet_inputs[
- "controlnet_conditioning_scale"] = control_conditioning_scale
+ unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
# compute the previous noisy sample x_t -> x_t-1
@@ -301,15 +297,13 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -320,7 +314,8 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
image, has_nsfw_concept = self.run_safety_checker(image)
else:
image = latents
@@ -331,11 +326,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
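
Every denoising loop in this patch applies the same classifier-free guidance combination after splitting the batched UNet output. A tiny NumPy sketch with made-up numbers (chunk(2) emulated via np.split) shows the arithmetic:

import numpy as np

noise_pred_unet = np.arange(8, dtype=np.float64).reshape(2, 4)   # [uncond; text] stacked on axis 0
noise_pred_uncond, noise_pred_text = np.split(noise_pred_unet, 2, axis=0)
guidance_scale = 7.5
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred)  # [[30. 31. 32. 33.]]
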
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py
index 6d1b14edfaa32..324d66f3e0187 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py
@@ -13,16 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .pipeline_fastdeploy_stable_diffusion import \
- FastDeployStableDiffusionPipeline
+from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline
-class FastDeployStableDiffusionControlNetPipeline(
- FastDeployStableDiffusionPipeline):
+class FastDeployStableDiffusionControlNetPipeline(FastDeployStableDiffusionPipeline):
def __call__(
- self,
- *args,
- **kwargs, ):
+ self,
+ *args,
+ **kwargs,
+ ):
controlnet_cond = kwargs.pop("controlnet_cond", None)
image = kwargs.pop("image", None)
if controlnet_cond is None:
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py
index 7f92020a9d9dc..b90541cfd23a1 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py
@@ -20,7 +20,9 @@
from paddlenlp.transformers import CLIPImageProcessor
from ppdiffusers.pipelines.fastdeploy_utils import (
- FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel)
+ FastDeployDiffusionPipelineMixin,
+ FastDeployRuntimeModel,
+)
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging
@@ -30,8 +32,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class FastDeployStableDiffusionImageVariationPipeline(
- DiffusionPipeline, FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionImageVariationPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline to generate variations from an input image using Stable Diffusion.
@@ -59,15 +60,16 @@ class FastDeployStableDiffusionImageVariationPipeline(
_optional_components = ["safety_checker"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- image_encoder: FastDeployRuntimeModel,
- unet: FastDeployRuntimeModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ image_encoder: FastDeployRuntimeModel,
+ unet: FastDeployRuntimeModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -93,28 +95,27 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
- def _encode_image(self, image, num_images_per_prompt,
- do_classifier_free_guidance, infer_op_dict):
+ def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict):
if not isinstance(image, paddle.Tensor):
- image = self.feature_extractor(
- images=image, return_tensors="pd").pixel_values
+ image = self.feature_extractor(images=image, return_tensors="pd").pixel_values
image_encoder_inputs = dict(
pixel_values=image,
infer_op=infer_op_dict.get("image_encoder", None),
- output_shape=[image.shape[0], 768], )
+ output_shape=[image.shape[0], 768],
+ )
image_embeddings = self.image_encoder(**image_encoder_inputs)[0]
image_embeddings = image_embeddings.unsqueeze(1)
# duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape
image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1])
- image_embeddings = image_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
negative_prompt_embeds = paddle.zeros_like(image_embeddings)
@@ -122,49 +123,50 @@ def _encode_image(self, image, num_images_per_prompt,
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- image_embeddings = paddle.concat(
- [negative_prompt_embeds, image_embeddings])
+ image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings])
return image_embeddings
def check_inputs(self, image, height, width, callback_steps):
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
- f" {type(image)}")
+ f" {type(image)}"
+ )
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
@paddle.no_grad()
def __call__(
- self,
- image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -242,9 +244,7 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input image
- image_embeddings = self._encode_image(image, num_images_per_prompt,
- do_classifier_free_guidance,
- infer_op_dict)
+ image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict)
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -256,26 +256,23 @@ def __call__(
height,
width,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
unet_inputs = dict(
@@ -283,14 +280,14 @@ def __call__(
timestep=t,
encoder_hidden_states=image_embeddings,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
noise_pred = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
if is_scheduler_support_step_index:
@@ -300,16 +297,14 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -320,7 +315,8 @@ def __call__(
# 8. Post-processing
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
# 9. Run safety checker
image, has_nsfw_concept = self.run_safety_checker(image)
@@ -330,11 +326,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
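
In the image-variation pipeline above, _encode_image duplicates the CLIP image embedding once per requested image and prepends an all-zeros unconditional embedding for classifier-free guidance. A shape-only NumPy sketch (illustrative dimensions, not the real encoder output):

import numpy as np

bs_embed, seq_len, dim = 1, 1, 768
num_images_per_prompt = 2
image_embeddings = np.random.randn(bs_embed, seq_len, dim)

image_embeddings = np.tile(image_embeddings, (1, num_images_per_prompt, 1))
image_embeddings = image_embeddings.reshape(bs_embed * num_images_per_prompt, seq_len, -1)

negative_prompt_embeds = np.zeros_like(image_embeddings)
image_embeddings = np.concatenate([negative_prompt_embeds, image_embeddings])  # [uncond; cond]
print(image_embeddings.shape)  # (4, 1, 768)
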
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py
index c282d47747dec..49f736a9c71c5 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py
@@ -22,15 +22,13 @@
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging
-from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel
from . import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class FastDeployStableDiffusionImg2ImgPipeline(
- DiffusionPipeline, FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionImg2ImgPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline for text-guided image-to-image generation using Stable Diffusion.
@@ -63,16 +61,17 @@ class FastDeployStableDiffusionImg2ImgPipeline(
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -97,36 +96,37 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -199,8 +199,7 @@ def __call__(
(nsfw) content, according to the `safety_checker`.
"""
# 0. Preprocess image
- init_image = self.image_processor.preprocess(
- image, height=height, width=width)
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
height, width = init_image.shape[-2:]
# 1. Check inputs. Raise error if not correct
@@ -212,7 +211,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
# 2. Define call parameters
@@ -238,7 +238,8 @@ def __call__(
height=height,
batch_size=batch_size,
num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
@@ -250,17 +251,16 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# 5. Prepare latent variables
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
latents = self.prepare_latents(
@@ -272,47 +272,42 @@ def __call__(
image=init_image,
timestep=latent_timestep,
is_strength_max=is_strength_max,
- infer_op=infer_op_dict.get("vae_encoder", None), )
+ infer_op=infer_op_dict.get("vae_encoder", None),
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
if do_controlnet:
unet_inputs["controlnet_cond"] = control_image
- unet_inputs[
- "controlnet_conditioning_scale"] = control_conditioning_scale
+ unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
@@ -324,16 +319,14 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -344,7 +337,8 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
image, has_nsfw_concept = self.run_safety_checker(image)
else:
image = latents
@@ -355,11 +349,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
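
The img2img pipeline above starts denoising partway through the schedule according to strength. get_timesteps itself lives in fastdeploy_utils and is not part of this patch, so the NumPy sketch below only mirrors the usual strength-trimming logic and should be read as an assumption, not the actual implementation.

import numpy as np

num_inference_steps, strength = 50, 0.5
timesteps = np.linspace(999, 0, num_inference_steps).round().astype(int)  # toy DDIM-style schedule

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = timesteps[t_start:]                 # keep only the last `strength` fraction of steps
num_inference_steps = num_inference_steps - t_start
latent_timestep = timesteps[:1]                 # timestep at which the clean latents get noised
print(len(timesteps), latent_timestep)          # 25 [489]
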
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py
index 4fdbacaaf890a..2ae694a4f8e2f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py
@@ -23,18 +23,13 @@
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import PIL_INTERPOLATION, logging
-from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel
from . import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-def prepare_mask_and_masked_image(image,
- mask,
- height=None,
- width=None,
- return_image: bool=False):
+def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -71,14 +66,11 @@ def prepare_mask_and_masked_image(image,
if isinstance(image, paddle.Tensor):
if not isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not"
- )
+            raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)}) is not")
# Batch single image
if image.ndim == 3:
- assert (image.shape[0] == 3
- ), "Image outside a batch should be of shape (3, H, W)"
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
image = image.unsqueeze(0)
# Batch and add channel dim for single mask
@@ -95,12 +87,9 @@ def prepare_mask_and_masked_image(image,
else:
mask = mask.unsqueeze(1)
- assert (image.ndim == 4 and
- mask.ndim == 4), "Image and Mask must have 4 dimensions"
- assert (image.shape[-2:] == mask.shape[-2:]
- ), "Image and Mask must have the same spatial dimensions"
- assert (image.shape[0] == mask.shape[0]
- ), "Image and Mask must have the same batch size"
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
# Check image is in [-1, 1]
if image.min() < -1 or image.max() > 1:
@@ -117,8 +106,7 @@ def prepare_mask_and_masked_image(image,
# Image as float32
image = image.cast(dtype=paddle.float32)
elif isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
+        raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)}) is not")
else:
# preprocess image
if isinstance(image, (PIL.Image.Image, np.ndarray)):
@@ -129,13 +117,8 @@ def prepare_mask_and_masked_image(image,
w, h = image[0].size
else:
w, h = width, height
- w, h = (x - x % 8
- for x in (w, h)) # resize to integer multiple of 8
- image = [
- i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"])
- for i in image
- ]
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image]
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
@@ -154,14 +137,9 @@ def prepare_mask_and_masked_image(image,
w, h = mask[0].size
else:
w, h = width, height
- w, h = (x - x % 8
- for x in (w, h)) # resize to integer multiple of 8
- mask = [
- i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask
- ]
- mask = np.concatenate(
- [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ mask = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask]
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
@@ -179,8 +157,7 @@ def prepare_mask_and_masked_image(image,
return mask, masked_image
-class FastDeployStableDiffusionInpaintPipeline(
- DiffusionPipeline, FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionInpaintPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline for text-guided image inpainting using Stable Diffusion.
@@ -213,16 +190,17 @@ class FastDeployStableDiffusionInpaintPipeline(
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -247,38 +225,39 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: int=None,
- width: int=None,
- strength: float=1.0,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- add_predicted_noise: Optional[bool]=False,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: int = None,
+ width: int = None,
+ strength: float = 1.0,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -363,7 +342,8 @@ def __call__(
mask_image,
height,
width,
- return_image=True, )
+ return_image=True,
+ )
height, width = init_image.shape[-2:]
# 1. Check inputs
@@ -375,7 +355,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
# 2. Define call parameters
@@ -401,15 +382,14 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 4. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
@@ -429,7 +409,8 @@ def __call__(
is_strength_max=is_strength_max,
return_noise=True,
return_image_latents=return_image_latents,
- infer_op=infer_op_dict.get("vae_encoder", None), )
+ infer_op=infer_op_dict.get("vae_encoder", None),
+ )
if return_image_latents:
latents, noise, image_latents = latents_outputs
@@ -445,24 +426,23 @@ def __call__(
width,
do_classifier_free_guidance,
return_masked_image_latents=True,
- infer_op=infer_op_dict.get("vae_encoder", None), )
+ infer_op=infer_op_dict.get("vae_encoder", None),
+ )
# 7. Check that sizes of mask, masked image and latents match
if num_channels_unet == 9:
# default case for runwayml/stable-diffusion-inpainting
num_channels_mask = mask.shape[1]
num_channels_masked_image = masked_image_latents.shape[1]
- if (num_channels_latents + num_channels_mask +
- num_channels_masked_image != num_channels_unet):
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet:
raise ValueError(
f"Incorrect configuration settings! Received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
- " `pipeline.unet` or your `mask_image` or `image` input.")
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
elif num_channels_unet != 4:
- raise ValueError(
- f"The unet should have either 4 or 9 input channels, not {num_channels_unet}."
- )
+ raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.")
# do_controlnet
do_controlnet = controlnet_cond is not None and num_channels_unet == 4
if do_controlnet:
@@ -473,59 +453,52 @@ def __call__(
height=height,
batch_size=batch_size,
num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
if do_classifier_free_guidance:
- init_mask = mask[:mask.shape[0] // 2]
+ init_mask = mask[: mask.shape[0] // 2]
else:
init_mask = mask
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
output_shape = latent_model_input.shape
if not is_legacy:
# concat latents, mask, masked_image_latents in the channel dimension
- latent_model_input = paddle.concat(
- [latent_model_input, mask, masked_image_latents],
- axis=1)
+ latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1)
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=output_shape, )
+ output_shape=output_shape,
+ )
if do_controlnet:
unet_inputs["controlnet_cond"] = control_image
- unet_inputs[
- "controlnet_conditioning_scale"] = control_conditioning_scale
+ unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
@@ -537,32 +510,27 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
if is_legacy:
if i < len(timesteps) - 1:
# masking
if add_predicted_noise:
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise_pred_uncond, t)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t)
else:
# https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc
noise_timestep = timesteps[i + 1]
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise, noise_timestep)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep)
else:
init_latents_proper = image_latents
- latents = (1 - init_mask
- ) * init_latents_proper + init_mask * latents
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -573,7 +541,8 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
image, has_nsfw_concept = self.run_safety_checker(image)
else:
image = latents
@@ -584,11 +553,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
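
For reference, the classifier-free guidance arithmetic that the hunks above merely reflow is unchanged: the UNet runs on a doubled batch, the output is split into unconditional and text-conditioned halves, and the two are recombined with the guidance scale. A minimal standalone sketch (the tensor shapes and guidance scale are invented for illustration, not taken from the pipeline):

import paddle

guidance_scale = 7.5
# Stand-in for the UNet output on a CFG batch: first half unconditional, second half text-conditioned.
noise_pred_unet = paddle.randn([2, 4, 64, 64])
noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)  # [1, 4, 64, 64]
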
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py
index 59c3a5bd12dec..7d2c1d82e5651 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py
@@ -23,18 +23,13 @@
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import PIL_INTERPOLATION, logging
-from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel
from . import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-def prepare_mask_and_masked_image(image,
- mask,
- height=None,
- width=None,
- return_image: bool=False):
+def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -71,14 +66,11 @@ def prepare_mask_and_masked_image(image,
if isinstance(image, paddle.Tensor):
if not isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not"
- )
+ raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not")
# Batch single image
if image.ndim == 3:
- assert (image.shape[0] == 3
- ), "Image outside a batch should be of shape (3, H, W)"
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
image = image.unsqueeze(0)
# Batch and add channel dim for single mask
@@ -95,12 +87,9 @@ def prepare_mask_and_masked_image(image,
else:
mask = mask.unsqueeze(1)
- assert (image.ndim == 4 and
- mask.ndim == 4), "Image and Mask must have 4 dimensions"
- assert (image.shape[-2:] == mask.shape[-2:]
- ), "Image and Mask must have the same spatial dimensions"
- assert (image.shape[0] == mask.shape[0]
- ), "Image and Mask must have the same batch size"
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
# Check image is in [-1, 1]
if image.min() < -1 or image.max() > 1:
@@ -117,8 +106,7 @@ def prepare_mask_and_masked_image(image,
# Image as float32
image = image.cast(dtype=paddle.float32)
elif isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
+ raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
else:
# preprocess image
if isinstance(image, (PIL.Image.Image, np.ndarray)):
@@ -129,13 +117,8 @@ def prepare_mask_and_masked_image(image,
w, h = image[0].size
else:
w, h = width, height
- w, h = (x - x % 8
- for x in (w, h)) # resize to integer multiple of 8
- image = [
- i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"])
- for i in image
- ]
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image]
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
@@ -154,14 +137,9 @@ def prepare_mask_and_masked_image(image,
w, h = mask[0].size
else:
w, h = width, height
- w, h = (x - x % 8
- for x in (w, h)) # resize to integer multiple of 8
- mask = [
- i.resize(
- (w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask
- ]
- mask = np.concatenate(
- [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask]
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
@@ -179,8 +157,7 @@ def prepare_mask_and_masked_image(image,
return mask, masked_image
-class FastDeployStableDiffusionInpaintPipelineLegacy(
- DiffusionPipeline, FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionInpaintPipelineLegacy(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
r"""
Pipeline for text-guided image inpainting legacy using Stable Diffusion.
@@ -213,16 +190,17 @@ class FastDeployStableDiffusionInpaintPipelineLegacy(
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae_encoder: FastDeployRuntimeModel,
- vae_decoder: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: CLIPTokenizer,
- unet: FastDeployRuntimeModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: FastDeployRuntimeModel,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=False, ):
+ self,
+ vae_encoder: FastDeployRuntimeModel,
+ vae_decoder: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: FastDeployRuntimeModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: FastDeployRuntimeModel,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = False,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -247,38 +225,39 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.post_init()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: int=None,
- width: int=None,
- strength: float=1.0,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- add_predicted_noise: Optional[bool]=False,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: int = None,
+ width: int = None,
+ strength: float = 1.0,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -363,7 +342,8 @@ def __call__(
mask_image,
height,
width,
- return_image=True, )
+ return_image=True,
+ )
height, width = init_image.shape[-2:]
# 1. Check inputs
@@ -375,7 +355,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- strength, )
+ strength,
+ )
infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -400,7 +381,8 @@ def __call__(
height=height,
batch_size=batch_size,
num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
@@ -412,15 +394,14 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 4. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
@@ -436,7 +417,8 @@ def __call__(
is_strength_max=is_strength_max,
return_noise=True,
return_image_latents=True,
- infer_op=infer_op_dict.get("vae_encoder", None), )
+ infer_op=infer_op_dict.get("vae_encoder", None),
+ )
# 6. Prepare mask latent variables
mask = self.prepare_mask_latents(
@@ -447,52 +429,47 @@ def __call__(
width,
do_classifier_free_guidance,
return_masked_image_latents=False,
- infer_op=infer_op_dict.get("vae_encoder", None), )
+ infer_op=infer_op_dict.get("vae_encoder", None),
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
if do_classifier_free_guidance:
- init_mask = mask[:mask.shape[0] // 2]
+ init_mask = mask[: mask.shape[0] // 2]
else:
init_mask = mask
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
unet_inputs = dict(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
if do_controlnet:
unet_inputs["controlnet_cond"] = control_image
- unet_inputs[
- "controlnet_conditioning_scale"] = control_conditioning_scale
+ unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
@@ -504,32 +481,27 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
if i < len(timesteps) - 1:
# masking
if add_predicted_noise:
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise_pred_uncond, t)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t)
else:
# https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc
noise_timestep = timesteps[i + 1]
- init_latents_proper = self.scheduler.add_noise(
- image_latents, noise, noise_timestep)
+ init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep)
else:
init_latents_proper = image_latents
- latents = (1 - init_mask
- ) * init_latents_proper + init_mask * latents
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -540,7 +512,8 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
image, has_nsfw_concept = self.run_safety_checker(image)
else:
image = latents
@@ -551,11 +524,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
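
The legacy inpaint loop above blends the denoised latents with a re-noised copy of the original image latents after every scheduler step, so only the masked region is actually repainted. A rough sketch of that single masking line, with random stand-in tensors (the shapes and mask are invented for illustration):

import paddle

init_mask = (paddle.rand([1, 1, 64, 64]) > 0.5).cast("float32")  # 1 inside the region to repaint
init_latents_proper = paddle.randn([1, 4, 64, 64])  # original image latents, re-noised to the current timestep
latents = paddle.randn([1, 4, 64, 64])               # current denoising state
# keep the original content outside the mask, keep the freshly denoised sample inside it
latents = (1 - init_mask) * init_latents_proper + init_mask * latents
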
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py
index 7f66d4caec169..d2c9622fd7c8a 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py
@@ -19,16 +19,17 @@
import PIL.Image
from ...utils import logging
-from .pipeline_fastdeploy_cycle_diffusion import \
- FastDeployCycleDiffusionPipeline
-from .pipeline_fastdeploy_stable_diffusion import \
- FastDeployStableDiffusionPipeline
-from .pipeline_fastdeploy_stable_diffusion_img2img import \
- FastDeployStableDiffusionImg2ImgPipeline
-from .pipeline_fastdeploy_stable_diffusion_inpaint import \
- FastDeployStableDiffusionInpaintPipeline
-from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import \
- FastDeployStableDiffusionInpaintPipelineLegacy
+from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline
+from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline
+from .pipeline_fastdeploy_stable_diffusion_img2img import (
+ FastDeployStableDiffusionImg2ImgPipeline,
+)
+from .pipeline_fastdeploy_stable_diffusion_inpaint import (
+ FastDeployStableDiffusionInpaintPipeline,
+)
+from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import (
+ FastDeployStableDiffusionInpaintPipelineLegacy,
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -63,45 +64,39 @@ class FastDeployStableDiffusionMegaPipeline(FastDeployStableDiffusionPipeline):
feature_extractor ([`CLIPFeatureExtractor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
- _optional_components = [
- "vae_encoder", "safety_checker", "feature_extractor"
- ]
+ _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"]
def __call__(self, *args, **kwargs):
return self.text2img(*args, **kwargs)
def text2img(
- self,
- prompt: Union[str, List[str]],
- height: Optional[int]=512,
- width: Optional[int]=512,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
- expected_components = inspect.signature(
- FastDeployStableDiffusionPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ expected_components = inspect.signature(FastDeployStableDiffusionPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = FastDeployStableDiffusionPipeline(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
temp_pipeline._progress_bar_config = self._progress_bar_config
output = temp_pipeline(
prompt=prompt,
@@ -122,42 +117,39 @@ def text2img(
callback_steps=callback_steps,
controlnet_cond=controlnet_cond,
controlnet_conditioning_scale=controlnet_conditioning_scale,
- infer_op_dict=infer_op_dict, )
+ infer_op_dict=infer_op_dict,
+ )
return output
def img2img(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
- expected_components = inspect.signature(
- FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
+ expected_components = inspect.signature(FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = FastDeployStableDiffusionImg2ImgPipeline(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
temp_pipeline._progress_bar_config = self._progress_bar_config
output = temp_pipeline(
prompt=prompt,
@@ -180,48 +172,46 @@ def img2img(
callback_steps=callback_steps,
controlnet_cond=controlnet_cond,
controlnet_conditioning_scale=controlnet_conditioning_scale,
- infer_op_dict=infer_op_dict, )
+ infer_op_dict=infer_op_dict,
+ )
return output
def inpaint_legacy(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- mask_image: Union[paddle.Tensor, PIL.Image.Image],
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ mask_image: Union[paddle.Tensor, PIL.Image.Image],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
assert (
self.unet_num_latent_channels == 4
), f"Detected `unet_num_latent_channels` is {self.unet_num_latent_channels}, Plese use `inpaint` method."
expected_components = inspect.signature(
- FastDeployStableDiffusionInpaintPipelineLegacy.
- __init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ FastDeployStableDiffusionInpaintPipelineLegacy.__init__
+ ).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = FastDeployStableDiffusionInpaintPipelineLegacy(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
temp_pipeline._progress_bar_config = self._progress_bar_config
output = temp_pipeline(
prompt=prompt,
@@ -245,45 +235,42 @@ def inpaint_legacy(
callback_steps=callback_steps,
controlnet_cond=controlnet_cond,
controlnet_conditioning_scale=controlnet_conditioning_scale,
- infer_op_dict=infer_op_dict, )
+ infer_op_dict=infer_op_dict,
+ )
return output
def inpaint(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- mask_image: Union[paddle.Tensor, PIL.Image.Image],
- height=None,
- width=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None,
- controlnet_conditioning_scale: float=1.0,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ mask_image: Union[paddle.Tensor, PIL.Image.Image],
+ height=None,
+ width=None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ infer_op_dict: Dict[str, str] = None,
+ ):
assert self.unet_num_latent_channels in [4, 9]
- expected_components = inspect.signature(
- FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ expected_components = inspect.signature(FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = FastDeployStableDiffusionInpaintPipeline(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
temp_pipeline._progress_bar_config = self._progress_bar_config
output = temp_pipeline(
prompt=prompt,
@@ -307,46 +294,42 @@ def inpaint(
callback_steps=callback_steps,
controlnet_cond=controlnet_cond,
controlnet_conditioning_scale=controlnet_conditioning_scale,
- infer_op_dict=infer_op_dict, )
+ infer_op_dict=infer_op_dict,
+ )
return output
def cycle_diffusion(
- self,
- prompt: Union[str, List[str]],
- source_prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[paddle.Tensor]=None,
- source_guidance_scale: Optional[float]=1,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.1,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- infer_op_dict: Dict[str, str]=None, ):
- expected_components = inspect.signature(
- FastDeployCycleDiffusionPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ self,
+ prompt: Union[str, List[str]],
+ source_prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[paddle.Tensor] = None,
+ source_guidance_scale: Optional[float] = 1,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.1,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ infer_op_dict: Dict[str, str] = None,
+ ):
+ expected_components = inspect.signature(FastDeployCycleDiffusionPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = FastDeployCycleDiffusionPipeline(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
temp_pipeline._progress_bar_config = self._progress_bar_config
output = temp_pipeline(
prompt=prompt,
@@ -371,6 +354,7 @@ def cycle_diffusion(
return_dict=return_dict,
callback=callback,
callback_steps=callback_steps,
- infer_op_dict=infer_op_dict, )
+ infer_op_dict=infer_op_dict,
+ )
return output
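
Each mega-pipeline method above follows the same delegation pattern: read the target pipeline's `__init__` signature, keep only the matching entries from `self.components`, and build a short-lived pipeline that does the actual work. Sketched as a hypothetical helper (`build_temp_pipeline` and `TargetPipeline` are illustrative names, not part of the codebase):

import inspect

def build_temp_pipeline(mega_pipeline, TargetPipeline):
    # e.g. TargetPipeline = FastDeployStableDiffusionImg2ImgPipeline
    expected = inspect.signature(TargetPipeline.__init__).parameters.keys()
    components = {name: comp for name, comp in mega_pipeline.components.items() if name in expected}
    return TargetPipeline(**components, requires_safety_checker=mega_pipeline.config.requires_safety_checker)
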
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py
index 05ff6fa970504..db0660a1cbb90 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py
@@ -21,24 +21,23 @@
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import DDPMScheduler
from ...utils import logging
-from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin,
- FastDeployRuntimeModel)
+from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel
from ..pipeline_utils import ImagePipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class FastDeployStableDiffusionUpscalePipeline(
- DiffusionPipeline, FastDeployDiffusionPipelineMixin):
+class FastDeployStableDiffusionUpscalePipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin):
def __init__(
- self,
- vae: FastDeployRuntimeModel,
- text_encoder: FastDeployRuntimeModel,
- tokenizer: Any,
- unet: FastDeployRuntimeModel,
- low_res_scheduler: DDPMScheduler,
- scheduler: Any,
- max_noise_level: int=350, ):
+ self,
+ vae: FastDeployRuntimeModel,
+ text_encoder: FastDeployRuntimeModel,
+ tokenizer: Any,
+ unet: FastDeployRuntimeModel,
+ low_res_scheduler: DDPMScheduler,
+ scheduler: Any,
+ max_noise_level: int = 350,
+ ):
super().__init__(
vae=vae,
text_encoder=text_encoder,
@@ -49,18 +48,19 @@ def __init__(
safety_checker=None,
feature_extractor=None,
watermarker=None,
- max_noise_level=max_noise_level, )
+ max_noise_level=max_noise_level,
+ )
self.post_init(vae_scaling_factor=0.08333)
def check_inputs(self, prompt, image, noise_level, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
)
@@ -83,39 +83,38 @@ def check_inputs(self, prompt, image, noise_level, callback_steps):
# check noise level
if noise_level > self.config.max_noise_level:
- raise ValueError(
- f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}"
- )
+ raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
def __call__(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]],
- num_inference_steps: int=75,
- guidance_scale: float=9.0,
- noise_level: int=20,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- parse_prompt_type: Optional[str]="lpw",
- max_embeddings_multiples: Optional[int]=3,
- prompt_embeds: Optional[np.ndarray]=None,
- negative_prompt_embeds: Optional[np.ndarray]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- infer_op_dict: Dict[str, str]=None, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]],
+ num_inference_steps: int = 75,
+ guidance_scale: float = 9.0,
+ noise_level: int = 20,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ parse_prompt_type: Optional[str] = "lpw",
+ max_embeddings_multiples: Optional[int] = 3,
+ prompt_embeds: Optional[np.ndarray] = None,
+ negative_prompt_embeds: Optional[np.ndarray] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ infer_op_dict: Dict[str, str] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -204,7 +203,8 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
parse_prompt_type=parse_prompt_type,
max_embeddings_multiples=max_embeddings_multiples,
- infer_op=infer_op_dict.get("text_encoder", None), )
+ infer_op=infer_op_dict.get("text_encoder", None),
+ )
# 4. Preprocess image
image = self.image_processor.preprocess(image)
@@ -215,13 +215,11 @@ def __call__(
# 5. Add noise to image
noise_level = paddle.to_tensor([noise_level], dtype="int64")
- noise = paddle.randn(
- image.shape, generator=generator, dtype=text_embeddings.dtype)
+ noise = paddle.randn(image.shape, generator=generator, dtype=text_embeddings.dtype)
image = self.low_res_scheduler.add_noise(image, noise, noise_level)
batch_multiplier = 2 if do_classifier_free_guidance else 1
- image = paddle.concat([image] * batch_multiplier *
- num_images_per_prompt)
+ image = paddle.concat([image] * batch_multiplier * num_images_per_prompt)
noise_level = paddle.concat([noise_level] * image.shape[0])
# 6. Prepare latent variables
@@ -231,7 +229,8 @@ def __call__(
height,
width,
generator,
- latents, )
+ latents,
+ )
NUM_UNET_INPUT_CHANNELS = self.unet_num_latent_channels
NUM_LATENT_CHANNELS = self.vae_decoder_num_latent_channels
@@ -243,27 +242,24 @@ def __call__(
f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +"
f" `num_channels_image`: {num_channels_image} "
f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of"
- " `pipeline.unet` or your `image` input.")
+ " `pipeline.unet` or your `image` input."
+ )
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
is_scheduler_support_step_index = self.is_scheduler_support_step_index()
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
if is_scheduler_support_step_index:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t, step_index=i)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i)
else:
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
unet_inputs = dict(
sample=paddle.concat(
@@ -272,16 +268,15 @@ def __call__(
timestep=t,
encoder_hidden_states=prompt_embeds,
infer_op=infer_op_dict.get("unet", None),
- output_shape=latent_model_input.shape, )
+ output_shape=latent_model_input.shape,
+ )
# predict the noise residual
noise_pred_unet = self.unet(**unet_inputs)[0]
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(
- 2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
else:
noise_pred = noise_pred_unet
@@ -293,16 +288,14 @@ def __call__(
latents,
step_index=i,
return_pred_original_sample=False,
- **extra_step_kwargs, )
+ **extra_step_kwargs,
+ )
else:
- scheduler_output = self.scheduler.step(
- noise_pred, t, latents, **extra_step_kwargs)
+ scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
latents = scheduler_output.prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -313,16 +306,18 @@ def __call__(
if not output_type == "latent":
image = self._decode_vae_latents(
latents / self.vae_scaling_factor,
- infer_op=infer_op_dict.get("vae_decoder", None), )
+ infer_op=infer_op_dict.get("vae_decoder", None),
+ )
else:
image = latents
do_denormalize = [True] * image.shape[0]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
- return (image, )
+ return (image,)
- return ImagePipelineOutput(images=image, )
+ return ImagePipelineOutput(
+ images=image,
+ )
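
The progress-bar condition reflowed in these denoising loops relies on the warmup bookkeeping set a few lines earlier: a higher-order scheduler produces more internal timesteps than user-visible steps, and the bar should only tick once per visible step. A toy walk-through with made-up numbers:

num_inference_steps = 4
scheduler_order = 2                        # e.g. a second-order scheduler
timesteps = list(range(10))                # pretend the scheduler produced 10 internal timesteps
num_warmup_steps = len(timesteps) - num_inference_steps * scheduler_order  # 2 warmup timesteps

for i, t in enumerate(timesteps):
    is_last = i == len(timesteps) - 1
    past_warmup = (i + 1) > num_warmup_steps and (i + 1) % scheduler_order == 0
    if is_last or past_warmup:
        print(f"progress bar ticks at i={i}")  # ticks 4 times: i = 3, 5, 7, 9
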
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 75f8db28f0c67..b847facb71074 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -18,16 +18,13 @@
import paddle
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
-from ...loaders import (FromCkptMixin, LoraLoaderMixin,
- TextualInversionLoaderMixin)
+from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -48,8 +45,7 @@
"""
-class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
- LoraLoaderMixin, FromCkptMixin):
+class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -87,37 +83,33 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -125,11 +117,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -150,12 +138,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -166,12 +152,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -183,18 +166,20 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -234,29 +219,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -264,8 +251,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -275,21 +261,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
        # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -297,46 +284,42 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -355,53 +338,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
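
A self-contained sketch of the introspection used above: `inspect.signature` is queried so that `eta` and `generator` are only forwarded to schedulers whose `step()` actually accepts them. The two toy scheduler classes are assumptions for illustration.

import inspect

class DDIMLikeScheduler:
    def step(self, model_output, timestep, sample, eta=0.0, generator=None):
        return sample

class PlainScheduler:
    def step(self, model_output, timestep, sample):
        return sample

def extra_step_kwargs_for(scheduler, eta, generator):
    params = set(inspect.signature(scheduler.step).parameters.keys())
    kwargs = {}
    if "eta" in params:
        kwargs["eta"] = eta
    if "generator" in params:
        kwargs["generator"] = generator
    return kwargs

print(extra_step_kwargs_for(DDIMLikeScheduler(), 0.5, "rng"))  # {'eta': 0.5, 'generator': 'rng'}
print(extra_step_kwargs_for(PlainScheduler(), 0.5, "rng"))     # {}
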
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -414,17 +393,19 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -447,25 +428,25 @@ def prepare_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -546,7 +527,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -568,7 +550,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -583,43 +566,38 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
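
A tiny worked example of the guidance update in the loop above: the guided prediction extrapolates from the unconditional prediction toward the text-conditioned one by `guidance_scale`. The numbers are made up.

import numpy as np

guidance_scale = 7.5
noise_pred_uncond = np.array([0.10, -0.20])
noise_pred_text = np.array([0.30, 0.10])

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred)  # [1.6  2.05]
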
@@ -632,8 +610,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
@@ -642,11 +619,9 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py
index 7b6cf35b03da0..0ec8990c31e59 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py
@@ -18,15 +18,12 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
# from ...loaders import TextualInversionLoaderMixin
-from ...models import (AutoencoderKL, MultiAdapter, T2IAdapter,
- UNet2DConditionModel)
+from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (PIL_INTERPOLATION, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -81,8 +78,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
else:
h = int(round(img_size / 8 / coef) * 8)
- images = images.resize(
- (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
+ images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
return images
@@ -95,12 +91,8 @@ def preprocess(image):
if isinstance(image[0], PIL.Image.Image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h))
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image
- ]
- image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...])
- for i in image]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
+ image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -155,17 +147,18 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- adapter_weights: Optional[List[float]]=None,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ adapter_weights: Optional[List[float]] = None,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -185,8 +178,9 @@ def __init__(
adapter=adapter,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
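
A worked example of the `vae_scale_factor` formula above, assuming a typical four-block VAE config: each block after the first halves the spatial resolution, so 512x512 pixels correspond to 64x64 latents.

block_out_channels = [128, 256, 512, 512]  # illustrative config values
vae_scale_factor = 2 ** (len(block_out_channels) - 1)
print(vae_scale_factor)         # 8
print(512 // vae_scale_factor)  # 64
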
def enable_vae_slicing(self):
@@ -206,13 +200,14 @@ def disable_vae_slicing(self):
self.vae.disable_slicing()
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
"""
Encodes the prompt into text encoder hidden states.
@@ -249,32 +244,29 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
)
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
- prompt_embeds = self.text_encoder(
- text_input_ids, attention_mask=attention_mask)
+ prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.astype(self.text_encoder.dtype)
bs_embed, seq_len, _ = prompt_embeds.shape
- prompt_embeds = prompt_embeds.tile(
- repeat_times=[1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance and negative_prompt_embeds is None:
uncond_tokens: List[str]
if negative_prompt is None:
@@ -300,34 +292,28 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ return_tensors="pd",
+ )
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
- negative_prompt_embeds = self.text_encoder(
- uncond_input.input_ids, attention_mask=attention_mask)
+ negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.astype(
- self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- repeat_times=[1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- (batch_size * num_images_per_prompt, seq_len, -1))
- prompt_embeds = paddle.concat(
- x=[negative_prompt_embeds, prompt_embeds])
+ negative_prompt_embeds = negative_prompt_embeds.astype(self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1))
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.astype(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.astype(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -336,37 +322,36 @@ def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clip(min=0, max=1)
- image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype(
- dtype="float32").numpy()
+ image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype(dtype="float32").numpy()
return image
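
A NumPy sketch of the value handling in `decode_latents`: latents are unscaled by the VAE scaling factor, the decoder output in [-1, 1] is mapped to [0, 1], and NCHW is transposed to NHWC for PIL conversion. The decoder call is faked here, and 0.18215 (the usual Stable Diffusion scaling factor) is an assumption about this checkpoint.

import numpy as np

scaling_factor = 0.18215
latents = np.random.randn(1, 4, 64, 64).astype("float32")
latents = latents / scaling_factor            # undo the scaling applied at encode time

decoded = np.tanh(np.random.randn(1, 3, 512, 512).astype("float32"))  # fake vae.decode(...).sample in [-1, 1]
image = np.clip(decoded / 2 + 0.5, 0.0, 1.0)  # map [-1, 1] -> [0, 1]
image = image.transpose(0, 2, 3, 1)           # NCHW -> NHWC
print(image.shape, float(image.min()) >= 0.0, float(image.max()) <= 1.0)  # (1, 512, 512, 3) True True
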
def prepare_extra_step_kwargs(self, generator, eta):
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
- if (callback_steps is None or callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+ if (
+ callback_steps is None
+ or callback_steps is not None
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
)
@@ -378,11 +363,8 @@ def check_inputs(
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
@@ -394,19 +376,21 @@ def check_inputs(
)
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
@@ -438,28 +422,27 @@ def _default_height_width(self, height, width, image):
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image, List[
- PIL.Image.Image]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- adapter_conditioning_scale: Union[float, List[float]]=1.0, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ adapter_conditioning_scale: Union[float, List[float]] = 1.0,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -550,13 +533,13 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
is_multi_adapter = isinstance(self.adapter, MultiAdapter)
if is_multi_adapter:
adapter_input = [preprocess(img) for img in image]
n, c, h, w = adapter_input[0].shape
- adapter_input = paddle.stack(
- x=[x.reshape([n * c, h, w]) for x in adapter_input])
+ adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input])
else:
adapter_input = preprocess(image)
adapter_input = adapter_input.astype(self.adapter.dtype)
@@ -573,7 +556,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
num_channels_latents = self.unet.in_channels
@@ -584,43 +568,35 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
adapter_state = self.adapter(adapter_input)
for k, v in enumerate(adapter_state):
adapter_state[k] = v * adapter_conditioning_scale
if num_images_per_prompt > 1:
for k, v in enumerate(adapter_state):
- adapter_state[k] = v.tile(
- repeat_times=[num_images_per_prompt, 1, 1, 1])
+ adapter_state[k] = v.tile(repeat_times=[num_images_per_prompt, 1, 1, 1])
if do_classifier_free_guidance:
for k, v in enumerate(adapter_state):
adapter_state[k] = paddle.concat(x=[v] * 2, axis=0)
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
- latent_model_input = (paddle.concat(x=[latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- down_block_additional_residuals=[
- state.clone() for state in adapter_state
- ], ).sample
+ down_block_additional_residuals=[state.clone() for state in adapter_state],
+ ).sample
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(
- chunks=2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
- if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -629,14 +605,11 @@ def __call__(
has_nsfw_concept = None
elif output_type == "pil":
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
image = self.numpy_to_pil(image)
else:
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return image, has_nsfw_concept
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py
index 3deff63114cd2..3971ea99471d6 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py
@@ -25,17 +25,20 @@
import PIL
import PIL.Image
from packaging import version
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
-from ...loaders import (FromCkptMixin, LoraLoaderMixin,
- TextualInversionLoaderMixin)
+from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...pipeline_utils import DiffusionPipeline
from ...schedulers import (
- DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler)
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
from ...utils import PIL_INTERPOLATION, deprecate, logging
from ...utils.testing_utils import load_image
from . import StableDiffusionPipelineOutput
@@ -86,7 +89,8 @@ def save_all(images, FORMAT="jpg", OUTDIR="./outputs/"):
[^\\()\[\]:]+|
:
""",
- re.X, )
+ re.X,
+)
def parse_prompt_attention(text):
@@ -175,9 +179,7 @@ def multiply_range(start_position, multiplier):
return res
-def get_prompts_with_weights(pipe: DiffusionPipeline,
- prompt: List[str],
- max_length: int):
+def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int):
r"""
Tokenize a list of prompts and return its tokens with weights of each token.
@@ -212,32 +214,20 @@ def get_prompts_with_weights(pipe: DiffusionPipeline,
tokens.append(text_token)
weights.append(text_weight)
if truncated:
- logger.warning(
- "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples"
- )
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
return tokens, weights
-def pad_tokens_and_weights(tokens,
- weights,
- max_length,
- bos,
- eos,
- pad,
- no_boseos_middle=True,
- chunk_length=77):
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77):
r"""
Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
"""
max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
- weights_length = (max_length if no_boseos_middle else
- max_embeddings_multiples * chunk_length)
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
for i in range(len(tokens)):
- tokens[i] = ([bos] + tokens[i] + [eos] + [pad] *
- (max_length - 2 - len(tokens[i])))
+ tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i]))
if no_boseos_middle:
- weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 -
- len(weights[i]))
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
else:
w = []
if len(weights[i]) == 0:
@@ -245,8 +235,7 @@ def pad_tokens_and_weights(tokens,
else:
for j in range(max_embeddings_multiples):
w.append(1.0) # weight for starting token in this chunk
- w += weights[i][j * (chunk_length - 2):min(
- len(weights[i]), (j + 1) * (chunk_length - 2))]
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
w.append(1.0) # weight for ending token in this chunk
w += [1.0] * (weights_length - len(w))
weights[i] = w[:]
@@ -255,10 +244,11 @@ def pad_tokens_and_weights(tokens,
def get_unweighted_text_embeddings(
- pipe: DiffusionPipeline,
- text_input: paddle.Tensor,
- chunk_length: int,
- no_boseos_middle: Optional[bool]=True, ):
+ pipe: DiffusionPipeline,
+ text_input: paddle.Tensor,
+ chunk_length: int,
+ no_boseos_middle: Optional[bool] = True,
+):
"""
When the length of tokens is a multiple of the capacity of the text encoder,
it should be split into chunks and sent to the text encoder individually.
@@ -268,8 +258,7 @@ def get_unweighted_text_embeddings(
text_embeddings = []
for i in range(max_embeddings_multiples):
# extract the i-th chunk
- text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * (
- chunk_length - 2) + 2].clone()
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
# cover the head and the tail by the starting and the ending tokens
text_input_chunk[:, 0] = text_input[0, 0]
@@ -296,14 +285,15 @@ def get_unweighted_text_embeddings(
def get_weighted_text_embeddings(
- pipe: DiffusionPipeline,
- prompt: Union[str, List[str]],
- uncond_prompt: Optional[Union[str, List[str]]]=None,
- max_embeddings_multiples: Optional[int]=1,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- **kwargs, ):
+ pipe: DiffusionPipeline,
+ prompt: Union[str, List[str]],
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
+ max_embeddings_multiples: Optional[int] = 1,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ **kwargs,
+):
r"""
Prompts can be assigned with local weights using brackets. For example,
prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
@@ -329,24 +319,19 @@ def get_weighted_text_embeddings(
skip_weighting (`bool`, *optional*, defaults to `False`):
Skip the weighting. When the parsing is skipped, it is forced True.
"""
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
if isinstance(prompt, str):
prompt = [prompt]
if not skip_parsing:
- prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt,
- max_length - 2)
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
if uncond_prompt is not None:
if isinstance(uncond_prompt, str):
uncond_prompt = [uncond_prompt]
- uncond_tokens, uncond_weights = get_prompts_with_weights(
- pipe, uncond_prompt, max_length - 2)
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
else:
prompt_tokens = [
- token[1:-1]
- for token in pipe.tokenizer(
- prompt, max_length=max_length, truncation=True).input_ids
+ token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
]
prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
if uncond_prompt is not None:
@@ -354,33 +339,26 @@ def get_weighted_text_embeddings(
uncond_prompt = [uncond_prompt]
uncond_tokens = [
token[1:-1]
- for token in pipe.tokenizer(
- uncond_prompt, max_length=max_length, truncation=True)
- .input_ids
+ for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
]
uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
# round up the longest length of tokens to a multiple of (model_max_length - 2)
max_length = max([len(token) for token in prompt_tokens])
if uncond_prompt is not None:
- max_length = max(max_length,
- max([len(token) for token in uncond_tokens]))
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
max_embeddings_multiples = min(
max_embeddings_multiples,
- (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, )
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+ )
max_embeddings_multiples = max(1, max_embeddings_multiples)
- max_length = (pipe.tokenizer.model_max_length - 2
- ) * max_embeddings_multiples + 2
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
# pad the length of tokens and weights
# support bert tokenizer
- bos = (pipe.tokenizer.bos_token_id
- if pipe.tokenizer.bos_token_id is not None else
- pipe.tokenizer.cls_token_id)
- eos = (pipe.tokenizer.eos_token_id
- if pipe.tokenizer.eos_token_id is not None else
- pipe.tokenizer.sep_token_id)
+ bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id
+ eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id
pad = pipe.tokenizer.pad_token_id
prompt_tokens, prompt_weights = pad_tokens_and_weights(
prompt_tokens,
@@ -390,7 +368,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64)
if uncond_prompt is not None:
uncond_tokens, uncond_weights = pad_tokens_and_weights(
@@ -401,7 +380,8 @@ def get_weighted_text_embeddings(
eos,
pad,
no_boseos_middle=no_boseos_middle,
- chunk_length=pipe.tokenizer.model_max_length, )
+ chunk_length=pipe.tokenizer.model_max_length,
+ )
uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64)
# get the embeddings
@@ -409,30 +389,28 @@ def get_weighted_text_embeddings(
pipe,
prompt_tokens,
pipe.tokenizer.model_max_length,
- no_boseos_middle=no_boseos_middle, )
- prompt_weights = paddle.to_tensor(
- prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype)
+ no_boseos_middle=no_boseos_middle,
+ )
+ prompt_weights = paddle.to_tensor(prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype)
if uncond_prompt is not None:
uncond_embeddings = get_unweighted_text_embeddings(
pipe,
uncond_tokens,
pipe.tokenizer.model_max_length,
- no_boseos_middle=no_boseos_middle, )
- uncond_weights = paddle.to_tensor(
- uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype)
+ no_boseos_middle=no_boseos_middle,
+ )
+ uncond_weights = paddle.to_tensor(uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype)
# assign weights to the prompts and normalize in the sense of mean
# TODO: should we normalize by chunk or as a whole (current implementation)?
if (not skip_parsing) and (not skip_weighting):
previous_mean = text_embeddings.mean(axis=[-2, -1])
text_embeddings *= prompt_weights.unsqueeze(-1)
- text_embeddings *= previous_mean / text_embeddings.mean(
- axis=[-2, -1], keepdim=True)
+ text_embeddings *= previous_mean / text_embeddings.mean(axis=[-2, -1], keepdim=True)
if uncond_prompt is not None:
previous_mean = uncond_embeddings.mean(axis=[-2, -1])
uncond_embeddings *= uncond_weights.unsqueeze(-1)
- uncond_embeddings *= previous_mean / uncond_embeddings.mean(
- axis=[-2, -1], keepdim=True)
+ uncond_embeddings *= previous_mean / uncond_embeddings.mean(axis=[-2, -1], keepdim=True)
if uncond_prompt is not None:
return text_embeddings, uncond_embeddings
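
A deterministic NumPy sketch of the normalization step above: token embeddings are multiplied by their weights and then rescaled so the overall mean of the embedding tensor is unchanged. The toy values are assumptions.

import numpy as np

text_embeddings = np.arange(32, dtype="float32").reshape(1, 4, 8) / 10.0  # (batch, tokens, dim)
prompt_weights = np.array([[1.0, 1.5, 1.5, 1.0]], dtype="float32")

previous_mean = text_embeddings.mean(axis=(-2, -1))
text_embeddings = text_embeddings * prompt_weights[..., None]
text_embeddings = text_embeddings * (previous_mean / text_embeddings.mean(axis=(-2, -1), keepdims=True))

print(round(float(previous_mean[0]), 4), round(float(text_embeddings.mean()), 4))  # 1.55 1.55
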
@@ -453,9 +431,7 @@ def preprocess_mask(mask, scale_factor=8):
mask = mask.convert("L")
w, h = mask.size
w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
- mask = mask.resize(
- (w // scale_factor, h // scale_factor),
- resample=PIL_INTERPOLATION["nearest"])
+ mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
mask = np.array(mask).astype(np.float32) / 255.0
mask = np.tile(mask, (4, 1, 1))
mask = mask[None].transpose(0, 1, 2, 3) # what does this step do?
@@ -464,9 +440,7 @@ def preprocess_mask(mask, scale_factor=8):
return mask
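
A shape walk-through of `preprocess_mask` using Pillow and NumPy, with an arbitrary 520x600 input as an assumption: the mask is snapped to a multiple of 32, downsampled to latent resolution, and repeated across the 4 latent channels.

import numpy as np
from PIL import Image

scale_factor = 8
mask = Image.new("L", (520, 600), 255)  # arbitrary grayscale mask

w, h = mask.size
w, h = (x - x % 32 for x in (w, h))     # 512, 576: round down to multiples of 32
mask = mask.resize((w // scale_factor, h // scale_factor), resample=Image.NEAREST)
mask = np.array(mask).astype(np.float32) / 255.0   # (72, 64), values in [0, 1]
mask = np.tile(mask, (4, 1, 1))                    # repeat over the 4 latent channels
mask = mask[None]                                  # add the batch dimension
print(mask.shape)                                  # (1, 4, 72, 64)
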
-class StableDiffusionPipelineAllinOne(DiffusionPipeline,
- TextualInversionLoaderMixin,
- LoraLoaderMixin, FromCkptMixin):
+class StableDiffusionPipelineAllinOne(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin):
r"""
Pipeline for text-to-image image-to-image inpainting generation using Stable Diffusion.
@@ -497,38 +471,38 @@ class StableDiffusionPipelineAllinOne(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler,
- EulerDiscreteScheduler,
- EulerAncestralDiscreteScheduler,
- DPMSolverMultistepScheduler, ],
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- requires_safety_checker: bool=False, ):
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[
+ DDIMScheduler,
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ EulerDiscreteScheduler,
+ EulerAncestralDiscreteScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: bool = False,
+ ):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -536,11 +510,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -559,12 +529,10 @@ def __init__(
f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -575,12 +543,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -592,7 +557,8 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.__init__additional__()
@@ -602,7 +568,8 @@ def __init__additional__(self):
setattr(
self,
"vae_scale_factor",
- 2**(len(self.vae.config.block_out_channels) - 1), )
+ 2 ** (len(self.vae.config.block_out_channels) - 1),
+ )
def __call__(self, *args, **kwargs):
return self.text2image(*args, **kwargs)
@@ -611,16 +578,17 @@ def text2img(self, *args, **kwargs):
return self.text2image(*args, **kwargs)
def _encode_prompt(
- self,
- prompt,
- negative_prompt,
- max_embeddings_multiples,
- no_boseos_middle,
- skip_parsing,
- skip_weighting,
- do_classifier_free_guidance,
- num_images_per_prompt,
- **kwargs, ):
+ self,
+ prompt,
+ negative_prompt,
+ max_embeddings_multiples,
+ no_boseos_middle,
+ skip_parsing,
+ skip_weighting,
+ do_classifier_free_guidance,
+ num_images_per_prompt,
+ **kwargs,
+ ):
batch_size = len(prompt) if isinstance(prompt, list) else 1
if negative_prompt is None:
@@ -631,41 +599,37 @@ def _encode_prompt(
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
pipe=self,
prompt=prompt,
- uncond_prompt=negative_prompt
- if do_classifier_free_guidance else None,
+ uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
max_embeddings_multiples=max_embeddings_multiples,
no_boseos_middle=no_boseos_middle,
skip_parsing=skip_parsing,
skip_weighting=skip_weighting,
- **kwargs, )
+ **kwargs,
+ )
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1])
- text_embeddings = text_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
seq_len = uncond_embeddings.shape[1]
- uncond_embeddings = uncond_embeddings.tile(
- [1, num_images_per_prompt, 1])
- uncond_embeddings = uncond_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
+ uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1])
+ uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
return text_embeddings
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -684,8 +648,7 @@ def prepare_extra_step_kwargs(self, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
@@ -694,61 +657,47 @@ def prepare_extra_step_kwargs(self, eta):
def check_inputs_text2img(self, prompt, height, width, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
def check_inputs_img2img_inpaint(self, prompt, strength, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [1.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
-
- def prepare_latents_text2img(self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- latents=None):
+ f" {type(callback_steps)}."
+ )
+
+ def prepare_latents_text2img(self, batch_size, num_channels_latents, height, width, dtype, latents=None):
shape = [batch_size, num_channels_latents, height // 8, width // 8]
if latents is None:
latents = paddle.randn(shape, dtype=dtype)
else:
if latents.shape != shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents
- def prepare_latents_img2img(self, image, timestep, num_images_per_prompt,
- dtype):
+ def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, dtype):
image = image.cast(dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample()
@@ -756,8 +705,7 @@ def prepare_latents_img2img(self, image, timestep, num_images_per_prompt,
b, c, h, w = init_latents.shape
init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1])
- init_latents = init_latents.reshape(
- [b * num_images_per_prompt, c, h, w])
+ init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w])
# add noise to latents using the timesteps
noise = paddle.randn(init_latents.shape, dtype=dtype)
@@ -779,8 +727,7 @@ def get_timesteps(self, num_inference_steps, strength):
return timesteps, num_inference_steps - t_start
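
The body of `get_timesteps` is not shown in this hunk, so the following is a hedged sketch of the usual strength-to-timestep bookkeeping in img2img/inpaint pipelines: `strength` decides how many of the scheduled steps are actually run, starting from a partially noised latent.

num_inference_steps = 50
strength = 0.8

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 40
t_start = max(num_inference_steps - init_timestep, 0)                          # 10
timesteps = list(range(num_inference_steps))[t_start:]                         # the last 40 scheduled steps

print(init_timestep, t_start, len(timesteps))  # 40 10 40
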
- def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt,
- dtype):
+ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, dtype):
image = image.cast(dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample()
@@ -788,8 +735,7 @@ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt,
b, c, h, w = init_latents.shape
init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1])
- init_latents = init_latents.reshape(
- [b * num_images_per_prompt, c, h, w])
+ init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w])
init_latents_orig = init_latents
@@ -801,27 +747,28 @@ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt,
@paddle.no_grad()
def text2image(
- self,
- prompt: Union[str, List[str]],
- height: int=512,
- width: int=512,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- seed: Optional[int]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- # new add
- max_embeddings_multiples: Optional[int]=3,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: int = 512,
+ width: int = 512,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ seed: Optional[int] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ # new add
+ max_embeddings_multiples: Optional[int] = 3,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -891,7 +838,8 @@ def text2image(
no_boseos_middle=no_boseos_middle,
skip_parsing=skip_parsing,
skip_weighting=skip_weighting,
- epoch_time=time.time(), )
+ epoch_time=time.time(),
+ )
paddle.seed(seed)
# 1. Check inputs. Raise error if not correct
self.check_inputs_text2img(prompt, height, width, callback_steps)
@@ -912,7 +860,8 @@ def text2image(
skip_parsing,
skip_weighting,
do_classifier_free_guidance,
- num_images_per_prompt, )
+ num_images_per_prompt,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -926,42 +875,33 @@ def text2image(
height,
width,
text_embeddings.dtype,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -970,8 +910,7 @@ def text2image(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- text_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype)
# 10. Convert to PIL
if output_type == "pil":
@@ -980,33 +919,33 @@ def text2image(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@paddle.no_grad()
def img2img(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- strength: float=0.8,
- height=None,
- width=None,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- seed: Optional[int]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- # new add
- max_embeddings_multiples: Optional[int]=1,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ strength: float = 0.8,
+ height=None,
+ width=None,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ seed: Optional[int] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ # new add
+ max_embeddings_multiples: Optional[int] = 1,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -1093,7 +1032,8 @@ def img2img(
no_boseos_middle=no_boseos_middle,
skip_parsing=skip_parsing,
skip_weighting=skip_weighting,
- epoch_time=time.time(), )
+ epoch_time=time.time(),
+ )
paddle.seed(seed)
# 1. Check inputs
@@ -1115,7 +1055,8 @@ def img2img(
skip_parsing,
skip_weighting,
do_classifier_free_guidance,
- num_images_per_prompt, )
+ num_images_per_prompt,
+ )
# 4. Preprocess image
if isinstance(image, PIL.Image.Image):
@@ -1124,50 +1065,36 @@ def img2img(
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
- latents = self.prepare_latents_img2img(image, latent_timestep,
- num_images_per_prompt,
- text_embeddings.dtype)
+ latents = self.prepare_latents_img2img(image, latent_timestep, num_images_per_prompt, text_embeddings.dtype)
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(eta)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -1176,8 +1103,7 @@ def img2img(
image = self.decode_latents(latents)
# 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- text_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype)
# 11. Convert to PIL
if output_type == "pil":
@@ -1186,34 +1112,34 @@ def img2img(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@paddle.no_grad()
def inpaint(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image],
- mask_image: Union[paddle.Tensor, PIL.Image.Image],
- height=None,
- width=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- seed: Optional[int]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- # new add
- max_embeddings_multiples: Optional[int]=1,
- no_boseos_middle: Optional[bool]=False,
- skip_parsing: Optional[bool]=False,
- skip_weighting: Optional[bool]=False,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ mask_image: Union[paddle.Tensor, PIL.Image.Image],
+ height=None,
+ width=None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ seed: Optional[int] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ # new add
+ max_embeddings_multiples: Optional[int] = 1,
+ no_boseos_middle: Optional[bool] = False,
+ skip_parsing: Optional[bool] = False,
+ skip_weighting: Optional[bool] = False,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -1309,7 +1235,8 @@ def inpaint(
no_boseos_middle=no_boseos_middle,
skip_parsing=skip_parsing,
skip_weighting=skip_weighting,
- epoch_time=time.time(), )
+ epoch_time=time.time(),
+ )
paddle.seed(seed)
# 1. Check inputs
@@ -1331,7 +1258,8 @@ def inpaint(
skip_parsing,
skip_weighting,
do_classifier_free_guidance,
- num_images_per_prompt, )
+ num_images_per_prompt,
+ )
if not isinstance(image, paddle.Tensor):
image = image.resize((width, height))
@@ -1343,16 +1271,14 @@ def inpaint(
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
# encode the init image into latents and scale the latents
latents, init_latents_orig, noise = self.prepare_latents_inpaint(
- image, latent_timestep, num_images_per_prompt,
- text_embeddings.dtype)
+ image, latent_timestep, num_images_per_prompt, text_embeddings.dtype
+ )
# 7. Prepare mask latent
mask = mask_image.cast(latents.dtype)
@@ -1362,41 +1288,30 @@ def inpaint(
extra_step_kwargs = self.prepare_extra_step_kwargs(eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=text_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# masking
- init_latents_proper = self.scheduler.add_noise(
- init_latents_orig, noise, t)
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t)
latents = (init_latents_proper * mask) + (latents * (1 - mask))
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -1405,8 +1320,7 @@ def inpaint(
image = self.decode_latents(latents)
# 11. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- text_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype)
# 12. Convert to PIL
if output_type == "pil":
@@ -1415,8 +1329,7 @@ def inpaint(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@staticmethod
def numpy_to_pil(images, **kwargs):
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
index 25099d6d6c726..4e5e08168878d 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
@@ -21,8 +21,7 @@
import paddle
import paddle.nn as nn
from paddle.nn import functional as F
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -100,8 +99,7 @@ def aggregate_attention(self, from_where: List[str]) -> paddle.Tensor:
attention_maps = self.get_average_attention()
for location in from_where:
for item in attention_maps[location]:
- cross_maps = item.reshape(
- [-1, self.attn_res[0], self.attn_res[1], item.shape[-1]])
+ cross_maps = item.reshape([-1, self.attn_res[0], self.attn_res[1], item.shape[-1]])
out.append(cross_maps)
out = paddle.concat(out, axis=0)
out = out.sum(0) / out.shape[0]
@@ -132,21 +130,19 @@ def __init__(self, attnstore, place_in_unet):
self.place_in_unet = place_in_unet
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None, ):
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ ):
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
is_cross = encoder_hidden_states is not None
- encoder_hidden_states = (encoder_hidden_states
- if encoder_hidden_states is not None else
- hidden_states)
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -160,8 +156,7 @@ def __call__(
if not attention_probs.stop_gradient:
# TODO must flatten (0, 1)
# [bs, num_heads, q_len, k_len] -> [bs*num_heads, q_len, k_len]
- self.attnstore(
- attention_probs.flatten(0, 1), is_cross, self.place_in_unet)
+ self.attnstore(attention_probs.flatten(0, 1), is_cross, self.place_in_unet)
hidden_states = paddle.matmul(attention_probs, value)
hidden_states = attn.batch_to_head_dim(hidden_states)
@@ -174,8 +169,7 @@ def __call__(
return hidden_states
-class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion and Attend and Excite.
@@ -205,15 +199,16 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -239,19 +234,21 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -291,29 +288,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -321,8 +320,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -332,21 +330,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -354,47 +353,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -415,54 +410,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- indices,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ indices,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -475,22 +466,19 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
- indices_is_list_ints = isinstance(indices, list) and isinstance(
- indices[0], int)
- indices_is_list_list_ints = (isinstance(indices, list) and
- isinstance(indices[0], list) and
- isinstance(indices[0][0], int))
+ indices_is_list_ints = isinstance(indices, list) and isinstance(indices[0], int)
+ indices_is_list_list_ints = (
+ isinstance(indices, list) and isinstance(indices[0], list) and isinstance(indices[0][0], int)
+ )
if not indices_is_list_ints and not indices_is_list_list_ints:
- raise TypeError(
- "`indices` must be a list of ints or a list of a list of ints")
+ raise TypeError("`indices` must be a list of ints or a list of a list of ints")
- if (indices is None) or (indices is not None and
- not isinstance(indices, List)):
- raise ValueError(
- f"`indices` has to be a list but is {type(indices)}")
+ if (indices is None) or (indices is not None and not isinstance(indices, List)):
+ raise ValueError(f"`indices` has to be a list but is {type(indices)}")
if indices_is_list_ints:
indices_batch_size = 1
@@ -511,19 +499,21 @@ def check_inputs(
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -539,8 +529,9 @@ def prepare_latents(
@staticmethod
def _compute_max_attention_per_index(
- attention_maps: paddle.Tensor,
- indices: List[int], ) -> List[paddle.Tensor]:
+ attention_maps: paddle.Tensor,
+ indices: List[int],
+ ) -> List[paddle.Tensor]:
"""Computes the maximum attention value for each of the tokens we wish to alter."""
attention_for_text = attention_maps[:, :, 1:-1]
attention_for_text *= 100
@@ -554,38 +545,35 @@ def _compute_max_attention_per_index(
for i in indices:
image = attention_for_text[:, :, i]
smoothing = GaussianSmoothing()
- input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1),
- mode="reflect")
+ input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect")
image = smoothing(input).squeeze(0).squeeze(0)
# paddle.max does not support float16
max_indices_list.append(image.max())
return max_indices_list
def _aggregate_and_get_max_attention_per_token(
- self,
- indices: List[int], ):
+ self,
+ indices: List[int],
+ ):
"""Aggregates the attention for each token and computes the max activation value for each token to alter."""
attention_maps = self.attention_store.aggregate_attention(
- from_where=("up", "down", "mid"), )
+ from_where=("up", "down", "mid"),
+ )
max_attention_per_index = self._compute_max_attention_per_index(
attention_maps=attention_maps,
- indices=indices, )
+ indices=indices,
+ )
return max_attention_per_index
@staticmethod
- def _compute_loss(
- max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor:
+ def _compute_loss(max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor:
"""Computes the attend-and-excite loss using the maximum attention value for each token."""
- losses = [
- max(0, 1.0 - curr_max) for curr_max in max_attention_per_index
- ]
+ losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index]
loss = max(losses)
return loss
@staticmethod
- def _update_latent(latents: paddle.Tensor,
- loss: paddle.Tensor,
- step_size: float) -> paddle.Tensor:
+ def _update_latent(latents: paddle.Tensor, loss: paddle.Tensor, step_size: float) -> paddle.Tensor:
"""Update the latent according to the computed loss."""
loss.stop_gradient = False
grad_cond = paddle.autograd.grad(loss, [latents], retain_graph=True)[0]
@@ -593,15 +581,16 @@ def _update_latent(latents: paddle.Tensor,
return latents
def _perform_iterative_refinement_step(
- self,
- latents: paddle.Tensor,
- indices: List[int],
- loss: paddle.Tensor,
- threshold: float,
- text_embeddings: paddle.Tensor,
- step_size: float,
- t: int,
- max_refinement_steps: int=20, ):
+ self,
+ latents: paddle.Tensor,
+ indices: List[int],
+ loss: paddle.Tensor,
+ threshold: float,
+ text_embeddings: paddle.Tensor,
+ step_size: float,
+ t: int,
+ max_refinement_steps: int = 20,
+ ):
"""
Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent code
according to our loss objective until the given threshold is reached for all tokens.
@@ -618,7 +607,8 @@ def _perform_iterative_refinement_step(
# Get max activation value for each subject token
max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
- indices=indices, )
+ indices=indices,
+ )
loss = self._compute_loss(max_attention_per_index)
@@ -628,9 +618,7 @@ def _perform_iterative_refinement_step(
logger.info(f"\t Try {iteration}. loss: {loss}")
if iteration >= max_refinement_steps:
- logger.info(
- f"\t Exceeded max number of iterations ({max_refinement_steps})! "
- )
+ logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ")
break
# Run one more time but don't compute gradients and update the latents.
@@ -643,7 +631,8 @@ def _perform_iterative_refinement_step(
# Get max activation value for each subject token
max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
- indices=indices, )
+ indices=indices,
+ )
loss = self._compute_loss(max_attention_per_index)
logger.info(f"\t Finished with loss of: {loss}")
return loss, latents, max_attention_per_index
@@ -662,8 +651,7 @@ def register_attention_control(self):
continue
cross_att_count += 1
- attn_procs[name] = AttendExciteAttnProcessor(
- attnstore=self.attention_store, place_in_unet=place_in_unet)
+ attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet)
self.unet.set_attn_processor(attn_procs)
self.attention_store.num_att_layers = cross_att_count
@@ -671,42 +659,36 @@ def register_attention_control(self):
def get_indices(self, prompt: str) -> Dict[str, int]:
"""Utility function to list the indices of the tokens you wish to alte"""
ids = self.tokenizer(prompt).input_ids
- indices = {
- i: tok
- for tok, i in zip(
- self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))
- }
+ indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))}
return indices
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]],
- token_indices: Union[List[int], List[List[int]]],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: int=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- max_iter_to_alter: int=25,
- thresholds: dict={0: 0.05,
- 10: 0.5,
- 20: 0.8},
- scale_factor: int=20,
- attn_res: Optional[Tuple[int]]=(16, 16), ):
+ self,
+ prompt: Union[str, List[str]],
+ token_indices: Union[List[int], List[List[int]]],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: int = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ max_iter_to_alter: int = 25,
+ thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8},
+ scale_factor: int = 20,
+ attn_res: Optional[Tuple[int]] = (16, 16),
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -802,7 +784,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -824,7 +807,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -839,7 +823,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -853,8 +838,9 @@ def __call__(
scale_range = np.linspace(1.0, 0.5, len(self.scheduler.timesteps))
step_size = scale_factor * np.sqrt(scale_range)
- text_embeddings = (prompt_embeds[batch_size * num_images_per_prompt:]
- if do_classifier_free_guidance else prompt_embeds)
+ text_embeddings = (
+ prompt_embeds[batch_size * num_images_per_prompt :] if do_classifier_free_guidance else prompt_embeds
+ )
if isinstance(token_indices[0], int):
token_indices = [token_indices]
@@ -865,8 +851,7 @@ def __call__(
indices = indices + [ind] * num_images_per_prompt
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# Attend and excite process
@@ -874,8 +859,7 @@ def __call__(
latents = latents.clone().detach()
latents.stop_gradient = False
updated_latents = []
- for latent, index, text_embedding in zip(latents, indices,
- text_embeddings):
+ for latent, index, text_embedding in zip(latents, indices, text_embeddings):
# Forward pass of denoising with text conditioning
latent = latent.unsqueeze(0)
text_embedding = text_embedding.unsqueeze(0)
@@ -889,28 +873,23 @@ def __call__(
self.unet.clear_gradients()
# Get max activation value for each subject token
- max_attention_per_index = (
- self._aggregate_and_get_max_attention_per_token(
- indices=index, ))
+ max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
+ indices=index,
+ )
- loss = self._compute_loss(
- max_attention_per_index=max_attention_per_index)
+ loss = self._compute_loss(max_attention_per_index=max_attention_per_index)
# If this is an iterative refinement step, verify we have reached the desired threshold for all
- if i in thresholds.keys() and loss > 1.0 - thresholds[
- i]:
- (
- loss,
- latent,
- max_attention_per_index,
- ) = self._perform_iterative_refinement_step(
+ if i in thresholds.keys() and loss > 1.0 - thresholds[i]:
+ (loss, latent, max_attention_per_index,) = self._perform_iterative_refinement_step(
latents=latent,
indices=index,
loss=loss,
threshold=thresholds[i],
text_embeddings=text_embedding,
step_size=step_size[i],
- t=t, )
+ t=t,
+ )
# Perform gradient update
if i < max_iter_to_alter:
@@ -918,41 +897,36 @@ def __call__(
latent = self._update_latent(
latents=latent,
loss=loss,
- step_size=step_size[i], )
- logger.info(
- f"Iteration {i} | Loss: {loss.item():0.4f}")
+ step_size=step_size[i],
+ )
+ logger.info(f"Iteration {i} | Loss: {loss.item():0.4f}")
updated_latents.append(latent)
latents = paddle.concat(updated_latents, axis=0)
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -961,8 +935,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
if output_type == "pil":
@@ -971,8 +944,7 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
class GaussianSmoothing(nn.Layer):
@@ -989,11 +961,12 @@ class GaussianSmoothing(nn.Layer):
# channels=1, kernel_size=kernel_size, sigma=sigma, dim=2
def __init__(
- self,
- channels: int=1,
- kernel_size: int=3,
- sigma: float=0.5,
- dim: int=2, ):
+ self,
+ channels: int = 1,
+ kernel_size: int = 3,
+ sigma: float = 0.5,
+ dim: int = 2,
+ ):
super().__init__()
if isinstance(kernel_size, int):
@@ -1004,21 +977,17 @@ def __init__(
# The gaussian kernel is the product of the
# gaussian function of each dimension.
kernel = 1
- meshgrids = paddle.meshgrid([
- paddle.arange(
- size, dtype=paddle.float32) for size in kernel_size
- ])
+ meshgrids = paddle.meshgrid([paddle.arange(size, dtype=paddle.float32) for size in kernel_size])
for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
mean = (size - 1) / 2
- kernel *= (1 / (std * math.sqrt(2 * math.pi)) *
- paddle.exp(-(((mgrid - mean) / (2 * std))**2)))
+ kernel *= 1 / (std * math.sqrt(2 * math.pi)) * paddle.exp(-(((mgrid - mean) / (2 * std)) ** 2))
# Make sure sum of values in gaussian kernel equals 1.
kernel = kernel / paddle.sum(kernel)
# Reshape to depthwise convolutional weight
kernel = kernel.reshape([1, 1, *kernel.shape])
- kernel = kernel.tile([channels, * [1] * (kernel.ndim - 1)])
+ kernel = kernel.tile([channels, *[1] * (kernel.ndim - 1)])
self.register_buffer("weight", kernel)
self.groups = channels
@@ -1030,9 +999,7 @@ def __init__(
elif dim == 3:
self.conv = F.conv3d
else:
- raise RuntimeError(
- "Only 1, 2 and 3 dimensions are supported. Received {}.".format(
- dim))
+ raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
def forward(self, input):
"""
@@ -1042,5 +1009,4 @@ def forward(self, input):
Returns:
filtered (paddle.Tensor): Filtered output.
"""
- return self.conv(
- input, weight=self.weight.cast(input.dtype), groups=self.groups)
+ return self.conv(input, weight=self.weight.cast(input.dtype), groups=self.groups)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index c46f6b8e52147..448660c4ef7c3 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -21,16 +21,14 @@
import paddle
import paddle.nn as nn
import PIL.Image
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
from ...models.controlnet import ControlNetOutput
from ...models.modeling_utils import ModelMixin
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (PIL_INTERPOLATION, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -88,28 +86,25 @@ class MultiControlNetModel(ModelMixin):
`ControlNetModel` as a list.
"""
- def __init__(
- self,
- controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
+ def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
super().__init__()
self.nets = nn.LayerList(controlnets)
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- controlnet_cond: List[paddle.Tensor],
- conditioning_scale: List[float],
- class_labels: Optional[paddle.Tensor]=None,
- timestep_cond: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- guess_mode: bool=False,
- return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]:
- for i, (
- image, scale, controlnet
- ) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ controlnet_cond: List[paddle.Tensor],
+ conditioning_scale: List[float],
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guess_mode: bool = False,
+ return_dict: bool = True,
+ ) -> Union[ControlNetOutput, Tuple]:
+ for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
down_samples, mid_sample = controlnet(
sample,
timestep,
@@ -121,7 +116,8 @@ def forward(
attention_mask,
cross_attention_kwargs,
guess_mode,
- return_dict, )
+ return_dict,
+ )
# merge samples
if i == 0:
@@ -129,16 +125,14 @@ def forward(
else:
down_block_res_samples = [
samples_prev + samples_curr
- for samples_prev, samples_curr in zip(
- down_block_res_samples, down_samples)
+ for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
]
mid_block_res_sample += mid_sample
return down_block_res_samples, mid_block_res_sample
-class StableDiffusionControlNetPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
@@ -174,17 +168,22 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[
- ControlNetModel], MultiControlNetModel, ],
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[
+ ControlNetModel,
+ List[ControlNetModel],
+ Tuple[ControlNetModel],
+ MultiControlNetModel,
+ ],
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -214,8 +213,9 @@ def __init__(
controlnet=controlnet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_vae_slicing(self):
@@ -250,13 +250,14 @@ def disable_vae_tiling(self):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
Args:
@@ -295,32 +296,36 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- config = (self.text_encoder.config
- if isinstance(self.text_encoder.config, dict) else
- self.text_encoder.config.to_dict())
- if (config.get("use_attention_mask", None) is not None and
- config["use_attention_mask"]):
+ config = (
+ self.text_encoder.config
+ if isinstance(self.text_encoder.config, dict)
+ else self.text_encoder.config.to_dict()
+ )
+ if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -328,8 +333,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -339,21 +343,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -361,50 +366,48 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- config = (self.text_encoder.config
- if isinstance(self.text_encoder.config, dict) else
- self.text_encoder.config.to_dict())
- if (config.get("use_attention_mask", None) is not None and
- config["use_attention_mask"]):
+ config = (
+ self.text_encoder.config
+ if isinstance(self.text_encoder.config, dict)
+ else self.text_encoder.config.to_dict()
+ )
+ if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- dtype=self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -425,55 +428,51 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None,
- controlnet_conditioning_scale=1.0, ):
+ self,
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -486,7 +485,8 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# `prompt` needs more sophisticated handling when there are multiple
# conditionings.
@@ -502,15 +502,12 @@ def check_inputs(
self.check_image(image, prompt, prompt_embeds)
elif isinstance(self.controlnet, MultiControlNetModel):
if not isinstance(image, list):
- raise TypeError(
- "For multiple controlnets: `image` must be type `list`")
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
# When `image` is a nested list:
# (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
elif any(isinstance(i, list) for i in image):
- raise ValueError(
- "A single batch of multiple conditionings are supported at the moment."
- )
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
elif len(image) != len(self.controlnet.nets):
raise ValueError(
"For multiple controlnets: `image` must have the same length as the number of controlnets."
@@ -524,35 +521,28 @@ def check_inputs(
# Check `controlnet_conditioning_scale`
if isinstance(self.controlnet, ControlNetModel):
if not isinstance(controlnet_conditioning_scale, float):
- raise TypeError(
- "For single controlnet: `controlnet_conditioning_scale` must be type `float`."
- )
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
elif isinstance(self.controlnet, MultiControlNetModel):
if isinstance(controlnet_conditioning_scale, list):
- if any(
- isinstance(i, list)
- for i in controlnet_conditioning_scale):
- raise ValueError(
- "A single batch of multiple conditionings are supported at the moment."
- )
- elif isinstance(controlnet_conditioning_scale, list) and len(
- controlnet_conditioning_scale) != len(self.controlnet.nets):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
raise ValueError(
"For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
- " the same length as the number of controlnets")
+ " the same length as the number of controlnets"
+ )
else:
assert False
def check_image(self, image, prompt, prompt_embeds):
image_is_pil = isinstance(image, PIL.Image.Image)
image_is_tensor = isinstance(image, paddle.Tensor)
- image_is_pil_list = isinstance(image, list) and isinstance(
- image[0], PIL.Image.Image)
- image_is_tensor_list = isinstance(image, list) and isinstance(
- image[0], paddle.Tensor)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor)
- if (not image_is_pil and not image_is_tensor and
- not image_is_pil_list and not image_is_tensor_list):
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
raise TypeError(
"image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors"
)
@@ -579,15 +569,16 @@ def check_image(self, image, prompt, prompt_embeds):
)
def prepare_image(
- self,
- image,
- width,
- height,
- batch_size,
- num_images_per_prompt,
- dtype,
- do_classifier_free_guidance=False,
- guess_mode=False, ):
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
if not isinstance(image, paddle.Tensor):
if isinstance(image, PIL.Image.Image):
image = [image]
@@ -596,8 +587,7 @@ def prepare_image(
images = []
for image_ in image:
image_ = image_.convert("RGB")
- image_ = image_.resize(
- (width, height), resample=PIL_INTERPOLATION["lanczos"])
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
image_ = np.array(image_)
image_ = image_[None, :]
images.append(image_)
@@ -627,14 +617,15 @@ def prepare_image(
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -678,48 +669,47 @@ def _default_height_width(self, height, width, image):
# override DiffusionPipeline
def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- safe_serialization: bool=False,
- variant: Optional[str]=None,
- to_diffusers: bool=None, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ to_diffusers: bool = None,
+ ):
if isinstance(self.controlnet, ControlNetModel):
super().save_pretrained(
save_directory,
safe_serialization=safe_serialization,
variant=variant,
- to_diffusers=to_diffusers, )
- else:
- raise NotImplementedError(
- "Currently, the `save_pretrained()` is not implemented for Multi-ControlNet."
+ to_diffusers=to_diffusers,
)
+ else:
+ raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.")
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor],
- List[PIL.Image.Image]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- controlnet_conditioning_scale: Union[float, List[float]]=1.0,
- guess_mode: bool=False, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor], List[PIL.Image.Image]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+ guess_mode: bool = False,
+ ):
r"""
Function invoked when calling the pipeline for generation.
Args:
@@ -813,7 +803,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
- controlnet_conditioning_scale, )
+ controlnet_conditioning_scale,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -828,10 +819,8 @@ def __call__(
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
- if isinstance(self.controlnet, MultiControlNetModel) and isinstance(
- controlnet_conditioning_scale, float):
- controlnet_conditioning_scale = [controlnet_conditioning_scale
- ] * len(self.controlnet.nets)
+ if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
@@ -840,7 +829,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare image
if isinstance(self.controlnet, ControlNetModel):
@@ -852,7 +842,8 @@ def __call__(
num_images_per_prompt=num_images_per_prompt,
dtype=self.controlnet.dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
elif isinstance(self.controlnet, MultiControlNetModel):
images = []
@@ -865,7 +856,8 @@ def __call__(
num_images_per_prompt=num_images_per_prompt,
dtype=self.controlnet.dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode, )
+ guess_mode=guess_mode,
+ )
images.append(image_)
@@ -886,21 +878,19 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# controlnet(s) inference
if guess_mode and do_classifier_free_guidance:
@@ -919,20 +909,17 @@ def __call__(
controlnet_cond=image,
conditioning_scale=controlnet_conditioning_scale,
guess_mode=guess_mode,
- return_dict=False, )
+ return_dict=False,
+ )
if guess_mode and do_classifier_free_guidance:
# Inferred ControlNet only for the conditional batch.
# To apply the output of ControlNet to both the unconditional and conditional batches,
# add 0 to the unconditional batch to keep it unchanged.
- down_block_res_samples = [
- paddle.concat([paddle.zeros_like(d), d])
- for d in down_block_res_samples
- ]
- mid_block_res_sample = paddle.concat([
- paddle.zeros_like(mid_block_res_sample),
- mid_block_res_sample
- ])
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = paddle.concat(
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
+ )
# predict the noise residual
noise_pred = self.unet(
@@ -941,22 +928,19 @@ def __call__(
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
down_block_additional_residuals=down_block_res_samples,
- mid_block_additional_residual=mid_block_res_sample, ).sample
+ mid_block_additional_residual=mid_block_res_sample,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -969,8 +953,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
@@ -979,11 +962,9 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
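The reformatted denoising loop above leaves the guidance math untouched. As a minimal sketch of the two operations it wraps (NumPy stand-ins for the pipeline's paddle tensors; `apply_cfg` and `pad_residual_for_uncond` are illustrative names, not pipeline APIs):

import numpy as np

# Classifier-free guidance: move from the unconditional prediction toward the
# text-conditional prediction by `guidance_scale`.
def apply_cfg(noise_pred_uncond, noise_pred_text, guidance_scale=7.5):
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# Guess mode: the ControlNet only saw the conditional half of the batch, so a
# zero residual is prepended for the unconditional half before the UNet call.
def pad_residual_for_uncond(residual):
    return np.concatenate([np.zeros_like(residual), residual], axis=0)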
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index 4a517f2085671..9bbe0ba73588b 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -20,8 +20,12 @@
import paddle
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPTextModel, CLIPTokenizer,
- DPTForDepthEstimation, DPTImageProcessor)
+from paddlenlp.transformers import (
+ CLIPTextModel,
+ CLIPTokenizer,
+ DPTForDepthEstimation,
+ DPTImageProcessor,
+)
from ...configuration_utils import FrozenDict
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
@@ -44,11 +48,7 @@ def preprocess(image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -59,8 +59,7 @@ def preprocess(image):
return image
-class StableDiffusionDepth2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for text-guided image to image generation using Stable Diffusion.
@@ -90,22 +89,21 @@ class StableDiffusionDepth2ImgPipeline(
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- depth_estimator: DPTForDepthEstimation,
- feature_extractor: DPTImageProcessor, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ depth_estimator: DPTForDepthEstimation,
+ feature_extractor: DPTImageProcessor,
+ ):
super().__init__()
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -116,12 +114,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -133,18 +128,20 @@ def __init__(
unet=unet,
scheduler=scheduler,
depth_estimator=depth_estimator,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -184,29 +181,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -214,8 +213,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -225,21 +223,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -247,47 +246,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -308,52 +303,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- strength,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -366,27 +358,21 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
@@ -403,8 +389,7 @@ def prepare_latents(self,
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
@@ -412,8 +397,7 @@ def prepare_latents(self,
init_latents = self.vae.config.scaling_factor * init_latents
- if (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] == 0):
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
@@ -425,12 +409,11 @@ def prepare_latents(self,
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_image_per_prompt = batch_size // init_latents.shape[0]
- init_latents = paddle.concat(
- [init_latents] * additional_image_per_prompt, axis=0)
- elif (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] != 0):
+ init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0)
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
@@ -446,8 +429,7 @@ def prepare_latents(self,
return latents
- def prepare_depth_map(self, image, depth_map, batch_size,
- do_classifier_free_guidance, dtype):
+ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype):
if isinstance(image, PIL.Image.Image):
image = [image]
else:
@@ -459,27 +441,24 @@ def prepare_depth_map(self, image, depth_map, batch_size,
height, width = image[0].shape[-2:]
if depth_map is None:
- pixel_values = self.feature_extractor(
- images=image, return_tensors="pd").pixel_values
+ pixel_values = self.feature_extractor(images=image, return_tensors="pd").pixel_values
# The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
# TODO DPTModel `expand_as` does not support float16
with paddle.amp.auto_cast(True, level="O2"):
- depth_map = self.depth_estimator(
- pixel_values).predicted_depth.cast("float32")
+ depth_map = self.depth_estimator(pixel_values).predicted_depth.cast("float32")
else:
depth_map = depth_map.cast("float32")
depth_map = paddle.nn.functional.interpolate(
depth_map.unsqueeze(1),
- size=(height // self.vae_scale_factor,
- width // self.vae_scale_factor),
+ size=(height // self.vae_scale_factor, width // self.vae_scale_factor),
mode="bicubic",
- align_corners=False, )
+ align_corners=False,
+ )
# amin / amax do not support float16
depth_min = paddle.amin(depth_map, axis=[1, 2, 3], keepdim=True)
depth_max = paddle.amax(depth_map, axis=[1, 2, 3], keepdim=True)
- depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min
- ) - 1.0
+ depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
# maybe cast to float16
depth_map = depth_map.cast(dtype)
@@ -488,30 +467,29 @@ def prepare_depth_map(self, image, depth_map, batch_size,
repeat_by = batch_size // depth_map.shape[0]
depth_map = depth_map.tile([repeat_by, 1, 1, 1])
- depth_map = (paddle.concat([depth_map] * 2)
- if do_classifier_free_guidance else depth_map)
+ depth_map = paddle.concat([depth_map] * 2) if do_classifier_free_guidance else depth_map
return depth_map
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- depth_map: Optional[paddle.Tensor]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ depth_map: Optional[paddle.Tensor] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -603,7 +581,8 @@ def __call__(
callback_steps,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
if image is None:
raise ValueError("`image` input cannot be undefined.")
@@ -627,7 +606,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare depth mask
depth_mask = self.prepare_depth_map(
@@ -635,17 +615,16 @@ def __call__(
depth_map,
batch_size * num_images_per_prompt,
do_classifier_free_guidance,
- prompt_embeds.dtype, )
+ prompt_embeds.dtype,
+ )
# 5. Preprocess image
image = preprocess(image)
# 6. Set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 7. Prepare latent variables
latents = self.prepare_latents(
@@ -654,44 +633,35 @@ def __call__(
batch_size,
num_images_per_prompt,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- latent_model_input = paddle.concat(
- [latent_model_input, depth_mask], axis=1)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ latent_model_input = paddle.concat([latent_model_input, depth_mask], axis=1)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
latents = latents.cast(prompt_embeds.dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -704,6 +674,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
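For readers following the `prepare_depth_map` hunks above, the per-sample rescaling can be sketched in isolation. This is a NumPy stand-in with an illustrative function name; the actual pipeline uses `paddle.amin`/`paddle.amax` and bicubic interpolation to the latent resolution first.

import numpy as np

# Rescale each depth map in the batch to [-1, 1], mirroring the
# `2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0` step above.
def normalize_depth(depth_map):
    depth_min = depth_map.min(axis=(1, 2, 3), keepdims=True)
    depth_max = depth_map.max(axis=(1, 2, 3), keepdims=True)
    return 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0

depth = np.random.rand(2, 1, 64, 64).astype("float32")
mask = normalize_depth(depth)
assert mask.min() >= -1.0 and mask.max() <= 1.0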
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 87ea9a04eb5f6..48556ee9e0bfb 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -19,8 +19,7 @@
import paddle
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor,
- CLIPVisionModelWithProjection)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ...configuration_utils import FrozenDict
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -62,14 +61,15 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
_optional_components = ["safety_checker"]
def __init__(
- self,
- vae: AutoencoderKL,
- image_encoder: CLIPVisionModelWithProjection,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ image_encoder: CLIPVisionModelWithProjection,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -88,12 +88,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -104,12 +102,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -120,17 +115,16 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def _encode_image(self, image, num_images_per_prompt,
- do_classifier_free_guidance):
+ def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance):
dtype = self.image_encoder.dtype
if not isinstance(image, paddle.Tensor):
- image = self.feature_extractor(
- images=image, return_tensors="pd").pixel_values
+ image = self.feature_extractor(images=image, return_tensors="pd").pixel_values
image = image.cast(dtype)
image_embeddings = self.image_encoder(image).image_embeds
@@ -139,8 +133,7 @@ def _encode_image(self, image, num_images_per_prompt,
# duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape
image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1])
- image_embeddings = image_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
negative_prompt_embeds = paddle.zeros_like(image_embeddings)
@@ -148,19 +141,17 @@ def _encode_image(self, image, num_images_per_prompt,
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- image_embeddings = paddle.concat(
- [negative_prompt_embeds, image_embeddings])
+ image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings])
return image_embeddings
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -181,54 +172,56 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(self, image, height, width, callback_steps):
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
- f" {type(image)}")
+ f" {type(image)}"
+ )
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -244,21 +237,21 @@ def prepare_latents(
@paddle.no_grad()
def __call__(
- self,
- image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -332,8 +325,7 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input image
- image_embeddings = self._encode_image(image, num_images_per_prompt,
- do_classifier_free_guidance)
+ image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance)
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -348,42 +340,33 @@ def __call__(
width,
image_embeddings.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=image_embeddings).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -392,8 +375,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, image_embeddings.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype)
# 10. Convert to PIL
if output_type == "pil":
@@ -402,5 +384,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
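The image-variation pipeline above builds its guidance batch without a text encoder: the unconditional branch is simply a zero tensor shaped like the CLIP image embedding. A minimal sketch of that step (NumPy stand-in; `build_guidance_batch` is an illustrative name, not a pipeline method):

import numpy as np

# Zero embeddings act as the "negative" branch; concatenating them in front of
# the image embeddings yields the two-pass batch used for guidance.
def build_guidance_batch(image_embeddings, do_classifier_free_guidance=True):
    if not do_classifier_free_guidance:
        return image_embeddings
    negative = np.zeros_like(image_embeddings)
    return np.concatenate([negative, image_embeddings], axis=0)

emb = np.random.randn(1, 1, 768).astype("float32")
print(build_guidance_batch(emb).shape)  # (2, 1, 768)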
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index b26c0e76369b2..d8bee685bc963 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -20,17 +20,20 @@
import paddle
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
-from ...loaders import (FromCkptMixin, LoraLoaderMixin,
- TextualInversionLoaderMixin)
+from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (PIL_INTERPOLATION, deprecate, logging, randn_tensor,
- replace_example_docstring)
+from ...utils import (
+ PIL_INTERPOLATION,
+ deprecate,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -74,11 +77,7 @@ def preprocess(image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -89,9 +88,7 @@ def preprocess(image):
return image
-class StableDiffusionImg2ImgPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin,
- LoraLoaderMixin, FromCkptMixin):
+class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin):
r"""
Pipeline for text-guided image to image generation using Stable Diffusion.
@@ -130,37 +127,33 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline,
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -168,11 +161,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -193,12 +182,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -209,12 +196,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -226,22 +210,24 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(
- vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.register_to_config(
- requires_safety_checker=requires_safety_checker, )
+ requires_safety_checker=requires_safety_checker,
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -281,36 +267,37 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -320,21 +307,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -342,36 +330,33 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -380,17 +365,14 @@ def run_safety_checker(self, image, dtype):
has_nsfw_concept = None
else:
if paddle.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(
- image, output_type="pil")
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
- feature_extractor_input = self.image_processor.numpy_to_pil(
- image)
- safety_checker_input = self.feature_extractor(
- feature_extractor_input, return_tensors="pd")
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
images=image,
- clip_input=paddle.cast(safety_checker_input.pixel_values,
- dtype), )
+ clip_input=paddle.cast(safety_checker_input.pixel_values, dtype),
+ )
return image, has_nsfw_concept
def decode_latents(self, latents):
@@ -406,51 +388,48 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- strength,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -463,29 +442,21 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
- def prepare_latents(self,
- image,
- timestep,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
if not isinstance(image, (paddle.Tensor, list)):
- raise ValueError(
- f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}"
- )
+ raise ValueError(f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}")
image = image.cast(dtype)
@@ -498,8 +469,7 @@ def prepare_latents(self,
if isinstance(generator, list):
init_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
init_latents = paddle.concat(init_latents, axis=0)
else:
@@ -507,8 +477,7 @@ def prepare_latents(self,
init_latents = self.vae.config.scaling_factor * init_latents
- if (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] == 0):
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
@@ -520,12 +489,11 @@ def prepare_latents(self,
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_image_per_prompt = batch_size // init_latents.shape[0]
- init_latents = paddle.concat(
- [init_latents] * additional_image_per_prompt, axis=0)
- elif (batch_size > init_latents.shape[0] and
- batch_size % init_latents.shape[0] != 0):
+ init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0)
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
@@ -544,24 +512,24 @@ def prepare_latents(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -637,7 +605,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -658,17 +627,16 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Preprocess image
image = self.image_processor.preprocess(image)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
latents = self.prepare_latents(
@@ -677,51 +645,45 @@ def __call__(
batch_size,
num_images_per_prompt,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if not output_type == "latent":
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
else:
image = latents
has_nsfw_concept = None
@@ -731,11 +693,9 @@ def __call__(
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
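Note on the `get_timesteps` helper reformatted above: `strength` controls how much of the denoising schedule an image-to-image run actually executes, by skipping the first `(1 - strength)` fraction of steps. A minimal sketch of that arithmetic, with the scheduler replaced by a plain list of timesteps (the 981-to-1 schedule and the 50/0.7 settings below are illustrative assumptions, not values from the diff):

```python
# Sketch only: mirrors the logic of get_timesteps(num_inference_steps, strength) above,
# with the scheduler stubbed out so the truncation can be checked in isolation.
def get_truncated_timesteps(all_timesteps, num_inference_steps, strength, order=1):
    # keep only the last `strength` fraction of the schedule
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return all_timesteps[t_start * order:], num_inference_steps - t_start

# e.g. 50 steps at strength=0.7: the first 15 steps are skipped, 35 remain
schedule = list(range(981, -1, -20))  # hypothetical 50-entry DDIM-style schedule
timesteps, remaining = get_truncated_timesteps(schedule, 50, 0.7)
assert len(timesteps) == 35 and remaining == 35
```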
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index fb09dc473b674..f1e0347160085 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -21,8 +21,7 @@
import paddle.nn.functional as F
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -65,14 +64,11 @@ def prepare_mask_and_masked_image(image, mask):
"""
if isinstance(image, paddle.Tensor):
if not isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not"
- )
+ raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not")
# Batch single image
if image.ndim == 3:
- assert (image.shape[0] == 3
- ), "Image outside a batch should be of shape (3, H, W)"
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
image = image.unsqueeze(0)
# Batch and add channel dim for single mask
@@ -89,12 +85,9 @@ def prepare_mask_and_masked_image(image, mask):
else:
mask = mask.unsqueeze(1)
- assert (image.ndim == 4 and
- mask.ndim == 4), "Image and Mask must have 4 dimensions"
- assert (image.shape[-2:] == mask.shape[-2:]
- ), "Image and Mask must have the same spatial dimensions"
- assert (image.shape[0] == mask.shape[0]
- ), "Image and Mask must have the same batch size"
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
# Check image is in [-1, 1]
if image.min() < -1 or image.max() > 1:
@@ -110,8 +103,7 @@ def prepare_mask_and_masked_image(image, mask):
# Image as float32
image = image.cast(paddle.float32)
elif isinstance(mask, paddle.Tensor):
- raise TypeError(
- f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
+ raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not")
else:
# preprocess image
if isinstance(image, (PIL.Image.Image, np.ndarray)):
@@ -131,8 +123,7 @@ def prepare_mask_and_masked_image(image, mask):
mask = [mask]
if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
- mask = np.concatenate(
- [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
@@ -176,49 +167,47 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "skip_prk_steps") and
- scheduler.config.skip_prk_steps is False):
+ if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration"
" `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
" sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
" incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
" Hub, it would be very nice if you could open a Pull request for the"
- " `scheduler/scheduler_config.json` file")
+ " `scheduler/scheduler_config.json` file"
+ )
deprecate(
"skip_prk_steps not set",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
new_config = dict(scheduler.config)
new_config["skip_prk_steps"] = True
scheduler._internal_dict = FrozenDict(new_config)
@@ -239,12 +228,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -255,12 +242,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -272,19 +256,21 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -320,29 +306,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -350,8 +338,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -361,14 +348,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -378,47 +367,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -430,15 +415,13 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@@ -454,39 +437,37 @@ def decode_latents(self, latents):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -499,18 +480,20 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -531,22 +514,20 @@ def prepare_latents(
return latents
def prepare_mask_latents(
- self,
- mask,
- masked_image,
- batch_size,
- height,
- width,
- dtype,
- generator,
- do_classifier_free_guidance, ):
+ self,
+ mask,
+ masked_image,
+ batch_size,
+ height,
+ width,
+ dtype,
+ generator,
+ do_classifier_free_guidance,
+ ):
# resize the mask to latents shape as we concatenate the mask to the latents
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
# and half precision
- mask = F.interpolate(
- mask,
- size=(height // self.vae_scale_factor,
- width // self.vae_scale_factor))
+ mask = F.interpolate(mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor))
mask = mask.cast(dtype)
masked_image = masked_image.cast(dtype)
@@ -554,13 +535,12 @@ def prepare_mask_latents(
# encode the mask image into latents space so we can concatenate it to the latents
if isinstance(generator, list):
masked_image_latents = [
- self.vae.encode(masked_image[i:i + 1]).latent_dist.sample(
- generator=generator[i]) for i in range(batch_size)
+ self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
]
masked_image_latents = paddle.concat(masked_image_latents, axis=0)
else:
- masked_image_latents = self.vae.encode(
- masked_image).latent_dist.sample(generator=generator)
+ masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
@@ -579,14 +559,12 @@ def prepare_mask_latents(
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
- masked_image_latents = masked_image_latents.tile(
- [batch_size // masked_image_latents.shape[0], 1, 1, 1])
+ masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1])
- mask = paddle.concat([mask] *
- 2) if do_classifier_free_guidance else mask
- masked_image_latents = (paddle.concat([masked_image_latents] * 2)
- if do_classifier_free_guidance else
- masked_image_latents)
+ mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask
+ masked_image_latents = (
+ paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
        # aligning device to prevent device errors when concatenating it with the latent model input
masked_image_latents = masked_image_latents.cast(dtype)
@@ -594,26 +572,26 @@ def prepare_mask_latents(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -726,7 +704,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
if image is None:
raise ValueError("`image` input cannot be undefined.")
@@ -754,7 +733,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Preprocess mask and image
mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
@@ -772,7 +752,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Prepare mask latent variables
mask, masked_image_latents = self.prepare_mask_latents(
@@ -783,60 +764,51 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- do_classifier_free_guidance, )
+ do_classifier_free_guidance,
+ )
# 8. Check that sizes of mask, masked image and latents match
num_channels_mask = mask.shape[1]
num_channels_masked_image = masked_image_latents.shape[1]
- if (num_channels_latents + num_channels_mask + num_channels_masked_image
- != self.unet.config.in_channels):
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
raise ValueError(
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
- " `pipeline.unet` or your `mask_image` or `image` input.")
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
# 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 10. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
# concat latents, mask, masked_image_latents in the channel dimension
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- latent_model_input = paddle.concat(
- [latent_model_input, mask, masked_image_latents], axis=1)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
            # must cast dtype, paddle.concat has a bug....
latents = latents.cast(prompt_embeds.dtype)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -845,8 +817,7 @@ def __call__(
image = self.decode_latents(latents)
# 12. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 13. Convert to PIL
if output_type == "pil":
@@ -855,5 +826,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
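Note on the channel check in the inpaint `__call__` above: the UNet used for inpainting expects `in_channels` to equal the sum of the latent channels (4), the downsampled mask (1) and the masked-image latents (4), and all three tensors are additionally doubled along the batch axis for classifier-free guidance. A minimal sketch with random tensors (the batch size and 64x64 latent resolution are assumptions for illustration only):

```python
import paddle

batch, h, w = 1, 64, 64  # hypothetical latent-space resolution
latents = paddle.randn([batch, 4, h, w])
mask = paddle.randn([batch, 1, h, w])  # already resized to latent resolution
masked_image_latents = paddle.randn([batch, 4, h, w])

do_classifier_free_guidance = True
if do_classifier_free_guidance:
    # duplicate everything so unconditional and conditional branches share one forward pass
    latents_in = paddle.concat([latents] * 2)
    mask_in = paddle.concat([mask] * 2)
    masked_in = paddle.concat([masked_image_latents] * 2)
else:
    latents_in, mask_in, masked_in = latents, mask, masked_image_latents

# concatenate along the channel dimension, as in the denoising loop above
latent_model_input = paddle.concat([latents_in, mask_in, masked_in], axis=1)
assert latent_model_input.shape[1] == 4 + 1 + 4  # must match unet.config.in_channels (9)
```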
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 67150c534019e..e321d55a86336 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -21,12 +21,10 @@
import paddle.nn.functional as F
import PIL
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
-from ...loaders import (FromCkptMixin, LoraLoaderMixin,
- TextualInversionLoaderMixin)
+from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor
@@ -54,7 +52,8 @@ def preprocess_mask(mask, batch_size, scale_factor=8):
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
mask = mask.resize(
(w // scale_factor, h // scale_factor),
- resample=PIL_INTERPOLATION["nearest"], )
+ resample=PIL_INTERPOLATION["nearest"],
+ )
mask = np.array(mask).astype(np.float32) / 255.0
mask = np.tile(mask, (4, 1, 1))
mask = np.vstack([mask[None]] * batch_size)
@@ -70,7 +69,8 @@ def preprocess_mask(mask, batch_size, scale_factor=8):
elif mask.shape[1] not in valid_mask_channel_sizes:
raise ValueError(
f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension,"
- f" but received mask of shape {tuple(mask.shape)}")
+ f" but received mask of shape {tuple(mask.shape)}"
+ )
# (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape
mask = mask.mean(1, keepdim=True)
h, w = mask.shape[-2:]
@@ -79,9 +79,9 @@ def preprocess_mask(mask, batch_size, scale_factor=8):
return mask
-class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline,
- TextualInversionLoaderMixin,
- LoraLoaderMixin, FromCkptMixin):
+class StableDiffusionInpaintPipelineLegacy(
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin
+):
r"""
Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
@@ -119,37 +119,33 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline,
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -157,11 +153,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -182,12 +174,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -198,12 +188,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -215,19 +202,21 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -267,29 +256,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -297,8 +288,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -308,21 +298,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
        # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -330,47 +321,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -391,52 +378,49 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- strength,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if strength < 0 or strength > 1:
- raise ValueError(
- f"The value of strength should in [0.0, 1.0] but is {strength}")
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -449,59 +433,56 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength):
# get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
- def prepare_latents(self, image, timestep, num_images_per_prompt, dtype,
- generator):
+ def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, generator):
image = image.cast(dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = self.vae.config.scaling_factor * init_latents
# Expand init_latents for batch_size and num_images_per_prompt
- init_latents = paddle.concat(
- [init_latents] * num_images_per_prompt, axis=0)
+ init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0)
init_latents_orig = init_latents
# add noise to latents using the timesteps
- noise = randn_tensor(
- init_latents.shape, generator=generator, dtype=dtype)
+ noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype)
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
latents = init_latents
return latents, init_latents_orig, noise
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- add_predicted_noise: Optional[bool]=False,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -580,7 +561,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -602,21 +584,19 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Preprocess image and mask
if not isinstance(image, paddle.Tensor):
image = preprocess_image(image, batch_size)
- mask_image = preprocess_mask(mask_image, batch_size,
- self.vae_scale_factor)
+ mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps,
- strength)
- latent_timestep = timesteps[:1].tile(
- [batch_size * num_images_per_prompt])
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
# 6. Prepare latent variables
# encode the init image into latents and scale the latents
@@ -625,7 +605,8 @@ def __call__(
latent_timestep,
num_images_per_prompt,
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
# 7. Prepare mask latent
mask = mask_image.cast(latents.dtype)
@@ -635,50 +616,39 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
if i < len(timesteps) - 1:
# masking
if add_predicted_noise:
- init_latents_proper = self.scheduler.add_noise(
- init_latents_orig, noise_pred_uncond, t)
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise_pred_uncond, t)
else:
# https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc
noise_timestep = timesteps[i + 1]
- init_latents_proper = self.scheduler.add_noise(
- init_latents_orig, noise, noise_timestep)
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, noise_timestep)
else:
init_latents_proper = init_latents_orig
latents = (init_latents_proper * mask) + (latents * (1 - mask))
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
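The hunk above ends with the standard two-way classifier-free guidance step: the latents are duplicated, the UNet returns an unconditional and a text-conditioned prediction in one forward pass, and the two are recombined with `guidance_scale`. The NumPy sketch below illustrates only that recombination; `classifier_free_guidance` is a hypothetical helper, not part of the pipeline, and the batch ordering (unconditional first) mirrors the `noise_pred.chunk(2)` call above.

import numpy as np

def classifier_free_guidance(noise_pred, guidance_scale=7.5):
    # noise_pred stacks [unconditional, text-conditioned] predictions along the
    # batch axis, matching paddle.concat([latents] * 2) in the loop above.
    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2, axis=0)
    # guidance_scale == 1 recovers the plain text-conditioned prediction (no extra
    # guidance); larger values push further away from the unconditional one.
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

pred = np.random.randn(2, 4, 64, 64).astype("float32")
print(classifier_free_guidance(pred).shape)  # (1, 4, 64, 64)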
@@ -687,8 +657,7 @@ def __call__(
image = self.decode_latents(latents)
# 11. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 12. Convert to PIL
if output_type == "pil":
@@ -697,5 +666,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
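That closes out the legacy inpainting pipeline. Its distinctive step is the per-iteration masked blend: the original image latents are re-noised to the current timestep (or with the model's own predicted noise when `add_predicted_noise` is set) and copied back over the preserved region, so only the masked area is free to change. Below is a minimal NumPy sketch of that blend under a simple DDPM-style closed form for `add_noise`; `masked_blend` and `alpha_bar_t` are illustrative assumptions, the pipeline itself delegates the re-noising to `self.scheduler.add_noise`, and which mask value means "keep" depends on how `preprocess_mask` encodes the mask.

import numpy as np

def masked_blend(denoised_latents, init_latents_orig, noise, mask, alpha_bar_t):
    # Assumed DDPM-style closed form: x_t = sqrt(a_bar) * x_0 + sqrt(1 - a_bar) * eps.
    init_latents_proper = (
        np.sqrt(alpha_bar_t) * init_latents_orig + np.sqrt(1.0 - alpha_bar_t) * noise
    )
    # mask == 1 keeps the (re-noised) original latents, mask == 0 keeps the denoised latents.
    return init_latents_proper * mask + denoised_latents * (1.0 - mask)

latents = np.random.randn(1, 4, 64, 64).astype("float32")
orig = np.random.randn(1, 4, 64, 64).astype("float32")
noise = np.random.randn(1, 4, 64, 64).astype("float32")
mask = np.zeros((1, 1, 64, 64), dtype="float32")
mask[..., :32, :] = 1.0  # keep the top half of the latent grid unchanged
print(masked_blend(latents, orig, noise, mask, alpha_bar_t=0.5).shape)  # (1, 4, 64, 64)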
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index f39e50878b44e..02b3128d40d82 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -18,8 +18,7 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -43,11 +42,7 @@ def preprocess(image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -58,8 +53,7 @@ def preprocess(image):
return image
-class StableDiffusionInstructPix2PixPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion.
@@ -95,15 +89,16 @@ class StableDiffusionInstructPix2PixPipeline(
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -129,30 +124,31 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- num_inference_steps: int=100,
- guidance_scale: float=7.5,
- image_guidance_scale: float=1.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ num_inference_steps: int = 100,
+ guidance_scale: float = 7.5,
+ image_guidance_scale: float = 1.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -252,7 +248,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
if image is None:
raise ValueError("`image` input cannot be undefined.")
@@ -268,8 +265,7 @@ def __call__(
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
- do_classifier_free_guidance = (guidance_scale > 1.0 and
- image_guidance_scale >= 1.0)
+ do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0
# check if scheduler is in sigmas space
scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
@@ -280,7 +276,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 3. Preprocess image
image = preprocess(image)
@@ -297,7 +294,8 @@ def __call__(
num_images_per_prompt,
prompt_embeds.dtype,
do_classifier_free_guidance,
- generator, )
+ generator,
+ )
# 6. Prepare latent variables
num_channels_latents = self.vae.config.latent_channels
@@ -308,7 +306,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Check that shapes of latents and image match the UNet channels
num_channels_image = image_latents.shape[1]
@@ -318,45 +317,40 @@ def __call__(
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_image`: {num_channels_image} "
f" = {num_channels_latents+num_channels_image}. Please verify the config of"
- " `pipeline.unet` or your `image` input.")
+ " `pipeline.unet` or your `image` input."
+ )
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# Expand the latents if we are doing classifier free guidance.
# The latents are expanded 3 times because for pix2pix the guidance\
# is applied for both the text and the input image.
- latent_model_input = (paddle.concat([latents] * 3) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 3) if do_classifier_free_guidance else latents
# concat latents, image_latents in the channel dimension
- scaled_latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
scaled_latent_model_input = paddle.concat(
[
scaled_latent_model_input,
image_latents.cast(scaled_latent_model_input.dtype),
],
- axis=1, )
+ axis=1,
+ )
# predict the noise residual
- noise_pred = self.unet(
- scaled_latent_model_input,
- t,
- encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# Hack:
# For karras style schedulers the model does classifier free guidance using the
# predicted_original_sample instead of the noise_pred. So we need to compute the
# predicted_original_sample here if we are using a karras style scheduler.
if scheduler_is_in_sigma_space:
- step_index = (
- self.scheduler.timesteps == t).nonzero().item()
+ step_index = (self.scheduler.timesteps == t).nonzero().item()
sigma = self.scheduler.sigmas[step_index]
noise_pred = latent_model_input - sigma * noise_pred
@@ -365,11 +359,13 @@ def __call__(
(
noise_pred_text,
noise_pred_image,
- noise_pred_uncond, ) = noise_pred.chunk(3)
- noise_pred = (noise_pred_uncond + guidance_scale *
- (noise_pred_text - noise_pred_image
- ) + image_guidance_scale *
- (noise_pred_image - noise_pred_uncond))
+ noise_pred_uncond,
+ ) = noise_pred.chunk(3)
+ noise_pred = (
+ noise_pred_uncond
+ + guidance_scale * (noise_pred_text - noise_pred_image)
+ + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
+ )
# Hack:
# For karras style schedulers the model does classifier free guidance using the
@@ -381,13 +377,10 @@ def __call__(
noise_pred = (noise_pred - latents) / (-sigma)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
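For sigma-space ("Karras style") schedulers, the loop above first rewrites the UNet's noise prediction as a predicted original sample (`latent_model_input - sigma * noise_pred`), applies the guidance in that space, and then maps the result back with `(noise_pred - latents) / (-sigma)`. The toy NumPy sketch below shows that round trip under the simplifying assumption that the model input is the noisy sample itself (the real loop uses the expanded, scaled batch for the first step and the unexpanded latents for the second); both helper names are illustrative.

import numpy as np

def to_x0(noisy_sample, eps_pred, sigma):
    # x0 = x_t - sigma * eps, mirroring `latent_model_input - sigma * noise_pred`.
    return noisy_sample - sigma * eps_pred

def back_to_eps(x0_pred, noisy_sample, sigma):
    # Inverse relation after guidance, written as in the loop: (x0 - x_t) / (-sigma).
    return (x0_pred - noisy_sample) / (-sigma)

x_t = np.random.randn(1, 4, 64, 64).astype("float32")
eps = np.random.randn(1, 4, 64, 64).astype("float32")
sigma = 2.0
x0 = to_x0(x_t, eps, sigma)
print(np.allclose(back_to_eps(x0, x_t, sigma), eps))  # True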
@@ -396,8 +389,7 @@ def __call__(
image = self.decode_latents(latents)
# 11. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 12. Convert to PIL
if output_type == "pil":
@@ -406,17 +398,17 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -456,29 +448,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -486,8 +480,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -497,21 +490,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -519,49 +513,44 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
# pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
- prompt_embeds = paddle.concat([
- prompt_embeds, negative_prompt_embeds, negative_prompt_embeds
- ])
+ prompt_embeds = paddle.concat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -573,15 +562,13 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@@ -596,32 +583,32 @@ def decode_latents(self, latents):
return image
def check_inputs(
- self,
- prompt,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -634,23 +621,26 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -665,13 +655,14 @@ def prepare_latents(
return latents
def prepare_image_latents(
- self,
- image,
- batch_size,
- num_images_per_prompt,
- dtype,
- do_classifier_free_guidance,
- generator=None, ):
+ self,
+ image,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ do_classifier_free_guidance,
+ generator=None,
+ ):
if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
@@ -687,16 +678,12 @@ def prepare_image_latents(
)
if isinstance(generator, list):
- image_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.mode()
- for i in range(batch_size)
- ]
+ image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)]
image_latents = paddle.concat(image_latents, axis=0)
else:
image_latents = self.vae.encode(image).latent_dist.mode()
- if (batch_size > image_latents.shape[0] and
- batch_size % image_latents.shape[0] == 0):
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
# expand image_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
@@ -708,12 +695,11 @@ def prepare_image_latents(
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_image_per_prompt = batch_size // image_latents.shape[0]
- image_latents = paddle.concat(
- [image_latents] * additional_image_per_prompt, axis=0)
- elif (batch_size > image_latents.shape[0] and
- batch_size % image_latents.shape[0] != 0):
+ image_latents = paddle.concat([image_latents] * additional_image_per_prompt, axis=0)
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
)
@@ -722,7 +708,6 @@ def prepare_image_latents(
if do_classifier_free_guidance:
uncond_image_latents = paddle.zeros_like(image_latents)
- image_latents = paddle.concat(
- [image_latents, image_latents, uncond_image_latents], axis=0)
+ image_latents = paddle.concat([image_latents, image_latents, uncond_image_latents], axis=0)
return image_latents
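That ends the InstructPix2Pix pipeline. Its denoising loop runs the UNet on three stacked copies of the latents, so a single forward pass yields a text-and-image-conditioned, an image-only, and a fully unconditional prediction, which are then recombined with separate `guidance_scale` and `image_guidance_scale` weights. A small NumPy sketch of just that recombination follows; `combine_pix2pix_guidance` and the use of `np.split` are illustrative, while the weights and the batch ordering mirror the code above.

import numpy as np

def combine_pix2pix_guidance(noise_pred, guidance_scale=7.5, image_guidance_scale=1.5):
    # noise_pred has shape (3 * b, c, h, w), in the order used by the pipeline:
    # [text + image, image only, fully unconditional].
    noise_pred_text, noise_pred_image, noise_pred_uncond = np.split(noise_pred, 3, axis=0)
    return (
        noise_pred_uncond
        + guidance_scale * (noise_pred_text - noise_pred_image)
        + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
    )

pred = np.random.randn(3, 4, 64, 64).astype("float32")
print(combine_pix2pix_guidance(pred).shape)  # (1, 4, 64, 64)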
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
index f4fdd86cdbfb6..9151849ce7309 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
@@ -75,12 +75,13 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: EulerDiscreteScheduler, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: EulerDiscreteScheduler,
+ ):
super().__init__()
self.register_modules(
@@ -88,10 +89,10 @@ def __init__(
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
- def _encode_prompt(self, prompt, do_classifier_free_guidance,
- negative_prompt):
+ def _encode_prompt(self, prompt, do_classifier_free_guidance, negative_prompt):
r"""
Encodes the prompt into text encoder hidden states.
@@ -112,23 +113,25 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance,
max_length=self.tokenizer.model_max_length,
truncation=True,
return_length=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids, untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
text_encoder_out = self.text_encoder(
text_input_ids,
- output_hidden_states=True, )
+ output_hidden_states=True,
+ )
text_embeddings = text_encoder_out.hidden_states[-1]
text_pooler_out = text_encoder_out.pooler_output
@@ -140,14 +143,16 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance,
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -158,11 +163,13 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance,
max_length=max_length,
truncation=True,
return_length=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_encoder_out = self.text_encoder(
uncond_input.input_ids,
- output_hidden_states=True, )
+ output_hidden_states=True,
+ )
uncond_embeddings = uncond_encoder_out.hidden_states[-1]
uncond_pooler_out = uncond_encoder_out.pooler_output
@@ -170,10 +177,8 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance,
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- text_embeddings = paddle.concat(
- [uncond_embeddings, text_embeddings])
- text_pooler_out = paddle.concat(
- [uncond_pooler_out, text_pooler_out])
+ text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
+ text_pooler_out = paddle.concat([uncond_pooler_out, text_pooler_out])
return text_embeddings, text_pooler_out
@@ -188,13 +193,13 @@ def decode_latents(self, latents):
def check_inputs(self, prompt, image, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
)
@@ -216,30 +221,30 @@ def check_inputs(self, prompt, image, callback_steps):
)
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (batch_size, num_channels_latents, height, width)
if latents is None:
latents = randn_tensor(shape, generator=generator, dtype=dtype)
else:
if latents.shape != list(shape):
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
@@ -247,19 +252,19 @@ def prepare_latents(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]],
- num_inference_steps: int=75,
- guidance_scale: float=9.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]],
+ num_inference_steps: int = 75,
+ guidance_scale: float = 9.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -362,16 +367,14 @@ def __call__(
prompt = [""] * batch_size
# 3. Encode input prompt
- text_embeddings, text_pooler_out = self._encode_prompt(
- prompt, do_classifier_free_guidance, negative_prompt)
+ text_embeddings, text_pooler_out = self._encode_prompt(prompt, do_classifier_free_guidance, negative_prompt)
# 4. Preprocess image
image = preprocess(image)
image = image.cast(text_embeddings.dtype)
if image.shape[1] == 3:
# encode image if not in latent-space yet
- image = (self.vae.encode(image).latent_dist.sample() *
- self.vae.config.scaling_factor)
+ image = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -386,27 +389,23 @@ def __call__(
# "the This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default."
noise_level = paddle.to_tensor([0.0], dtype=paddle.float32)
noise_level = paddle.concat([noise_level] * image.shape[0])
- inv_noise_level = (noise_level**2 + 1)**(-0.5)
+ inv_noise_level = (noise_level**2 + 1) ** (-0.5)
# TODO F.interpolate donot support float16
- image_cond = (F.interpolate(
- image.cast("float32"), scale_factor=2,
- mode="nearest") * inv_noise_level[:, None, None, None])
+ image_cond = (
+ F.interpolate(image.cast("float32"), scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None]
+ )
image_cond = image_cond.cast(text_embeddings.dtype)
noise_level_embed = paddle.concat(
[
- paddle.ones(
- [text_pooler_out.shape[0], 64],
- dtype=text_pooler_out.dtype),
- paddle.zeros(
- [text_pooler_out.shape[0], 64],
- dtype=text_pooler_out.dtype),
+ paddle.ones([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype),
+ paddle.zeros([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype),
],
- axis=1, )
+ axis=1,
+ )
- timestep_condition = paddle.concat(
- [noise_level_embed, text_pooler_out], axis=1)
+ timestep_condition = paddle.concat([noise_level_embed, text_pooler_out], axis=1)
# 6. Prepare latent variables
height, width = image.shape[2:]
@@ -418,7 +417,8 @@ def __call__(
width * 2,
text_embeddings.dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Check that sizes of image and latents match
num_channels_image = image.shape[1]
@@ -428,7 +428,8 @@ def __call__(
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_image`: {num_channels_image} "
f" = {num_channels_latents+num_channels_image}. Please verify the config of"
- " `pipeline.unet` or your `image` input.")
+ " `pipeline.unet` or your `image` input."
+ )
# 9. Denoising loop
num_warmup_steps = 0
@@ -437,48 +438,39 @@ def __call__(
for i, t in enumerate(timesteps):
sigma = self.scheduler.sigmas[i]
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- scaled_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)
scaled_model_input = paddle.concat(
- [
- scaled_model_input,
- image_cond.cast(scaled_model_input.dtype)
- ],
- axis=1, )
+ [scaled_model_input, image_cond.cast(scaled_model_input.dtype)],
+ axis=1,
+ )
# preconditioning parameter based on Karras et al. (2022) (table 1)
timestep = paddle.log(sigma) * 0.25
noise_pred = self.unet(
scaled_model_input,
timestep,
encoder_hidden_states=text_embeddings,
- timestep_cond=timestep_condition, ).sample
+ timestep_cond=timestep_condition,
+ ).sample
# in original repo, the output contains a variance channel that's not used
noise_pred = noise_pred[:, :-1]
# apply preconditioning, based on table 1 in Karras et al. (2022)
inv_sigma = 1 / (sigma**2 + 1)
- noise_pred = (
- inv_sigma * latent_model_input +
- self.scheduler.scale_model_input(sigma, t) * noise_pred)
+ noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t,
- latents).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -491,6 +483,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
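The latent upscaler wraps its UNet in the preconditioning of Karras et al. (2022, Table 1): the timestep is fed as log(sigma) / 4, and the raw output is mixed back with the input as c_skip * x + c_out * F(x). The sketch below spells out those coefficients under the common assumption sigma_data = 1, in which case c_skip equals the `inv_sigma = 1 / (sigma**2 + 1)` seen above; note the pipeline obtains its c_out factor via `scheduler.scale_model_input(sigma, t)`, so this is an interpretation of the step, not the literal code path.

import numpy as np

def precondition(model_output, model_input, sigma, sigma_data=1.0):
    # Karras et al. (2022), Table 1, with sigma_data = 1 matching inv_sigma above.
    c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)             # weight on the (noisy) input
    c_out = sigma * sigma_data / np.sqrt(sigma**2 + sigma_data**2)  # weight on the network output
    return c_skip * model_input + c_out * model_output

x = np.random.randn(1, 4, 128, 128).astype("float32")
raw = np.random.randn(1, 4, 128, 128).astype("float32")
print(precondition(raw, x, sigma=2.5).shape)  # (1, 4, 128, 128)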
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py
index 13e7d28b153ee..93a2487ee267a 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py
@@ -21,8 +21,9 @@
from ...utils import logging
from .pipeline_stable_diffusion import StableDiffusionPipeline
from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
-from .pipeline_stable_diffusion_inpaint_legacy import \
- StableDiffusionInpaintPipelineLegacy
+from .pipeline_stable_diffusion_inpaint_legacy import (
+ StableDiffusionInpaintPipelineLegacy,
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -61,36 +62,31 @@ def __call__(self, *args, **kwargs):
return self.text2img(*args, **kwargs)
def text2img(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
- expected_components = inspect.signature(
- StableDiffusionPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ expected_components = inspect.signature(StableDiffusionPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = StableDiffusionPipeline(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
output = temp_pipeline(
prompt=prompt,
height=height,
@@ -108,38 +104,34 @@ def text2img(
return_dict=return_dict,
callback=callback,
callback_steps=callback_steps,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
return output
def img2img(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
- expected_components = inspect.signature(
- StableDiffusionImg2ImgPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ expected_components = inspect.signature(StableDiffusionImg2ImgPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = StableDiffusionImg2ImgPipeline(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
output = temp_pipeline(
prompt=prompt,
image=image,
@@ -156,41 +148,37 @@ def img2img(
return_dict=return_dict,
callback=callback,
callback_steps=callback_steps,
- **kwargs, )
+ **kwargs,
+ )
return output
def inpaint_legacy(
- self,
- prompt: Union[str, List[str]],
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- mask_image: Union[paddle.Tensor, PIL.Image.Image]=None,
- strength: float=0.8,
- num_inference_steps: Optional[int]=50,
- guidance_scale: Optional[float]=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- add_predicted_noise: Optional[bool]=False,
- eta: Optional[float]=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
- expected_components = inspect.signature(
- StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
+ eta: Optional[float] = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ expected_components = inspect.signature(StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = StableDiffusionInpaintPipelineLegacy(
- **components,
- requires_safety_checker=self.config.requires_safety_checker)
+ **components, requires_safety_checker=self.config.requires_safety_checker
+ )
output = temp_pipeline(
prompt=prompt,
image=image,
@@ -209,6 +197,7 @@ def inpaint_legacy(
return_dict=return_dict,
callback=callback,
callback_steps=callback_steps,
- **kwargs, )
+ **kwargs,
+ )
return output
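The mega pipeline keeps one set of loaded components and re-packages them on demand: `text2img`, `img2img`, and `inpaint_legacy` each inspect the target pipeline's `__init__` signature, keep only the matching entries from `self.components`, and build a short-lived pipeline to do the actual work. The toy sketch below demonstrates that dispatch pattern with a hypothetical `FakeImg2Img` class standing in for the real Stable Diffusion pipelines.

import inspect

class FakeImg2Img:
    # Stand-in for StableDiffusionImg2ImgPipeline: accepts only three components.
    def __init__(self, vae, unet, scheduler):
        self.parts = dict(vae=vae, unet=unet, scheduler=scheduler)

components = {"vae": "VAE", "unet": "UNET", "scheduler": "SCHED", "safety_checker": None}
expected = inspect.signature(FakeImg2Img.__init__).parameters.keys()
filtered = {name: comp for name, comp in components.items() if name in expected}
temp = FakeImg2Img(**filtered)  # safety_checker is silently dropped
print(sorted(temp.parts))  # ['scheduler', 'unet', 'vae']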
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
index ce7f96b22cc24..3ad5c35785e9a 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
@@ -16,8 +16,7 @@
from typing import Any, Callable, Dict, List, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -48,8 +47,7 @@
"""
-class StableDiffusionModelEditingPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models".
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
@@ -80,22 +78,22 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: SchedulerMixin,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True,
- with_to_k: bool=True,
- with_augs: list=AUGS_CONST, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: SchedulerMixin,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ with_to_k: bool = True,
+ with_augs: list = AUGS_CONST,
+ ):
super().__init__()
if isinstance(scheduler, PNDMScheduler):
- logger.error(
- "PNDMScheduler for this pipeline is currently not supported.")
+ logger.error("PNDMScheduler for this pipeline is currently not supported.")
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -120,8 +118,9 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.with_to_k = with_to_k
@@ -147,18 +146,12 @@ def append_ca(net_):
append_ca(net[1])
# get projection matrices
- self.ca_clip_layers = [
- l for l in ca_layers if l.to_v.in_features == 768
- ]
+ self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768]
self.projection_matrices = [l.to_v for l in self.ca_clip_layers]
self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers]
if self.with_to_k:
- self.projection_matrices = self.projection_matrices + [
- l.to_k for l in self.ca_clip_layers
- ]
- self.og_matrices = self.og_matrices + [
- copy.deepcopy(l.to_k) for l in self.ca_clip_layers
- ]
+ self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers]
+ self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers]
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
def enable_vae_slicing(self):
@@ -179,13 +172,14 @@ def disable_vae_slicing(self):
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
Args:
@@ -224,29 +218,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
@@ -254,8 +250,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -265,21 +260,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -287,47 +283,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- dtype=self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -348,54 +340,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -408,23 +396,26 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -440,11 +431,12 @@ def prepare_latents(
@paddle.no_grad()
def edit_model(
- self,
- source_prompt: str,
- destination_prompt: str,
- lamb: float=0.1,
- restart_params: bool=True, ):
+ self,
+ source_prompt: str,
+ destination_prompt: str,
+ lamb: float = 0.1,
+ restart_params: bool = True,
+ ):
r"""
Apply model editing via closed-form solution (see Eq. 5 in the TIME paper https://arxiv.org/abs/2303.08084)
Args:
@@ -467,20 +459,17 @@ def edit_model(
l.to_v = copy.deepcopy(self.og_matrices[idx_])
self.projection_matrices[idx_] = l.to_v
if self.with_to_k:
- l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers +
- idx_])
+ l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_])
self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k
# set up sentences
old_texts = [source_prompt]
new_texts = [destination_prompt]
# add augmentations
- base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][
- 1:]
+ base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:]
for aug in self.with_augs:
old_texts.append(aug + base)
- base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][
- 1:]
+ base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:]
for aug in self.with_augs:
new_texts.append(aug + base)
@@ -492,7 +481,8 @@ def edit_model(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_embeddings = self.text_encoder(text_input.input_ids)[0]
old_emb, new_emb = text_embeddings
old_embs.append(old_emb)
@@ -504,12 +494,12 @@ def edit_model(
tokens_a = self.tokenizer(old_text).input_ids
tokens_b = self.tokenizer(new_text).input_ids
tokens_a = [
- self.tokenizer.encode("a ")["input_ids"][1]
- if self.tokenizer.decode(t) == "an" else t for t in tokens_a
+ self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t
+ for t in tokens_a
]
tokens_b = [
- self.tokenizer.encode("a ")["input_ids"][1]
- if self.tokenizer.decode(t) == "an" else t for t in tokens_b
+ self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t
+ for t in tokens_b
]
num_orig_tokens = len(tokens_a)
idxs_replace = []
@@ -529,8 +519,7 @@ def edit_model(
        # prepare batch: for each pair of sentences, old context and new values
contexts, valuess = [], []
- for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs,
- idxs_replaces):
+ for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces):
context = old_emb.detach()
values = []
with paddle.no_grad():
@@ -545,52 +534,47 @@ def edit_model(
mat1 = lamb * self.projection_matrices[layer_num].weight
# mat2 = \lambda I + \sum{k k^T}
- mat2 = lamb * paddle.eye(self.projection_matrices[layer_num]
- .weight.shape[1])
+ mat2 = lamb * paddle.eye(self.projection_matrices[layer_num].weight.shape[1])
# aggregate sums for mat1, mat2
for context, values in zip(contexts, valuess):
- context_vector = context.reshape(
- [context.shape[0], context.shape[1], 1])
- context_vector_T = context.reshape(
- [context.shape[0], 1, context.shape[1]])
- value_vector = values[layer_num].reshape([
- values[layer_num].shape[0], values[layer_num].shape[1], 1
- ])
- for_mat1 = (value_vector @context_vector_T).sum(axis=0)
- for_mat2 = (context_vector @context_vector_T).sum(axis=0)
+ context_vector = context.reshape([context.shape[0], context.shape[1], 1])
+ context_vector_T = context.reshape([context.shape[0], 1, context.shape[1]])
+ value_vector = values[layer_num].reshape([values[layer_num].shape[0], values[layer_num].shape[1], 1])
+ for_mat1 = (value_vector @ context_vector_T).sum(axis=0)
+ for_mat2 = (context_vector @ context_vector_T).sum(axis=0)
mat1 += for_mat1
mat2 += for_mat2
# update projection matrix
- mat = mat1 @paddle.inverse(mat2)
- self.projection_matrices[
- layer_num].weight = paddle.create_parameter(
- shape=mat.shape,
- dtype=mat.dtype,
- default_initializer=paddle.nn.initializer.Assign(mat), )
+ mat = mat1 @ paddle.inverse(mat2)
+ self.projection_matrices[layer_num].weight = paddle.create_parameter(
+ shape=mat.shape,
+ dtype=mat.dtype,
+ default_initializer=paddle.nn.initializer.Assign(mat),
+ )
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
Args:
@@ -668,7 +652,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -690,7 +675,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -705,43 +691,38 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
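The classifier-free guidance arithmetic in the loop above is untouched by the reformat; as a standalone sketch (the function name and tensor names here are illustrative, not the pipeline's):

import paddle

def apply_classifier_free_guidance(noise_pred, guidance_scale=7.5):
    # noise_pred stacks the unconditional and text-conditioned predictions along the
    # batch axis, matching the concatenated prompt embeddings fed to the UNet.
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)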
@@ -754,8 +735,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
image = self.numpy_to_pil(image)
@@ -764,11 +744,9 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
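The edit_model hunks earlier in this file apply the closed-form projection update from the TIME paper (Eq. 5, https://arxiv.org/abs/2303.08084): W_new = (lambda * W_old + sum_i v_i k_i^T)(lambda * I + sum_i k_i k_i^T)^-1. A minimal sketch of that update, using illustrative names rather than the pipeline's internals:

import paddle

def closed_form_edit(weight, keys, targets, lamb=0.1):
    # weight:  [out_dim, in_dim] cross-attention projection (to_k / to_v) being edited.
    # keys:    list of [in_dim] source-text token embeddings k_i.
    # targets: list of [out_dim] desired outputs v_i for the destination text.
    out_dim, in_dim = weight.shape
    mat1 = lamb * weight                      # lambda * W_old, then accumulate v_i k_i^T
    mat2 = lamb * paddle.eye(in_dim)          # lambda * I,     then accumulate k_i k_i^T
    for k, v in zip(keys, targets):
        mat1 = mat1 + v.reshape([-1, 1]) @ k.reshape([1, -1])
        mat2 = mat2 + k.reshape([-1, 1]) @ k.reshape([1, -1])
    return mat1 @ paddle.inverse(mat2)        # the edited projection weight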
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index cc2586bec5107..5258f174894bf 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -15,8 +15,7 @@
from typing import Any, Callable, Dict, List, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -46,8 +45,7 @@
"""
-class StableDiffusionPanoramaPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image generation using "MultiDiffusion: Fusing Diffusion Paths for Controlled Image
Generation".
@@ -81,20 +79,20 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: DDIMScheduler,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: DDIMScheduler,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if isinstance(scheduler, PNDMScheduler):
- logger.error(
- "PNDMScheduler for this pipeline is currently not supported.")
+ logger.error("PNDMScheduler for this pipeline is currently not supported.")
if safety_checker is None and requires_safety_checker:
logger.warning(
@@ -119,19 +117,21 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -171,29 +171,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -201,8 +203,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -212,21 +213,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
        # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -234,47 +236,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -295,54 +293,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -355,23 +349,26 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -385,11 +382,7 @@ def prepare_latents(
latents = latents * self.scheduler.init_noise_sigma
return latents
- def get_views(self,
- panorama_height,
- panorama_width,
- window_size=64,
- stride=8):
+ def get_views(self, panorama_height, panorama_width, window_size=64, stride=8):
# Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113)
panorama_height /= 8
panorama_width /= 8
@@ -408,25 +401,25 @@ def get_views(self,
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=512,
- width: Optional[int]=2048,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = 512,
+ width: Optional[int] = 2048,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -508,7 +501,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -530,7 +524,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -545,7 +540,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Define panorama grid and initialize views for synthesis.
views = self.get_views(height, width)
@@ -558,8 +554,7 @@ def __call__(
# 8. Denoising loop
# Each denoising step also includes refinement of the latents with respect to the
# views.
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
count.zero_()
@@ -572,44 +567,39 @@ def __call__(
# MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113
for h_start, h_end, w_start, w_end in views:
# get the latents corresponding to the current view coordinates
- latents_for_view = latents[:, :, h_start:h_end, w_start:
- w_end]
+ latents_for_view = latents[:, :, h_start:h_end, w_start:w_end]
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents_for_view] * 2)
- if do_classifier_free_guidance else
- latents_for_view)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = (
+ paddle.concat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view
+ )
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
latents_view_denoised = self.scheduler.step(
- noise_pred, t, latents_for_view,
- **extra_step_kwargs).prev_sample
- value[:, :, h_start:h_end, w_start:
- w_end] += latents_view_denoised
+ noise_pred, t, latents_for_view, **extra_step_kwargs
+ ).prev_sample
+ value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
count[:, :, h_start:h_end, w_start:w_end] += 1
# take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113
latents = paddle.where(count > 0, value / count, value)
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -618,8 +608,7 @@ def __call__(
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
if output_type == "pil":
@@ -628,5 +617,4 @@ def __call__(
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
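The panorama loop above fuses per-window denoising results with a count-weighted average (Eq. 5 in the MultiDiffusion paper, https://arxiv.org/abs/2302.08113). A stripped-down sketch of that fusion step; get_views mirrors the pipeline's sliding windows, while denoise_view stands in for one scheduler step on a single window and is purely hypothetical:

import paddle

def get_views(height, width, window=64, stride=8):
    # Sliding windows over the latent grid (the mappings F_i in MultiDiffusion).
    num_h = (height - window) // stride + 1 if height > window else 1
    num_w = (width - window) // stride + 1 if width > window else 1
    views = []
    for i in range(num_h * num_w):
        h_start = (i // num_w) * stride
        w_start = (i % num_w) * stride
        views.append((h_start, h_start + window, w_start, w_start + window))
    return views

def multidiffusion_step(latents, denoise_view, views):
    # Denoise each window independently, then average overlapping predictions.
    value = paddle.zeros_like(latents)
    count = paddle.zeros_like(latents)
    for h0, h1, w0, w1 in views:
        value[:, :, h0:h1, w0:w1] += denoise_view(latents[:, :, h0:h1, w0:w1])
        count[:, :, h0:h1, w0:w1] += 1
    return paddle.where(count > 0, value / count, value)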
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
index 1ae1d85aacf36..7a5cb8d8a0a5e 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
@@ -22,19 +22,33 @@
import paddle.nn.functional as F
import paddle.optimizer
import PIL
-from paddlenlp.transformers import (BlipForConditionalGeneration, BlipProcessor,
- CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import (
+ BlipForConditionalGeneration,
+ BlipProcessor,
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTokenizer,
+)
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention_processor import Attention
-from ...schedulers import (DDIMScheduler, DDPMScheduler,
- EulerAncestralDiscreteScheduler,
- LMSDiscreteScheduler)
+from ...schedulers import (
+ DDIMScheduler,
+ DDPMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+)
from ...schedulers.scheduling_ddim_inverse import DDIMInverseScheduler
-from ...utils import (PIL_INTERPOLATION, BaseOutput, deprecate, logging,
- randint_tensor, randn_tensor, replace_example_docstring)
+from ...utils import (
+ PIL_INTERPOLATION,
+ BaseOutput,
+ deprecate,
+ logging,
+ randint_tensor,
+ randn_tensor,
+ replace_example_docstring,
+)
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -172,11 +186,7 @@ def preprocess(image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
- image = [
- np.array(i.resize(
- (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
- for i in image
- ]
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -194,13 +204,11 @@ def prepare_unet(unet: UNet2DConditionModel):
module_name = name.replace(".processor", "")
module: nn.Layer = unet.get_sublayer(module_name)
if "attn2" in name:
- pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(
- is_pix2pix_zero=True)
+ pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=True)
for params in module.parameters():
params.stop_gradient = False
else:
- pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(
- is_pix2pix_zero=False)
+ pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=False)
for params in module.parameters():
params.stop_gradient = True
@@ -213,7 +221,7 @@ def __init__(self):
self.loss = 0.0
def compute_loss(self, predictions, targets):
- self.loss += ((predictions - targets)**2).sum((1, 2)).mean(0)
+ self.loss += ((predictions - targets) ** 2).sum((1, 2)).mean(0)
class Pix2PixZeroAttnProcessor:
@@ -226,23 +234,22 @@ def __init__(self, is_pix2pix_zero=False):
self.reference_cross_attn_map = {}
def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- timestep=None,
- loss=None, ):
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ timestep=None,
+ loss=None,
+ ):
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -255,14 +262,11 @@ def __call__(
if self.is_pix2pix_zero and timestep is not None:
# new bookkeeping to save the attention weights.
if loss is None:
- self.reference_cross_attn_map[timestep.item(
- )] = attention_probs.detach().flatten(0, 1)
+ self.reference_cross_attn_map[timestep.item()] = attention_probs.detach().flatten(0, 1)
# compute loss
elif loss is not None:
- prev_attn_probs = self.reference_cross_attn_map.pop(
- timestep.item())
- loss.compute_loss(
- attention_probs.flatten(0, 1), prev_attn_probs)
+ prev_attn_probs = self.reference_cross_attn_map.pop(timestep.item())
+ loss.compute_loss(attention_probs.flatten(0, 1), prev_attn_probs)
hidden_states = paddle.matmul(attention_probs, value)
hidden_states = attn.batch_to_head_dim(hidden_states)
@@ -314,20 +318,24 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: Union[DDPMScheduler, DDIMScheduler,
- EulerAncestralDiscreteScheduler,
- LMSDiscreteScheduler, ],
- feature_extractor: CLIPImageProcessor,
- safety_checker: StableDiffusionSafetyChecker,
- inverse_scheduler: DDIMInverseScheduler,
- caption_generator: BlipForConditionalGeneration,
- caption_processor: BlipProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[
+ DDPMScheduler,
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ ],
+ feature_extractor: CLIPImageProcessor,
+ safety_checker: StableDiffusionSafetyChecker,
+ inverse_scheduler: DDIMInverseScheduler,
+ caption_generator: BlipForConditionalGeneration,
+ caption_processor: BlipProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
if safety_checker is None and requires_safety_checker:
@@ -356,19 +364,21 @@ def __init__(
feature_extractor=feature_extractor,
caption_processor=caption_processor,
caption_generator=caption_generator,
- inverse_scheduler=inverse_scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ inverse_scheduler=inverse_scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -408,29 +418,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -438,8 +450,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -449,21 +460,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
        # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -471,47 +483,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
@@ -532,66 +540,65 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- source_embeds,
- target_embeds,
- callback_steps,
- prompt_embeds=None, ):
+ self,
+ prompt,
+ image,
+ source_embeds,
+ target_embeds,
+ callback_steps,
+ prompt_embeds=None,
+ ):
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if source_embeds is None and target_embeds is None:
- raise ValueError(
- "`source_embeds` and `target_embeds` cannot be undefined.")
+ raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.")
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -611,43 +618,38 @@ def generate_caption(self, images):
        # make sure caption_generator position_ids are cast to int64
try:
self.caption_generator.text_decoder.bert.embeddings.position_ids = (
- self.caption_generator.text_decoder.bert.embeddings.
- position_ids.cast("int64"))
+ self.caption_generator.text_decoder.bert.embeddings.position_ids.cast("int64")
+ )
except Exception:
pass
text = "a photography of"
- inputs = self.caption_processor(
- images=images, text=text, return_tensors="pd")
- inputs["pixel_values"] = inputs["pixel_values"].cast(
- self.caption_generator.dtype)
+ inputs = self.caption_processor(images=images, text=text, return_tensors="pd")
+ inputs["pixel_values"] = inputs["pixel_values"].cast(self.caption_generator.dtype)
outputs = self.caption_generator.generate(**inputs, max_length=128)[0]
# offload caption generator
- caption = self.caption_processor.batch_decode(
- outputs, skip_special_tokens=True)[0]
+ caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0]
return text + " " + caption
- def construct_direction(self,
- embs_source: paddle.Tensor,
- embs_target: paddle.Tensor):
+ def construct_direction(self, embs_source: paddle.Tensor, embs_target: paddle.Tensor):
"""Constructs the edit direction to steer the image generation process semantically."""
return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0)
@paddle.no_grad()
- def get_embeds(self, prompt: List[str],
- batch_size: int=16) -> paddle.Tensor:
+ def get_embeds(self, prompt: List[str], batch_size: int = 16) -> paddle.Tensor:
num_prompts = len(prompt)
embeds = []
for i in range(0, num_prompts, batch_size):
- prompt_slice = prompt[i:i + batch_size]
+ prompt_slice = prompt[i : i + batch_size]
input_ids = self.tokenizer(
prompt_slice,
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", ).input_ids
+ return_tensors="pd",
+ ).input_ids
embeds.append(self.text_encoder(input_ids)[0])
@@ -668,10 +670,7 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None):
)
if isinstance(generator, list):
- latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- for i in range(batch_size)
- ]
+ latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)]
latents = paddle.concat(latents, axis=0)
else:
latents = self.vae.encode(image).latent_dist.sample(generator)
@@ -691,10 +690,10 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None):
"len(prompt) != len(image)",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
additional_latents_per_image = batch_size // latents.shape[0]
- latents = paddle.concat(
- [latents] * additional_latents_per_image, axis=0)
+ latents = paddle.concat([latents] * additional_latents_per_image, axis=0)
else:
raise ValueError(
f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts."
@@ -704,21 +703,16 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None):
return latents
- def get_epsilon(self,
- model_output: paddle.Tensor,
- sample: paddle.Tensor,
- timestep: int):
+ def get_epsilon(self, model_output: paddle.Tensor, sample: paddle.Tensor, timestep: int):
pred_type = self.inverse_scheduler.config.prediction_type
alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep]
beta_prod_t = 1 - alpha_prod_t
if pred_type == "epsilon":
return model_output
elif pred_type == "sample":
- return (sample - alpha_prod_t**
- (0.5) * model_output) / beta_prod_t**(0.5)
+ return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5)
elif pred_type == "v_prediction":
- return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5
- ) * sample
+ return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
else:
raise ValueError(
f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`"
@@ -728,15 +722,11 @@ def auto_corr_loss(self, hidden_states, generator=None):
reg_loss = 0.0
for i in range(hidden_states.shape[0]):
for j in range(hidden_states.shape[1]):
- noise = hidden_states[i:i + 1, j:j + 1, :, :]
+ noise = hidden_states[i : i + 1, j : j + 1, :, :]
while True:
- roll_amount = randint_tensor(
- noise.shape[2] // 2, shape=(1, ),
- generator=generator).item()
- reg_loss += (noise * paddle.roll(
- noise, shifts=roll_amount, axis=2)).mean()**2
- reg_loss += (noise * paddle.roll(
- noise, shifts=roll_amount, axis=3)).mean()**2
+ roll_amount = randint_tensor(noise.shape[2] // 2, shape=(1,), generator=generator).item()
+ reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=2)).mean() ** 2
+ reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=3)).mean() ** 2
if noise.shape[2] <= 8:
break
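The pix2pix-zero inversion regularizes the predicted noise so it stays close to white Gaussian noise: at each scale it penalizes spatial auto-correlation, then downsamples and repeats (the downsampling falls outside this hunk and is assumed here to be 2x average pooling, as in the reference implementation). A simplified sketch with a fixed shift instead of a random one:

import paddle
import paddle.nn.functional as F

def auto_corr_loss(noise_map):
    # noise_map: [B, C, H, W] predicted noise from the inverse-DDIM step.
    reg = paddle.zeros([1])
    for b in range(noise_map.shape[0]):
        for c in range(noise_map.shape[1]):
            noise = noise_map[b : b + 1, c : c + 1]
            while True:
                shift = max(noise.shape[2] // 2, 1)
                reg += (noise * paddle.roll(noise, shifts=shift, axis=2)).mean() ** 2
                reg += (noise * paddle.roll(noise, shifts=shift, axis=3)).mean() ** 2
                if noise.shape[2] <= 8:
                    break
                noise = F.avg_pool2d(noise, kernel_size=2)
    return reg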
@@ -751,29 +741,29 @@ def kl_divergence(self, hidden_states):
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Optional[Union[str, List[str]]]=None,
- image: Optional[Union[paddle.Tensor, PIL.Image.Image]]=None,
- source_embeds: paddle.Tensor=None,
- target_embeds: paddle.Tensor=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- cross_attention_guidance_amount: float=0.1,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Optional[Union[str, List[str]]] = None,
+ image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None,
+ source_embeds: paddle.Tensor = None,
+ target_embeds: paddle.Tensor = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ cross_attention_guidance_amount: float = 0.1,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -857,7 +847,8 @@ def __call__(
source_embeds,
target_embeds,
callback_steps,
- prompt_embeds, )
+ prompt_embeds,
+ )
# 3. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -881,7 +872,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -897,7 +889,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
latents_init = latents.clone()
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -908,37 +901,31 @@ def __call__(
self.unet = prepare_unet(self.unet)
# 7. Denoising loop where we obtain the cross-attention maps.
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs={"timestep": t}, ).sample
+ cross_attention_kwargs={"timestep": t},
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
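The guidance arithmetic preserved by the hunk above is a linear extrapolation away from the unconditional prediction. A minimal NumPy sketch, with names and shapes purely illustrative and not part of the patch:

    import numpy as np

    def cfg_combine(noise_pred_uncond, noise_pred_text, guidance_scale):
        # move guidance_scale times further in the text-conditional direction
        return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    uncond = np.zeros((1, 4, 64, 64))
    text = np.ones((1, 4, 64, 64))
    assert float(cfg_combine(uncond, text, 7.5).max()) == 7.5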
@@ -952,15 +939,12 @@ def __call__(
# 10. Second denoising loop to generate the edited image.
latents = latents_init
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# we want to learn the latent such that it steers the generation
# process towards the edited direction, so make the initial
@@ -969,9 +953,7 @@ def __call__(
x_in.stop_gradient = False
# optimizer
- opt = paddle.optimizer.SGD(
- parameters=[x_in],
- learning_rate=cross_attention_guidance_amount)
+ opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=cross_attention_guidance_amount)
with paddle.set_grad_enabled(True):
# initialize loss
@@ -982,8 +964,8 @@ def __call__(
x_in,
t,
encoder_hidden_states=prompt_embeds_edit.detach(),
- cross_attention_kwargs={"timestep": t,
- "loss": loss}, ).sample
+ cross_attention_kwargs={"timestep": t, "loss": loss},
+ ).sample
loss.loss.backward(retain_graph=False)
opt.step()
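The second loop reformatted here steers the latent itself: a detached copy is made trainable and updated by a few SGD steps against a loss returned through `cross_attention_kwargs`. A minimal Paddle sketch of that pattern, with a placeholder quadratic loss standing in for the cross-attention alignment loss:

    import paddle

    target = paddle.ones([1, 4, 8, 8])      # stand-in for the attention reference
    x_in = paddle.randn([1, 4, 8, 8])
    x_in.stop_gradient = False

    opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=0.1)
    for _ in range(5):
        loss = paddle.mean((x_in - target) ** 2)  # placeholder loss
        loss.backward()
        opt.step()
        opt.clear_grad()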
@@ -993,32 +975,28 @@ def __call__(
x_in.detach(),
t,
encoder_hidden_states=prompt_embeds_edit,
- cross_attention_kwargs={"timestep": None}, ).sample
+ cross_attention_kwargs={"timestep": None},
+ ).sample
latents = x_in.detach().chunk(2)[0]
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
# 11. Post-process the latents.
edited_image = self.decode_latents(latents)
# 12. Run the safety checker.
- edited_image, has_nsfw_concept = self.run_safety_checker(
- edited_image, prompt_embeds.dtype)
+ edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, prompt_embeds.dtype)
# 13. Convert to PIL.
if output_type == "pil":
@@ -1027,31 +1005,30 @@ def __call__(
if not return_dict:
return (edited_image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=edited_image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept)
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
def invert(
- self,
- prompt: Optional[str]=None,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- num_inference_steps: int=50,
- guidance_scale: float=1,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- cross_attention_guidance_amount: float=0.1,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- lambda_auto_corr: float=20.0,
- lambda_kl: float=20.0,
- num_reg_steps: int=5,
- num_auto_corr_rolls: int=5, ):
+ self,
+ prompt: Optional[str] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 1,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ cross_attention_guidance_amount: float = 0.1,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ lambda_auto_corr: float = 20.0,
+ lambda_kl: float = 20.0,
+ num_reg_steps: int = 5,
+ num_auto_corr_rolls: int = 5,
+ ):
r"""
Function used to generate inverted latents given a prompt and image.
@@ -1130,8 +1107,7 @@ def invert(
image = preprocess(image)
# 4. Prepare latent variables
- latents = self.prepare_image_latents(image, batch_size, self.vae.dtype,
- generator)
+ latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, generator)
# 5. Encode input prompt
num_images_per_prompt = 1
@@ -1139,7 +1115,8 @@ def invert(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- prompt_embeds=prompt_embeds, )
+ prompt_embeds=prompt_embeds,
+ )
# 6. Prepare timesteps
self.inverse_scheduler.set_timesteps(num_inference_steps)
@@ -1150,28 +1127,25 @@ def invert(
self.unet = prepare_unet(self.unet)
# 7. Denoising loop where we obtain the cross-attention maps.
- num_warmup_steps = (
- len(timesteps) - num_inference_steps * self.inverse_scheduler.order)
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
for i, t in enumerate(timesteps[:-1]):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.inverse_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs={"timestep": t}, ).sample
+ cross_attention_kwargs={"timestep": t},
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# regularization of the noise prediction
with paddle.set_grad_enabled(True):
@@ -1182,11 +1156,9 @@ def invert(
var.stop_gradient = False
# Derive epsilon from model output before regularizing to IID standard normal
- var_epsilon = self.get_epsilon(
- var, latent_model_input.detach(), t)
+ var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
- l_ac = self.auto_corr_loss(
- var_epsilon, generator=generator)
+ l_ac = self.auto_corr_loss(var_epsilon, generator=generator)
l_ac.backward()
grad = var.grad.detach() / num_auto_corr_rolls
@@ -1197,8 +1169,7 @@ def invert(
var.stop_gradient = False
# Derive epsilon from model output before regularizing to IID standard normal
- var_epsilon = self.get_epsilon(
- var, latent_model_input.detach(), t)
+ var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
l_kld = self.kl_divergence(var_epsilon)
l_kld.backward()
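The regularization block above repeats one pattern for both the auto-correlation and KL terms: detach the current noise prediction, re-enable gradients on the copy, backpropagate a scalar penalty, and nudge the prediction with the resulting gradient. A hedged sketch of that shape, where the quadratic penalty is only a placeholder for `l_ac`/`l_kld` and the scale is folded into the loss rather than the gradient:

    import paddle

    def regularize(noise_pred, penalty_fn, lam=20.0, num_reg_steps=5):
        for _ in range(num_reg_steps):
            var = noise_pred.detach().clone()
            var.stop_gradient = False
            loss = lam * penalty_fn(var)      # placeholder for auto-corr / KL penalty
            loss.backward()
            noise_pred = noise_pred - var.grad.detach()
        return noise_pred.detach()

    eps = regularize(paddle.randn([1, 4, 8, 8]), lambda e: paddle.mean(e ** 2))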
@@ -1209,13 +1180,12 @@ def invert(
noise_pred = noise_pred.detach()
# compute the previous noisy sample x_t -> x_t-1
- latents = self.inverse_scheduler.step(noise_pred, t,
- latents).prev_sample
+ latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample
# call the callback, if provided
if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.inverse_scheduler.order == 0):
+ (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0
+ ):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -1232,5 +1202,4 @@ def invert(
if not return_dict:
return (inverted_latents, image)
- return Pix2PixInversionPipelineOutput(
- latents=inverted_latents, images=image)
+ return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
index 56fac99a80c30..3a8030d6a986d 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
@@ -17,8 +17,7 @@
import paddle
import paddle.nn.functional as F
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -52,21 +51,20 @@ def __init__(self):
self.attention_probs = None
def __call__(
- self,
- attn,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None, ):
+ self,
+ attn,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ ):
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -90,8 +88,7 @@ def __call__(
# Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input
-class StableDiffusionSAGPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -121,15 +118,16 @@ class StableDiffusionSAGPipeline(DiffusionPipeline,
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
self.register_modules(
@@ -139,19 +137,21 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -191,29 +191,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -221,8 +223,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -232,21 +233,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
# textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -254,47 +256,43 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
else:
has_nsfw_concept = None
return image, has_nsfw_concept
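One detail worth keeping in mind when reading the `_encode_prompt` hunks: per-prompt duplication tiles along the sequence axis and then folds the copies back into the batch axis, which is why `tile` is always followed by `reshape`. A small NumPy check of that shape bookkeeping (numbers illustrative):

    import numpy as np

    bs, seq_len, dim, n_images = 2, 77, 768, 3
    prompt_embeds = np.zeros((bs, seq_len, dim))

    dup = np.tile(prompt_embeds, (1, n_images, 1))    # (2, 231, 768)
    dup = dup.reshape(bs * n_images, seq_len, -1)     # fold copies into the batch axis
    assert dup.shape == (6, 77, 768)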
@@ -315,54 +313,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -375,23 +369,26 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -408,26 +405,26 @@ def prepare_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- sag_scale: float=0.75,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ sag_scale: float = 0.75,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -512,7 +509,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -538,7 +536,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -553,17 +552,16 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
store_processor = CrossAttnStoreProcessor()
- self.unet.mid_block.attentions[0].transformer_blocks[
- 0].attn1.processor = store_processor
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
map_size = None
@@ -571,28 +569,25 @@ def get_map_size(module, input, output):
nonlocal map_size
map_size = output.sample.shape[-2:]
- forward_hook = self.unet.mid_block.attentions[
- 0].register_forward_post_hook(get_map_size)
+ forward_hook = self.unet.mid_block.attentions[0].register_forward_post_hook(get_map_size)
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# perform self-attention guidance with the stored self-attention map
if do_self_attention_guidance:
@@ -603,23 +598,19 @@ def get_map_size(module, input, output):
# DDIM-like prediction of x0
pred_x0 = self.pred_x0(latents, noise_pred_uncond, t)
# get the stored attention maps
- uncond_attn, cond_attn = store_processor.attention_probs.chunk(
- 2)
+ uncond_attn, cond_attn = store_processor.attention_probs.chunk(2)
# self-attention-based degrading of latents
degraded_latents = self.sag_masking(
pred_x0,
uncond_attn,
map_size,
t,
- self.pred_epsilon(latents, noise_pred_uncond, t), )
+ self.pred_epsilon(latents, noise_pred_uncond, t),
+ )
uncond_emb, _ = prompt_embeds.chunk(2)
# forward and give guidance
- degraded_pred = self.unet(
- degraded_latents,
- t,
- encoder_hidden_states=uncond_emb).sample
- noise_pred += sag_scale * (
- noise_pred_uncond - degraded_pred)
+ degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample
+ noise_pred += sag_scale * (noise_pred_uncond - degraded_pred)
else:
# DDIM-like prediction of x0
pred_x0 = self.pred_x0(latents, noise_pred, t)
@@ -631,22 +622,17 @@ def get_map_size(module, input, output):
cond_attn,
map_size,
t,
- self.pred_epsilon(latents, noise_pred, t), )
+ self.pred_epsilon(latents, noise_pred, t),
+ )
# forward and give guidance
- degraded_pred = self.unet(
- degraded_latents,
- t,
- encoder_hidden_states=prompt_embeds).sample
+ degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample
noise_pred += sag_scale * (noise_pred - degraded_pred)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
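Self-attention guidance, as reformatted above, adds a second correction on top of classifier-free guidance: the UNet is run again on latents degraded through the attention mask, and the prediction is pushed away from that degraded result. A one-line sketch of the combine; under CFG the reference is the unconditional prediction, otherwise the full prediction, matching the two branches above:

    def sag_combine(noise_pred, reference_pred, degraded_pred, sag_scale=0.75):
        # push away from what the model predicts on the blurred/degraded latents
        return noise_pred + sag_scale * (reference_pred - degraded_pred)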
@@ -656,8 +642,7 @@ def get_map_size(module, input, output):
image = self.decode_latents(latents)
# 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image,
- prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
# 10. Convert to PIL
if output_type == "pil":
@@ -666,8 +651,7 @@ def get_map_size(module, input, output):
if not return_dict:
return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
def sag_masking(self, original_latents, attn_map, map_size, t, eps):
# Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf
@@ -681,20 +665,20 @@ def sag_masking(self, original_latents, attn_map, map_size, t, eps):
attn_map = attn_map.reshape([b, h, hw1, hw2])
attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0
- attn_mask = (attn_mask.reshape([b, map_size[0], map_size[1]])
- .unsqueeze(1).tile([1, latent_channel, 1, 1])
- .cast(attn_map.dtype))
+ attn_mask = (
+ attn_mask.reshape([b, map_size[0], map_size[1]])
+ .unsqueeze(1)
+ .tile([1, latent_channel, 1, 1])
+ .cast(attn_map.dtype)
+ )
attn_mask = F.interpolate(attn_mask, (latent_h, latent_w))
# Blur according to the self-attention mask
- degraded_latents = gaussian_blur_2d(
- original_latents, kernel_size=9, sigma=1.0)
- degraded_latents = degraded_latents * attn_mask + original_latents * (
- 1 - attn_mask)
+ degraded_latents = gaussian_blur_2d(original_latents, kernel_size=9, sigma=1.0)
+ degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask)
# Noise it again to match the noise level
- degraded_latents = self.scheduler.add_noise(
- degraded_latents, noise=eps, timesteps=t)
+ degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t)
return degraded_latents
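`sag_masking` therefore does four things: threshold the averaged self-attention map, lift the mask to latent resolution, blur the latents, and blend blurred and original latents through the mask before re-noising. A rough NumPy sketch of the mask-and-blend step only, assuming a square token grid that already matches the latent grid (the real code interpolates the mask to the latent height and width) and using a crude stand-in for the Gaussian blur:

    import numpy as np

    def sag_mask_sketch(latents, attn_map, threshold=1.0):
        # attn_map: (heads, tokens, tokens) self-attention probabilities
        mask = attn_map.mean(0).sum(0) > threshold        # which tokens draw attention
        side = int(np.sqrt(mask.size))                    # assume a square token grid
        mask = mask.reshape(1, 1, side, side).astype(latents.dtype)
        blurred = latents * 0.5                           # stand-in for gaussian_blur_2d
        return blurred * mask + latents * (1 - mask)      # degrade only attended regions

    out = sag_mask_sketch(np.random.randn(1, 1, 8, 8), np.random.rand(8, 64, 64))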
@@ -705,20 +689,18 @@ def pred_x0(self, sample, model_output, timestep):
beta_prod_t = 1 - alpha_prod_t
if self.scheduler.config.prediction_type == "epsilon":
- pred_original_sample = (sample - beta_prod_t**
- (0.5) * model_output) / alpha_prod_t**(0.5)
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
elif self.scheduler.config.prediction_type == "sample":
pred_original_sample = model_output
elif self.scheduler.config.prediction_type == "v_prediction":
- pred_original_sample = (alpha_prod_t**0.5) * sample - (
- beta_prod_t**0.5) * model_output
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
# predict V
- model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**
- 0.5) * sample
+ model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
else:
raise ValueError(
f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
- " or `v_prediction`")
+ " or `v_prediction`"
+ )
return pred_original_sample
@@ -729,15 +711,14 @@ def pred_epsilon(self, sample, model_output, timestep):
if self.scheduler.config.prediction_type == "epsilon":
pred_eps = model_output
elif self.scheduler.config.prediction_type == "sample":
- pred_eps = (sample -
- (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5)
+ pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5)
elif self.scheduler.config.prediction_type == "v_prediction":
- pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5
- ) * model_output
+ pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output
else:
raise ValueError(
f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
- " or `v_prediction`")
+ " or `v_prediction`"
+ )
return pred_eps
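`pred_x0` and `pred_epsilon` are the two directions of the same forward relation x_t = sqrt(alpha_t) * x0 + sqrt(1 - alpha_t) * eps, specialised per `prediction_type`. A quick NumPy check of the `epsilon` branch; the `sample` and `v_prediction` branches follow the same algebra shown above:

    import numpy as np

    def pred_x0_from_eps(sample, eps, alpha_prod_t):
        beta_prod_t = 1 - alpha_prod_t
        return (sample - beta_prod_t ** 0.5 * eps) / alpha_prod_t ** 0.5

    x0, eps, a = 0.3, -0.1, 0.7
    x_t = a ** 0.5 * x0 + (1 - a) ** 0.5 * eps   # forward relation
    assert np.isclose(pred_x0_from_eps(x_t, eps, a), x0)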
@@ -753,12 +734,9 @@ def gaussian_blur_2d(img, kernel_size, sigma):
x_kernel = x_kernel.cast(img.dtype)
kernel2d = paddle.matmul(x_kernel[:, None], x_kernel[None, :])
- kernel2d = kernel2d.expand(
- [img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]])
+ kernel2d = kernel2d.expand([img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]])
- padding = [
- kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2
- ]
+ padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2]
img = F.pad(img, padding, mode="reflect")
img = F.conv2d(img, kernel2d, groups=img.shape[-3])
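`gaussian_blur_2d` builds its 2-D kernel as the outer product of a normalized 1-D Gaussian with itself, then applies it per channel via a grouped convolution with reflect padding. A sketch of the kernel construction only; the size and sigma follow the call in `sag_masking`:

    import numpy as np

    def gaussian_kernel_2d(kernel_size=9, sigma=1.0):
        x = np.arange(kernel_size) - (kernel_size - 1) / 2
        k1d = np.exp(-0.5 * (x / sigma) ** 2)
        k1d /= k1d.sum()                      # normalize so the blur preserves the mean
        return np.outer(k1d, k1d)             # separable 2-D kernel, sums to 1

    assert abs(gaussian_kernel_2d().sum() - 1.0) < 1e-6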
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 4a2ca10a74b68..85b0706b3ed80 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -37,8 +37,7 @@ def preprocess(image):
if isinstance(image[0], PIL.Image.Image):
w, h = image[0].size
- w, h = map(lambda x: x - x % 64,
- (w, h)) # resize to integer multiple of 64
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
image = [np.array(i.resize((w, h)))[None, :] for i in image]
image = np.concatenate(image, axis=0)
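`preprocess` snaps PIL inputs down to the nearest multiple of 64 before stacking them, e.g.:

    w, h = 1000, 531
    w, h = map(lambda x: x - x % 64, (w, h))
    assert (w, h) == (960, 512)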
@@ -78,20 +77,21 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- low_res_scheduler: DDPMScheduler,
- scheduler: KarrasDiffusionSchedulers,
- max_noise_level: int=350, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ low_res_scheduler: DDPMScheduler,
+ scheduler: KarrasDiffusionSchedulers,
+ max_noise_level: int = 350,
+ ):
super().__init__()
# check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate
is_vae_scaling_factor_set_to_0_08333 = (
- hasattr(vae.config, "scaling_factor") and
- vae.config.scaling_factor == 0.08333)
+ hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333
+ )
if not is_vae_scaling_factor_set_to_0_08333:
deprecation_message = (
"The configuration file of the vae does not contain `scaling_factor` or it is set to"
@@ -105,7 +105,8 @@ def __init__(
"wrong scaling_factor",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
vae.register_to_config(scaling_factor=0.08333)
self.register_modules(
@@ -114,18 +115,20 @@ def __init__(
tokenizer=tokenizer,
unet=unet,
low_res_scheduler=low_res_scheduler,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
self.register_to_config(max_noise_level=max_noise_level)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -161,29 +164,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -191,8 +196,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -202,14 +206,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -219,36 +225,33 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -259,15 +262,13 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@@ -283,13 +284,13 @@ def decode_latents(self, latents):
def check_inputs(self, prompt, image, noise_level, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
)
@@ -312,34 +313,32 @@ def check_inputs(self, prompt, image, noise_level, callback_steps):
# check noise level
if noise_level > self.config.max_noise_level:
- raise ValueError(
- f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}"
- )
+ raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (batch_size, num_channels_latents, height, width)
if latents is None:
latents = randn_tensor(shape, generator=generator, dtype=dtype)
else:
if latents.shape != list(shape):
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
latents = latents
# scale the initial noise by the standard deviation required by the scheduler
@@ -348,25 +347,24 @@ def prepare_latents(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- image: Union[paddle.Tensor, PIL.Image.Image, List[
- PIL.Image.Image]]=None,
- num_inference_steps: int=75,
- guidance_scale: float=9.0,
- noise_level: int=20,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
+ num_inference_steps: int = 75,
+ guidance_scale: float = 9.0,
+ noise_level: int = 20,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -472,7 +470,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Preprocess image
image = preprocess(image)
@@ -484,13 +483,11 @@ def __call__(
# 5. Add noise to image
noise_level = paddle.to_tensor([noise_level], dtype="int64")
- noise = randn_tensor(
- image.shape, generator=generator, dtype=prompt_embeds.dtype)
+ noise = randn_tensor(image.shape, generator=generator, dtype=prompt_embeds.dtype)
image = self.low_res_scheduler.add_noise(image, noise, noise_level)
batch_multiplier = 2 if do_classifier_free_guidance else 1
- image = paddle.concat([image] * batch_multiplier *
- num_images_per_prompt)
+ image = paddle.concat([image] * batch_multiplier * num_images_per_prompt)
noise_level = paddle.concat([noise_level] * image.shape[0])
# 6. Prepare latent variables
@@ -503,7 +500,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 7. Check that sizes of image and latents match
num_channels_image = image.shape[1]
@@ -513,48 +511,41 @@ def __call__(
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_image`: {num_channels_image} "
f" = {num_channels_latents+num_channels_image}. Please verify the config of"
- " `pipeline.unet` or your `image` input.")
+ " `pipeline.unet` or your `image` input."
+ )
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
# concat latents and the noised low-resolution image in the channel dimension
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
- latent_model_input = paddle.concat(
- [latent_model_input, image.cast(latent_model_input.dtype)],
- axis=1)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ latent_model_input = paddle.concat([latent_model_input, image.cast(latent_model_input.dtype)], axis=1)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- class_labels=noise_level, ).sample
+ class_labels=noise_level,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
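The upscaler's loop differs from the text-to-image loops above in two ways: the noised low-resolution image is concatenated to the latents along the channel axis before each UNet call, and the noise level is passed as `class_labels`. The channel bookkeeping that the earlier check enforces, in a NumPy sketch with illustrative channel counts:

    import numpy as np

    latents = np.zeros((2, 4, 64, 64))     # denoising latents
    low_res = np.zeros((2, 3, 64, 64))     # noised low-resolution conditioning image
    unet_in = np.concatenate([latents, low_res], axis=1)
    assert unet_in.shape[1] == 4 + 3       # must equal unet.config.in_channels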
@@ -569,6 +560,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index f89be55fdda9d..eaa7be8cb0324 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -17,8 +17,11 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import paddle
-from paddlenlp.transformers import (CLIPTextModel, CLIPTextModelWithProjection,
- CLIPTokenizer)
+from paddlenlp.transformers import (
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+)
from paddlenlp.transformers.clip.modeling import CLIPTextModelOutput
from ...loaders import TextualInversionLoaderMixin
@@ -26,6 +29,7 @@
from ...models.embeddings import get_timestep_embedding
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging, randn_tensor, replace_example_docstring
+
# from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
@@ -103,22 +107,23 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
vae: AutoencoderKL
def __init__(
- self,
- # prior components
- prior_tokenizer: CLIPTokenizer,
- prior_text_encoder: CLIPTextModelWithProjection,
- prior: PriorTransformer,
- prior_scheduler: KarrasDiffusionSchedulers,
- # image noising components
- image_normalizer: StableUnCLIPImageNormalizer,
- image_noising_scheduler: KarrasDiffusionSchedulers,
- # regular denoising components
- tokenizer: CLIPTokenizer,
- text_encoder: CLIPTextModelWithProjection,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- # vae
- vae: AutoencoderKL, ):
+ self,
+ # prior components
+ prior_tokenizer: CLIPTokenizer,
+ prior_text_encoder: CLIPTextModelWithProjection,
+ prior: PriorTransformer,
+ prior_scheduler: KarrasDiffusionSchedulers,
+ # image noising components
+ image_normalizer: StableUnCLIPImageNormalizer,
+ image_noising_scheduler: KarrasDiffusionSchedulers,
+ # regular denoising components
+ tokenizer: CLIPTokenizer,
+ text_encoder: CLIPTextModelWithProjection,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ # vae
+ vae: AutoencoderKL,
+ ):
super().__init__()
self.register_modules(
@@ -132,18 +137,20 @@ def __init__(
text_encoder=text_encoder,
unet=unet,
scheduler=scheduler,
- vae=vae, )
+ vae=vae,
+ )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
# Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder
def _encode_prior_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None,
- text_attention_mask: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+ text_attention_mask: Optional[paddle.Tensor] = None,
+ ):
if text_model_output is None:
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
@@ -153,44 +160,42 @@ def _encode_prior_prompt(
max_length=self.prior_tokenizer.model_max_length,
return_attention_mask=True,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
text_mask = text_inputs.attention_mask
- untruncated_ids = self.prior_tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.prior_tokenizer.batch_decode(
- untruncated_ids[:, self.prior_tokenizer.model_max_length -
- 1:-1])
+ untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}"
)
- text_input_ids = text_input_ids[:, :self.prior_tokenizer.
- model_max_length]
+ text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length]
prior_text_encoder_output = self.prior_text_encoder(text_input_ids)
prompt_embeds = prior_text_encoder_output.text_embeds
- prior_text_encoder_hidden_states = (
- prior_text_encoder_output.last_hidden_state)
+ prior_text_encoder_hidden_states = prior_text_encoder_output.last_hidden_state
else:
batch_size = text_model_output[0].shape[0]
prompt_embeds, prior_text_encoder_hidden_states = (
text_model_output[0],
- text_model_output[1], )
+ text_model_output[1],
+ )
text_mask = text_attention_mask
- prompt_embeds = prompt_embeds.repeat_interleave(
- num_images_per_prompt, axis=0)
- prior_text_encoder_hidden_states = (
- prior_text_encoder_hidden_states.repeat_interleave(
- num_images_per_prompt, axis=0))
+ prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0)
+ prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.repeat_interleave(
+ num_images_per_prompt, axis=0
+ )
text_mask = text_mask.repeat_interleave(num_images_per_prompt, axis=0)
@@ -203,46 +208,43 @@ def _encode_prior_prompt(
max_length=self.prior_tokenizer.model_max_length,
return_attention_mask=True,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_text_mask = uncond_input.attention_mask
- negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(
- uncond_input.input_ids)
+ negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids)
- negative_prompt_embeds = (
- negative_prompt_embeds_prior_text_encoder_output.text_embeds)
+ negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds
uncond_prior_text_encoder_hidden_states = (
- negative_prompt_embeds_prior_text_encoder_output.
- last_hidden_state)
+ negative_prompt_embeds_prior_text_encoder_output.last_hidden_state
+ )
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len])
seq_len = uncond_prior_text_encoder_hidden_states.shape[1]
- uncond_prior_text_encoder_hidden_states = (
- uncond_prior_text_encoder_hidden_states.tile(
- [1, num_images_per_prompt, 1]))
- uncond_prior_text_encoder_hidden_states = (
- uncond_prior_text_encoder_hidden_states.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1]))
- uncond_text_mask = uncond_text_mask.repeat_interleave(
- num_images_per_prompt, axis=0)
+ uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.tile(
+ [1, num_images_per_prompt, 1]
+ )
+ uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.reshape(
+ [batch_size * num_images_per_prompt, seq_len, -1]
+ )
+ uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0)
# done duplicates
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
- prior_text_encoder_hidden_states = paddle.concat([
- uncond_prior_text_encoder_hidden_states,
- prior_text_encoder_hidden_states,
- ])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
+ prior_text_encoder_hidden_states = paddle.concat(
+ [
+ uncond_prior_text_encoder_hidden_states,
+ prior_text_encoder_hidden_states,
+ ]
+ )
text_mask = paddle.concat([uncond_text_mask, text_mask])
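
The comment above describes the usual classifier-free-guidance trick: instead of running the prior once on the unconditional embeddings and once on the text embeddings, both halves are stacked into a single batch and split again after the forward pass. A minimal, runnable sketch of that batching pattern, using an identity function as a stand-in model and made-up shapes (both are assumptions for illustration, not the pipeline's real components):

import paddle

def run_with_cfg(model, uncond_embeds, cond_embeds, guidance_scale):
    # Stack unconditional and conditional inputs into one batch of size 2 * B
    batched = paddle.concat([uncond_embeds, cond_embeds])
    out = model(batched)
    # Split the output back into its unconditional and conditional halves
    out_uncond, out_cond = out.chunk(2)
    # Push the prediction away from the unconditional branch
    return out_uncond + guidance_scale * (out_cond - out_uncond)

# toy usage: identity "model", random embeddings
uncond = paddle.zeros([1, 4])
cond = paddle.randn([1, 4])
print(run_with_cfg(lambda x: x, uncond, cond, guidance_scale=4.0))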
@@ -250,13 +252,14 @@ def _encode_prior_prompt(
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -296,29 +299,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -326,8 +331,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -337,21 +341,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
            # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -359,36 +364,33 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@@ -408,15 +410,13 @@ def prepare_prior_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.prior_scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the prior_scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.prior_scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
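
Both `prepare_prior_extra_step_kwargs` and `prepare_extra_step_kwargs` rely on the same idiom: inspect the scheduler's `step` signature and only forward `eta`/`generator` when the scheduler actually accepts them, so one pipeline can drive DDIM-style and non-DDIM schedulers alike. A self-contained sketch of that feature-detection pattern (the toy scheduler classes below are illustrative, not ppdiffusers classes):

import inspect

class DDIMLikeScheduler:
    def step(self, model_output, timestep, sample, eta=0.0, generator=None):
        return sample  # placeholder

class DDPMLikeScheduler:
    def step(self, model_output, timestep, sample):
        return sample  # placeholder

def extra_step_kwargs_for(scheduler, eta, generator):
    params = set(inspect.signature(scheduler.step).parameters.keys())
    kwargs = {}
    if "eta" in params:        # only DDIM-style schedulers use eta
        kwargs["eta"] = eta
    if "generator" in params:  # some schedulers draw extra noise internally
        kwargs["generator"] = generator
    return kwargs

print(extra_step_kwargs_for(DDIMLikeScheduler(), eta=0.0, generator=None))  # {'eta': 0.0, 'generator': None}
print(extra_step_kwargs_for(DDPMLikeScheduler(), eta=0.0, generator=None))  # {}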
@@ -428,40 +428,38 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- noise_level,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ noise_level,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
@@ -473,11 +471,8 @@ def check_inputs(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- if prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -488,17 +483,18 @@ def check_inputs(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
- if (noise_level < 0 or noise_level >=
- self.image_noising_scheduler.config.num_train_timesteps):
+ if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
raise ValueError(
f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive."
)
@@ -509,20 +505,19 @@ def prepare_latents(self, shape, dtype, generator, latents, scheduler):
latents = randn_tensor(shape, generator=generator, dtype=dtype)
else:
if latents.shape != list(shape):
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
latents = latents
latents = latents * scheduler.init_noise_sigma
return latents
def noise_image_embeddings(
- self,
- image_embeds: paddle.Tensor,
- noise_level: int,
- noise: Optional[paddle.Tensor]=None,
- generator: Optional[paddle.Generator]=None, ):
+ self,
+ image_embeds: paddle.Tensor,
+ noise_level: int,
+ noise: Optional[paddle.Tensor] = None,
+ generator: Optional[paddle.Generator] = None,
+ ):
"""
Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
`noise_level` increases the variance in the final un-noised images.
@@ -536,17 +531,13 @@ def noise_image_embeddings(
The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
"""
if noise is None:
- noise = randn_tensor(
- image_embeds.shape,
- generator=generator,
- dtype=image_embeds.dtype)
+ noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype)
noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0])
image_embeds = self.image_normalizer.scale(image_embeds)
- image_embeds = self.image_noising_scheduler.add_noise(
- image_embeds, timesteps=noise_level, noise=noise)
+ image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise)
image_embeds = self.image_normalizer.unscale(image_embeds)
@@ -554,7 +545,8 @@ def noise_image_embeddings(
timesteps=noise_level,
embedding_dim=image_embeds.shape[-1],
flip_sin_to_cos=True,
- downscale_freq_shift=0, )
+ downscale_freq_shift=0,
+ )
# `get_timestep_embeddings` does not contain any weights and will always return f32 tensors,
# but we might actually be running in fp16. so we need to cast here.
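
Taken together, `noise_image_embeddings` (1) normalizes the CLIP image embeddings, (2) adds scheduler noise at timestep `noise_level`, (3) un-normalizes, and (4) appends a sinusoidal embedding of `noise_level` so the UNet knows how much noise was injected. A rough sketch of steps (1)-(3), under the assumption of a simple mean/std normalizer and a DDPM-style `add_noise`; the real pipeline delegates these to `image_normalizer` and `image_noising_scheduler`:

import paddle

def noise_image_embeddings_sketch(image_embeds, noise_level, mean, std, alphas_cumprod, noise=None):
    if noise is None:
        noise = paddle.randn(image_embeds.shape, dtype=image_embeds.dtype)
    # (1) normalize so a given noise_level means the same thing across checkpoints
    x = (image_embeds - mean) / std
    # (2) DDPM-style forward noising at timestep `noise_level`
    a = alphas_cumprod[noise_level]
    x = paddle.sqrt(a) * x + paddle.sqrt(1.0 - a) * noise
    # (3) undo the normalization before handing the embedding to the UNet
    return x * std + mean

embeds = paddle.randn([2, 768])
mean, std = paddle.zeros([1, 768]), paddle.ones([1, 768])
alphas_cumprod = paddle.linspace(0.999, 0.001, 1000)  # toy schedule
noisy = noise_image_embeddings_sketch(embeds, noise_level=100, mean=mean, std=std, alphas_cumprod=alphas_cumprod)
print(noisy.shape)  # [2, 768]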
@@ -568,30 +560,31 @@ def noise_image_embeddings(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- # regular denoising process args
- prompt: Optional[Union[str, List[str]]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=20,
- guidance_scale: float=10.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- noise_level: int=0,
- # prior args
- prior_num_inference_steps: int=25,
- prior_guidance_scale: float=4.0,
- prior_latents: Optional[paddle.Tensor]=None, ):
+ self,
+ # regular denoising process args
+ prompt: Optional[Union[str, List[str]]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 20,
+ guidance_scale: float = 10.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ noise_level: int = 0,
+ # prior args
+ prior_num_inference_steps: int = 25,
+ prior_guidance_scale: float = 4.0,
+ prior_latents: Optional[paddle.Tensor] = None,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -687,7 +680,8 @@ def __call__(
noise_level=noise_level,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -705,13 +699,11 @@ def __call__(
prior_do_classifier_free_guidance = prior_guidance_scale > 1.0
# 3. Encode input prompt
- (
- prior_prompt_embeds,
- prior_text_encoder_hidden_states,
- prior_text_mask, ) = self._encode_prior_prompt(
- prompt=prompt,
- num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=prior_do_classifier_free_guidance, )
+ (prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask,) = self._encode_prior_prompt(
+ prompt=prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ do_classifier_free_guidance=prior_do_classifier_free_guidance,
+ )
# 4. Prepare prior timesteps
self.prior_scheduler.set_timesteps(prior_num_inference_steps)
@@ -724,43 +716,43 @@ def __call__(
prior_prompt_embeds.dtype,
generator,
prior_latents,
- self.prior_scheduler, )
+ self.prior_scheduler,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
- prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(
- generator, eta)
+ prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta)
# 7. Prior denoising loop
for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([prior_latents] * 2)
- if prior_do_classifier_free_guidance else
- prior_latents)
- latent_model_input = self.prior_scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = (
+ paddle.concat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents
+ )
+ latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t)
predicted_image_embedding = self.prior(
latent_model_input,
timestep=t,
proj_embedding=prior_prompt_embeds,
encoder_hidden_states=prior_text_encoder_hidden_states,
- attention_mask=prior_text_mask, ).predicted_image_embedding
+ attention_mask=prior_text_mask,
+ ).predicted_image_embedding
if prior_do_classifier_free_guidance:
(
predicted_image_embedding_uncond,
predicted_image_embedding_text,
) = predicted_image_embedding.chunk(2)
- predicted_image_embedding = (
- predicted_image_embedding_uncond + prior_guidance_scale *
- (predicted_image_embedding_text -
- predicted_image_embedding_uncond))
+ predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
+ predicted_image_embedding_text - predicted_image_embedding_uncond
+ )
prior_latents = self.prior_scheduler.step(
predicted_image_embedding,
timestep=t,
sample=prior_latents,
- **prior_extra_step_kwargs, ).prev_sample
+ **prior_extra_step_kwargs,
+ ).prev_sample
if callback is not None and i % callback_steps == 0:
callback(i, t, prior_latents)
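
Unlike the image UNet loop further below, this prior loop denoises directly in CLIP image-embedding space: `prior_latents` has shape [batch, embedding_dim], and its final value becomes `image_embeds`. A toy, runnable skeleton of the loop with stand-in components (`ToyScheduler` and `toy_prior` are invented for illustration and do no real denoising):

from types import SimpleNamespace
import paddle

class ToyScheduler:
    timesteps = [3, 2, 1, 0]  # descending, like a real scheduler
    def scale_model_input(self, sample, t):
        return sample  # no scaling in this toy
    def step(self, model_output, timestep, sample):
        return SimpleNamespace(prev_sample=sample - 0.1 * model_output)

def toy_prior(x, timestep):
    # stand-in prediction; a real prior is a transformer conditioned on the prompt
    return SimpleNamespace(predicted_image_embedding=x)

scheduler = ToyScheduler()
guidance_scale, do_cfg = 4.0, True
latents = paddle.randn([1, 768])  # CLIP image-embedding space, not pixel latents

for t in scheduler.timesteps:
    model_in = paddle.concat([latents] * 2) if do_cfg else latents
    model_in = scheduler.scale_model_input(model_in, t)
    pred = toy_prior(model_in, timestep=t).predicted_image_embedding
    if do_cfg:
        uncond, text = pred.chunk(2)
        pred = uncond + guidance_scale * (text - uncond)
    latents = scheduler.step(pred, timestep=t, sample=latents).prev_sample

print(latents.shape)  # [1, 768]; this becomes `image_embeds` for noise_image_embeddings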
@@ -783,13 +775,15 @@ def __call__(
do_classifier_free_guidance=do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 9. Prepare image embeddings
image_embeds = self.noise_image_embeddings(
image_embeds=image_embeds,
noise_level=noise_level,
- generator=generator, )
+ generator=generator,
+ )
if do_classifier_free_guidance:
negative_prompt_embeds = paddle.zeros_like(image_embeds)
@@ -809,23 +803,23 @@ def __call__(
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
latents = self.prepare_latents(
shape=shape,
dtype=prompt_embeds.dtype,
generator=generator,
latents=latents,
- scheduler=self.scheduler, )
+ scheduler=self.scheduler,
+ )
# 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 13. Denoising loop
for i, t in enumerate(self.progress_bar(timesteps)):
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
@@ -833,17 +827,16 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
class_labels=image_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -856,6 +849,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
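
For context, the class modified above is the text-to-image variant of Stable unCLIP: the prompt is first mapped to a CLIP image embedding by the prior, that embedding is noised by `noise_image_embeddings`, and the UNet then denoises image latents conditioned on both the prompt and the noised embedding. A hedged usage sketch, assuming ppdiffusers exports `StableUnCLIPPipeline` the same way diffusers does; the checkpoint id is a placeholder, not a real model:

import paddle
from ppdiffusers import StableUnCLIPPipeline  # assumed export, mirroring diffusers

pipe = StableUnCLIPPipeline.from_pretrained("your-org/stable-unclip-checkpoint")  # placeholder id

paddle.seed(0)  # make the run repeatable
image = pipe(
    prompt="a photo of an astronaut riding a horse",
    num_inference_steps=20,        # UNet denoising steps
    prior_num_inference_steps=25,  # prior denoising steps
    guidance_scale=10.0,
    prior_guidance_scale=4.0,
    noise_level=0,                 # extra noise added to the predicted image embedding
).images[0]
image.save("astronaut.png")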
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index 043b5a310a9de..288dccda66f3b 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -18,9 +18,12 @@
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer,
- CLIPVisionModelWithProjection)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -60,8 +63,7 @@
"""
-class StableUnCLIPImg2ImgPipeline(DiffusionPipeline,
- TextualInversionLoaderMixin):
+class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
"""
Pipeline for text-guided image to image generation using stable unCLIP.
@@ -108,20 +110,21 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline,
vae: AutoencoderKL
def __init__(
- self,
- # image encoding components
- feature_extractor: CLIPImageProcessor,
- image_encoder: CLIPVisionModelWithProjection,
- # image noising components
- image_normalizer: StableUnCLIPImageNormalizer,
- image_noising_scheduler: KarrasDiffusionSchedulers,
- # regular denoising components
- tokenizer: CLIPTokenizer,
- text_encoder: CLIPTextModel,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- # vae
- vae: AutoencoderKL, ):
+ self,
+ # image encoding components
+ feature_extractor: CLIPImageProcessor,
+ image_encoder: CLIPVisionModelWithProjection,
+ # image noising components
+ image_normalizer: StableUnCLIPImageNormalizer,
+ image_noising_scheduler: KarrasDiffusionSchedulers,
+ # regular denoising components
+ tokenizer: CLIPTokenizer,
+ text_encoder: CLIPTextModel,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ # vae
+ vae: AutoencoderKL,
+ ):
super().__init__()
self.register_modules(
@@ -133,19 +136,21 @@ def __init__(
text_encoder=text_encoder,
unet=unet,
scheduler=scheduler,
- vae=vae, )
+ vae=vae,
+ )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -185,29 +190,31 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
@@ -215,8 +222,7 @@ def _encode_prompt(
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -226,21 +232,22 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
            # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
@@ -248,48 +255,46 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def _encode_image(
- self,
- image,
- batch_size,
- num_images_per_prompt,
- do_classifier_free_guidance,
- noise_level,
- generator,
- image_embeds, ):
+ self,
+ image,
+ batch_size,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ noise_level,
+ generator,
+ image_embeds,
+ ):
dtype = self.image_encoder.dtype
if isinstance(image, PIL.Image.Image):
@@ -306,8 +311,7 @@ def _encode_image(
if image_embeds is None:
if not isinstance(image, paddle.Tensor):
- image = self.feature_extractor(
- images=image, return_tensors="pd").pixel_values
+ image = self.feature_extractor(images=image, return_tensors="pd").pixel_values
image = image.cast(dtype)
image_embeds = self.image_encoder(image).image_embeds
@@ -315,7 +319,8 @@ def _encode_image(
image_embeds = self.noise_image_embeddings(
image_embeds=image_embeds,
noise_level=noise_level,
- generator=generator, )
+ generator=generator,
+ )
# duplicate image embeddings for each generation per prompt, using mps friendly method
image_embeds = image_embeds.unsqueeze(1)
@@ -350,42 +355,40 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- image,
- height,
- width,
- callback_steps,
- noise_level,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None,
- image_embeds=None, ):
+ self,
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ noise_level,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ image_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
@@ -397,11 +400,8 @@ def check_inputs(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- if prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -412,17 +412,18 @@ def check_inputs(
if type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
- if (noise_level < 0 or noise_level >=
- self.image_noising_scheduler.config.num_train_timesteps):
+ if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
raise ValueError(
f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive."
)
@@ -438,28 +439,33 @@ def check_inputs(
)
if image is not None:
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
- f" {type(image)}")
+ f" {type(image)}"
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -475,11 +481,12 @@ def prepare_latents(
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings
def noise_image_embeddings(
- self,
- image_embeds: paddle.Tensor,
- noise_level: int,
- noise: Optional[paddle.Tensor]=None,
- generator: Optional[paddle.Generator]=None, ):
+ self,
+ image_embeds: paddle.Tensor,
+ noise_level: int,
+ noise: Optional[paddle.Tensor] = None,
+ generator: Optional[paddle.Generator] = None,
+ ):
"""
Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
`noise_level` increases the variance in the final un-noised images.
@@ -493,18 +500,12 @@ def noise_image_embeddings(
The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
"""
if noise is None:
- noise = randn_tensor(
- image_embeds.shape,
- generator=generator,
- dtype=image_embeds.dtype)
- noise_level = paddle.to_tensor([noise_level] *
- image_embeds.shape[0]).reshape(
- [image_embeds.shape[0]])
+ noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype)
+ noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]).reshape([image_embeds.shape[0]])
image_embeds = self.image_normalizer.scale(image_embeds)
- image_embeds = self.image_noising_scheduler.add_noise(
- image_embeds, timesteps=noise_level, noise=noise)
+ image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise)
image_embeds = self.image_normalizer.unscale(image_embeds)
@@ -512,7 +513,8 @@ def noise_image_embeddings(
timesteps=noise_level,
embedding_dim=image_embeds.shape[-1],
flip_sin_to_cos=True,
- downscale_freq_shift=0, )
+ downscale_freq_shift=0,
+ )
# `get_timestep_embeddings` does not contain any weights and will always return f32 tensors,
# but we might actually be running in fp16. so we need to cast here.
@@ -525,27 +527,28 @@ def noise_image_embeddings(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- image: Union[paddle.Tensor, PIL.Image.Image]=None,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=20,
- guidance_scale: float=10,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[paddle.Generator]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- noise_level: int=0,
- image_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ image: Union[paddle.Tensor, PIL.Image.Image] = None,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 20,
+ guidance_scale: float = 10,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[paddle.Generator] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ noise_level: int = 0,
+ image_embeds: Optional[paddle.Tensor] = None,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -641,7 +644,8 @@ def __call__(
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- image_embeds=image_embeds, )
+ image_embeds=image_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -665,7 +669,8 @@ def __call__(
do_classifier_free_guidance=do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
        # 4. Encode input image
noise_level = paddle.to_tensor(noise_level)
@@ -676,7 +681,8 @@ def __call__(
do_classifier_free_guidance=do_classifier_free_guidance,
noise_level=noise_level,
generator=generator,
- image_embeds=image_embeds, )
+ image_embeds=image_embeds,
+ )
# 5. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -691,17 +697,16 @@ def __call__(
width=width,
dtype=prompt_embeds.dtype,
generator=generator,
- latents=latents, )
+ latents=latents,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8. Denoising loop
for i, t in enumerate(self.progress_bar(timesteps)):
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
@@ -709,17 +714,16 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
class_labels=image_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -732,6 +736,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
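
The img2img variant above skips the prior entirely: a CLIP vision encoder embeds the input image, `noise_image_embeddings` perturbs that embedding by `noise_level`, and generation is otherwise the same UNet loop. A hedged usage sketch, again with a placeholder checkpoint id and assuming the pipeline class is exported from the ppdiffusers top level:

import paddle
import PIL.Image
from ppdiffusers import StableUnCLIPImg2ImgPipeline  # assumed export, mirroring diffusers

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained("your-org/stable-unclip-img2img-checkpoint")  # placeholder id

init_image = PIL.Image.open("input.png").convert("RGB")
paddle.seed(0)
image = pipe(
    image=init_image,
    prompt="turn this into a watercolor painting",
    num_inference_steps=20,
    guidance_scale=10.0,
    noise_level=0,  # higher values keep less of the source image's identity
).images[0]
image.save("variation.png")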
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py
index 8fa2d0f3796b1..28920a1c6de42 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py
@@ -16,8 +16,11 @@
import numpy as np
import paddle
import paddle.nn.functional as F
-from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig,
- CLIPVisionModel)
+from paddlenlp.transformers import (
+ CLIPPretrainedModel,
+ CLIPVisionConfig,
+ CLIPVisionModel,
+)
from ...utils import logging
@@ -27,8 +30,7 @@
def cosine_distance(image_embeds, text_embeds):
normalized_image_embeds = F.normalize(image_embeds)
normalized_text_embeds = F.normalize(text_embeds)
- return paddle.matmul(
- normalized_image_embeds, normalized_text_embeds, transpose_y=True)
+ return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True)
class StableDiffusionSafetyChecker(CLIPPretrainedModel):
@@ -40,12 +42,11 @@ def __init__(self, config: CLIPVisionConfig):
self.clip = CLIPVisionModel(config)
self.vision_projection = paddle.create_parameter(
(config.hidden_size, config.projection_dim),
- dtype=paddle.get_default_dtype(), )
+ dtype=paddle.get_default_dtype(),
+ )
- self.register_buffer("concept_embeds",
- paddle.ones([17, config.projection_dim]))
- self.register_buffer("special_care_embeds",
- paddle.ones([3, config.projection_dim]))
+ self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim]))
+ self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim]))
self.register_buffer("concept_embeds_weights", paddle.ones([17]))
self.register_buffer("special_care_embeds_weights", paddle.ones([3]))
@@ -56,11 +57,8 @@ def forward(self, clip_input, images):
image_embeds = paddle.matmul(pooled_output, self.vision_projection)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
- special_cos_dist = (
- cosine_distance(image_embeds, self.special_care_embeds)
- .astype("float32").numpy())
- cos_dist = (cosine_distance(
- image_embeds, self.concept_embeds).astype("float32").numpy())
+ special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy()
+ cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy()
result = []
batch_size = image_embeds.shape[0]
@@ -78,22 +76,16 @@ def forward(self, clip_input, images):
for concept_idx in range(len(special_cos_dist[0])):
concept_cos = special_cos_dist[i][concept_idx]
- concept_threshold = self.special_care_embeds_weights[
- concept_idx].item()
- result_img["special_scores"][concept_idx] = round(
- concept_cos - concept_threshold + adjustment, 3)
+ concept_threshold = self.special_care_embeds_weights[concept_idx].item()
+ result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
if result_img["special_scores"][concept_idx] > 0:
- result_img["special_care"].append({
- concept_idx, result_img["special_scores"][concept_idx]
- })
+ result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
adjustment = 0.01
for concept_idx in range(len(cos_dist[0])):
concept_cos = cos_dist[i][concept_idx]
- concept_threshold = self.concept_embeds_weights[
- concept_idx].item()
- result_img["concept_scores"][concept_idx] = round(
- concept_cos - concept_threshold + adjustment, 3)
+ concept_threshold = self.concept_embeds_weights[concept_idx].item()
+ result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
if result_img["concept_scores"][concept_idx] > 0:
result_img["bad_concepts"].append(concept_idx)
@@ -111,34 +103,29 @@ def forward(self, clip_input, images):
if any(has_nsfw_concepts):
logger.warning(
"Potential NSFW content was detected in one or more images. A black image will be returned instead."
- " Try again with a different prompt and/or seed.")
+ " Try again with a different prompt and/or seed."
+ )
return images, has_nsfw_concepts
- def forward_fastdeploy(self,
- clip_input: paddle.Tensor,
- images: paddle.Tensor):
+ def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor):
pooled_output = self.clip(clip_input)[1] # pooled_output
image_embeds = paddle.matmul(pooled_output, self.vision_projection)
- special_cos_dist = cosine_distance(image_embeds,
- self.special_care_embeds)
+ special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
cos_dist = cosine_distance(image_embeds, self.concept_embeds)
# increase this value to create a stronger `nsfw` filter
# at the cost of increasing the possibility of filtering benign images
adjustment = 0.0
- special_scores = (
- special_cos_dist - self.special_care_embeds_weights + adjustment)
+ special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
# special_scores = special_scores.round(decimals=3)
special_care = paddle.any(special_scores > 0, axis=1)
special_adjustment = special_care * 0.01
- special_adjustment = special_adjustment.unsqueeze(1).expand(
- [-1, cos_dist.shape[1]])
+ special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]])
- concept_scores = (cos_dist - self.concept_embeds_weights
- ) + special_adjustment
+ concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
# concept_scores = concept_scores.round(decimals=3)
has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1)
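
Both `forward` and `forward_fastdeploy` above implement the same decision rule: cosine-similarity scores against a fixed set of concept embeddings are shifted by per-concept thresholds (plus a small `adjustment` when any "special care" concept fires), and an image is flagged if any shifted score is positive. A vectorized, runnable sketch of that rule with random stand-in embeddings and thresholds (the real ones are learned buffers loaded with the checkpoint):

import paddle
import paddle.nn.functional as F

def cosine_distance(a, b):
    return paddle.matmul(F.normalize(a), F.normalize(b), transpose_y=True)

paddle.seed(0)
image_embeds = paddle.randn([4, 768])       # one row per image in the batch
special_embeds = paddle.randn([3, 768])     # "special care" concepts
concept_embeds = paddle.randn([17, 768])    # regular NSFW concepts
special_thresholds = paddle.full([3], 0.5)  # toy thresholds
concept_thresholds = paddle.full([17], 0.5)

adjustment = 0.0  # raise to make the filter stricter, at the cost of more false positives
special_scores = cosine_distance(image_embeds, special_embeds) - special_thresholds + adjustment
special_care = paddle.any(special_scores > 0, axis=1)
# images that trip a special-care concept get a stricter threshold on everything else
special_adjustment = special_care.cast("float32") * 0.01
special_adjustment = special_adjustment.unsqueeze(1).expand([-1, concept_embeds.shape[0]])

concept_scores = cosine_distance(image_embeds, concept_embeds) - concept_thresholds + special_adjustment
has_nsfw = paddle.any(concept_scores > 0, axis=1)
print(has_nsfw)  # one boolean flag per image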
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py
index dd502e817aac3..8792792dd7fc4 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py
@@ -32,34 +32,38 @@ class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- embedding_dim: int=768, ):
+ self,
+ embedding_dim: int = 768,
+ ):
super().__init__()
self.mean = self.create_parameter(
(1, embedding_dim),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(0.0), )
+ default_initializer=nn.initializer.Constant(0.0),
+ )
self.std = self.create_parameter(
(1, embedding_dim),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(1.0), )
+ default_initializer=nn.initializer.Constant(1.0),
+ )
def to(
- self,
- device: Optional[str]=None,
- dtype: Optional[paddle.dtype]=None, ):
+ self,
+ device: Optional[str] = None,
+ dtype: Optional[paddle.dtype] = None,
+ ):
if dtype is not None:
self.mean = self.create_parameter(
self.mean.shape,
dtype=dtype,
- default_initializer=paddle.nn.initializer.Assign(
- self.mean.numpy()), )
+ default_initializer=paddle.nn.initializer.Assign(self.mean.numpy()),
+ )
self.std = self.create_parameter(
self.std.shape,
dtype=dtype,
- default_initializer=paddle.nn.initializer.Assign(self.std.numpy(
- )), )
+ default_initializer=paddle.nn.initializer.Assign(self.std.numpy()),
+ )
if device is not None:
self.mean._to(device)
self.std._to(device)
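
`StableUnCLIPImageNormalizer` holds only a per-dimension `mean` and `std` of the CLIP image-embedding distribution; scaling standardizes an embedding before noise is added and unscaling restores the original statistics afterwards. A minimal sketch of that scale/unscale pair, assuming the usual (x - mean) / std convention; the parameters here are toy values, not the learned buffers:

import paddle

class ImageNormalizerSketch:
    def __init__(self, embedding_dim=768):
        self.mean = paddle.zeros([1, embedding_dim])
        self.std = paddle.ones([1, embedding_dim])

    def scale(self, embeds):
        # standardize: zero mean, unit variance per embedding dimension
        return (embeds - self.mean) / self.std

    def unscale(self, embeds):
        # undo the standardization
        return embeds * self.std + self.mean

normalizer = ImageNormalizerSketch()
x = paddle.randn([2, 768])
round_trip = normalizer.unscale(normalizer.scale(x))
print(bool(paddle.allclose(round_trip, x)))  # True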
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index 82b88765d936c..b2c27a601a306 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -20,8 +20,7 @@
import numpy as np
import paddle
from packaging import version
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...models import AutoencoderKL, UNet2DConditionModel
@@ -67,41 +66,38 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: SafeStableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: SafeStableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__()
safety_concept: Optional[str] = (
"an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity,"
" bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child"
- " abuse, brutality, cruelty")
+ " abuse, brutality, cruelty"
+ )
- if (hasattr(scheduler.config, "steps_offset") and
- scheduler.config.steps_offset != 1):
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
- " file")
- deprecate(
- "steps_offset!=1",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["steps_offset"] = 1
scheduler._internal_dict = FrozenDict(new_config)
- if (hasattr(scheduler.config, "clip_sample") and
- scheduler.config.clip_sample is True):
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
deprecation_message = (
f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
" `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -109,11 +105,7 @@ def __init__(
" future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
)
- deprecate(
- "clip_sample not set",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(scheduler.config)
new_config["clip_sample"] = False
scheduler._internal_dict = FrozenDict(new_config)
@@ -134,12 +126,10 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
- is_unet_version_less_0_9_0 = hasattr(
- unet.config, "_ppdiffusers_version") and version.parse(
- version.parse(unet.config._ppdiffusers_version)
- .base_version) < version.parse("0.9.0.dev0")
- is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and
- unet.config.sample_size < 64)
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
+ version.parse(unet.config._ppdiffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
deprecation_message = (
"The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -150,12 +140,9 @@ def __init__(
" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
" in the config might lead to incorrect results in future versions. If you have downloaded this"
" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
- " the `unet/config.json` file")
- deprecate(
- "sample_size<64",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
@@ -167,9 +154,10 @@ def __init__(
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
- feature_extractor=feature_extractor, )
+ feature_extractor=feature_extractor,
+ )
self._safety_text_concept = safety_concept
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
@property
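
The `2 ** (len(block_out_channels) - 1)` expression above (also used in the unCLIP pipelines) simply counts how many times the VAE halves the spatial resolution: each block after the first downsamples by 2, so a four-block config gives a scale factor of 8 and a 512x512 image maps to 64x64 latents. A quick arithmetic check (the channel list is an illustrative SD-style config, not read from any checkpoint):

block_out_channels = [128, 256, 512, 512]  # illustrative SD-style VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)
print(vae_scale_factor)                                    # 8
print(512 // vae_scale_factor, 512 // vae_scale_factor)    # 64 64 latent height/width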
@@ -194,12 +182,13 @@ def safety_concept(self, concept):
self._safety_text_concept = concept
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt,
- enable_safety_guidance, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ enable_safety_guidance,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -221,35 +210,35 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids, untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = prompt_embeds[0]
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
@@ -259,14 +248,16 @@ def _encode_prompt(
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -276,25 +267,24 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
negative_prompt_embeds = negative_prompt_embeds[0]
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# Encode the safety concept text
if enable_safety_guidance:
@@ -303,40 +293,35 @@ def _encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
- safety_embeddings = self.text_encoder(
- safety_concept_input.input_ids)[0]
+ return_tensors="pd",
+ )
+ safety_embeddings = self.text_encoder(safety_concept_input.input_ids)[0]
# duplicate safety embeddings for each generation per prompt, using mps friendly method
seq_len = safety_embeddings.shape[1]
- safety_embeddings = safety_embeddings.tile(
- [batch_size, num_images_per_prompt, 1])
- safety_embeddings = safety_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ safety_embeddings = safety_embeddings.tile([batch_size, num_images_per_prompt, 1])
+ safety_embeddings = safety_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance + sld, we need to do three forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing three forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds, safety_embeddings])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds, safety_embeddings])
else:
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def run_safety_checker(self, image, dtype, enable_safety_guidance):
if self.safety_checker is not None:
images = image.copy()
- safety_checker_input = self.feature_extractor(
- self.numpy_to_pil(image), return_tensors="pd")
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd")
image, has_nsfw_concept = self.safety_checker(
- images=image,
- clip_input=safety_checker_input.pixel_values.cast(dtype))
+ images=image, clip_input=safety_checker_input.pixel_values.cast(dtype)
+ )
flagged_images = np.zeros((2, *image.shape[1:]))
if any(has_nsfw_concept):
logger.warning(
@@ -369,54 +354,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -429,23 +410,26 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -460,49 +444,48 @@ def prepare_latents(
return latents
def perform_safety_guidance(
- self,
- enable_safety_guidance,
- safety_momentum,
- noise_guidance,
- noise_pred_out,
- i,
- sld_guidance_scale,
- sld_warmup_steps,
- sld_threshold,
- sld_momentum_scale,
- sld_mom_beta, ):
+ self,
+ enable_safety_guidance,
+ safety_momentum,
+ noise_guidance,
+ noise_pred_out,
+ i,
+ sld_guidance_scale,
+ sld_warmup_steps,
+ sld_threshold,
+ sld_momentum_scale,
+ sld_mom_beta,
+ ):
# Perform SLD guidance
if enable_safety_guidance:
if safety_momentum is None:
safety_momentum = paddle.zeros_like(noise_guidance)
- noise_pred_text, noise_pred_uncond = noise_pred_out[
- 0], noise_pred_out[1]
+ noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1]
noise_pred_safety_concept = noise_pred_out[2]
# Equation 6
scale = paddle.clip(
- paddle.abs((noise_pred_text - noise_pred_safety_concept)) *
- sld_guidance_scale,
- max=1.0, )
+ paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale,
+ max=1.0,
+ )
# Equation 6
safety_concept_scale = paddle.where(
(noise_pred_text - noise_pred_safety_concept) >= sld_threshold,
paddle.zeros_like(scale),
- scale, )
+ scale,
+ )
# Equation 4
noise_guidance_safety = paddle.multiply(
- (noise_pred_safety_concept - noise_pred_uncond),
- safety_concept_scale)
+ (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale
+ )
# Equation 7
- noise_guidance_safety = (
- noise_guidance_safety + sld_momentum_scale * safety_momentum)
+ noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
# Equation 8
- safety_momentum = (sld_mom_beta * safety_momentum +
- (1 - sld_mom_beta) * noise_guidance_safety)
+ safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
if i >= sld_warmup_steps: # Warmup
# Equation 3
@@ -511,27 +494,27 @@ def perform_safety_guidance(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- sld_guidance_scale: Optional[float]=1000,
- sld_warmup_steps: Optional[int]=10,
- sld_threshold: Optional[float]=0.01,
- sld_momentum_scale: Optional[float]=0.3,
- sld_mom_beta: Optional[float]=0.4, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ sld_guidance_scale: Optional[float] = 1000,
+ sld_warmup_steps: Optional[int] = 10,
+ sld_threshold: Optional[float] = 0.01,
+ sld_momentum_scale: Optional[float] = 0.3,
+ sld_mom_beta: Optional[float] = 0.4,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -620,8 +603,7 @@ def __call__(
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
- enable_safety_guidance = (sld_guidance_scale > 1.0 and
- do_classifier_free_guidance)
+ enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance
if not enable_safety_guidance:
warnings.warn("Safety checker disabled!")
@@ -631,7 +613,8 @@ def __call__(
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
- enable_safety_guidance, )
+ enable_safety_guidance,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -646,36 +629,35 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
safety_momentum = None
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat(
- [latents] * (3 if enable_safety_guidance else 2)) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = (
+ paddle.concat([latents] * (3 if enable_safety_guidance else 2))
+ if do_classifier_free_guidance
+ else latents
+ )
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(
- latent_model_input, t,
- encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# perform guidance
if do_classifier_free_guidance:
- noise_pred_out = noise_pred.chunk(
- (3 if enable_safety_guidance else 2))
+ noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2))
noise_pred_uncond, noise_pred_text = (
noise_pred_out[0],
- noise_pred_out[1], )
+ noise_pred_out[1],
+ )
# default classifier free guidance
noise_guidance = noise_pred_text - noise_pred_uncond
@@ -688,32 +670,28 @@ def __call__(
# Equation 6
scale = paddle.clip(
- paddle.abs(
- (noise_pred_text - noise_pred_safety_concept)) *
- sld_guidance_scale,
- max=1.0, )
+ paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale,
+ max=1.0,
+ )
# Equation 6
safety_concept_scale = paddle.where(
- (noise_pred_text - noise_pred_safety_concept) >=
- sld_threshold,
+ (noise_pred_text - noise_pred_safety_concept) >= sld_threshold,
paddle.zeros_like(scale),
- scale, )
+ scale,
+ )
# Equation 4
noise_guidance_safety = paddle.multiply(
(noise_pred_safety_concept - noise_pred_uncond),
- safety_concept_scale, )
+ safety_concept_scale,
+ )
# Equation 7
- noise_guidance_safety = (
- noise_guidance_safety + sld_momentum_scale *
- safety_momentum)
+ noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
# Equation 8
- safety_momentum = (
- sld_mom_beta * safety_momentum +
- (1 - sld_mom_beta) * noise_guidance_safety)
+ safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
if i >= sld_warmup_steps: # Warmup
# Equation 3
@@ -722,13 +700,10 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * noise_guidance
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -738,7 +713,8 @@ def __call__(
# 9. Run safety checker
image, has_nsfw_concept, flagged_images = self.run_safety_checker(
- image, prompt_embeds.dtype, enable_safety_guidance)
+ image, prompt_embeds.dtype, enable_safety_guidance
+ )
# 10. Convert to PIL
if output_type == "pil":
@@ -751,11 +727,12 @@ def __call__(
image,
has_nsfw_concept,
self._safety_text_concept if enable_safety_guidance else None,
- flagged_images, )
+ flagged_images,
+ )
return StableDiffusionSafePipelineOutput(
images=image,
nsfw_content_detected=has_nsfw_concept,
- applied_safety_concept=self._safety_text_concept
- if enable_safety_guidance else None,
- unsafe_images=flagged_images, )
+ applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None,
+ unsafe_images=flagged_images,
+ )
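
For reference, the SLD guidance terms reformatted in the perform_safety_guidance and __call__ hunks above can be read in isolation. A minimal Paddle sketch, assuming three noise predictions of identical shape and illustrative hyperparameter values (the real pipeline derives them from its sld_* arguments):

    import paddle

    # Illustrative shapes; in the pipeline these come from the UNet's three-way
    # (uncond / text / safety-concept) noise prediction.
    noise_pred_uncond = paddle.randn([1, 4, 8, 8])
    noise_pred_text = paddle.randn([1, 4, 8, 8])
    noise_pred_safety_concept = paddle.randn([1, 4, 8, 8])

    sld_guidance_scale, sld_threshold = 1000.0, 0.01
    sld_momentum_scale, sld_mom_beta = 0.3, 0.4
    safety_momentum = paddle.zeros_like(noise_pred_text)

    # Equation 6: element-wise scale, capped at 1
    scale = paddle.clip(paddle.abs(noise_pred_text - noise_pred_safety_concept) * sld_guidance_scale, max=1.0)
    # Equation 6: zero the scale where (text - safety) already exceeds the threshold
    safety_concept_scale = paddle.where(
        (noise_pred_text - noise_pred_safety_concept) >= sld_threshold,
        paddle.zeros_like(scale),
        scale,
    )
    # Equation 4: safety guidance direction
    noise_guidance_safety = paddle.multiply(noise_pred_safety_concept - noise_pred_uncond, safety_concept_scale)
    # Equation 7: add momentum; Equation 8: update the momentum buffer
    noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
    safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
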
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py
index ceae2727162f5..43772eac7c2cb 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py
@@ -15,8 +15,11 @@
import paddle
import paddle.nn.functional as F
-from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig,
- CLIPVisionModel)
+from paddlenlp.transformers import (
+ CLIPPretrainedModel,
+ CLIPVisionConfig,
+ CLIPVisionModel,
+)
from ...utils import logging
@@ -26,8 +29,7 @@
def cosine_distance(image_embeds, text_embeds):
normalized_image_embeds = F.normalize(image_embeds)
normalized_text_embeds = F.normalize(text_embeds)
- return paddle.matmul(
- normalized_image_embeds, normalized_text_embeds, transpose_y=True)
+ return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True)
class SafeStableDiffusionSafetyChecker(CLIPPretrainedModel):
@@ -39,12 +41,11 @@ def __init__(self, config: CLIPVisionConfig):
self.vision_projection = paddle.create_parameter(
(config.hidden_size, config.projection_dim),
- dtype=paddle.get_default_dtype(), )
+ dtype=paddle.get_default_dtype(),
+ )
- self.register_buffer("concept_embeds",
- paddle.ones([17, config.projection_dim]))
- self.register_buffer("special_care_embeds",
- paddle.ones([3, config.projection_dim]))
+ self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim]))
+ self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim]))
self.register_buffer("concept_embeds_weights", paddle.ones([17]))
self.register_buffer("special_care_embeds_weights", paddle.ones([3]))
@@ -55,11 +56,8 @@ def forward(self, clip_input, images):
image_embeds = paddle.matmul(pooled_output, self.vision_projection)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
- special_cos_dist = (
- cosine_distance(image_embeds, self.special_care_embeds)
- .astype("float32").numpy())
- cos_dist = (cosine_distance(
- image_embeds, self.concept_embeds).astype("float32").numpy())
+ special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy()
+ cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy()
result = []
batch_size = image_embeds.shape[0]
@@ -77,22 +75,16 @@ def forward(self, clip_input, images):
for concept_idx in range(len(special_cos_dist[0])):
concept_cos = special_cos_dist[i][concept_idx]
- concept_threshold = self.special_care_embeds_weights[
- concept_idx].item()
- result_img["special_scores"][concept_idx] = round(
- concept_cos - concept_threshold + adjustment, 3)
+ concept_threshold = self.special_care_embeds_weights[concept_idx].item()
+ result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
if result_img["special_scores"][concept_idx] > 0:
- result_img["special_care"].append({
- concept_idx, result_img["special_scores"][concept_idx]
- })
+ result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
adjustment = 0.01
for concept_idx in range(len(cos_dist[0])):
concept_cos = cos_dist[i][concept_idx]
- concept_threshold = self.concept_embeds_weights[
- concept_idx].item()
- result_img["concept_scores"][concept_idx] = round(
- concept_cos - concept_threshold + adjustment, 3)
+ concept_threshold = self.concept_embeds_weights[concept_idx].item()
+ result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
if result_img["concept_scores"][concept_idx] > 0:
result_img["bad_concepts"].append(concept_idx)
@@ -102,30 +94,24 @@ def forward(self, clip_input, images):
return images, has_nsfw_concepts
- def forward_fastdeploy(self,
- clip_input: paddle.Tensor,
- images: paddle.Tensor):
+ def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor):
pooled_output = self.clip(clip_input)[1] # pooled_output
image_embeds = paddle.matmul(pooled_output, self.vision_projection)
- special_cos_dist = cosine_distance(image_embeds,
- self.special_care_embeds)
+ special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
cos_dist = cosine_distance(image_embeds, self.concept_embeds)
# increase this value to create a stronger `nsfw` filter
# at the cost of increasing the possibility of filtering benign images
adjustment = 0.0
- special_scores = (
- special_cos_dist - self.special_care_embeds_weights + adjustment)
+ special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
# special_scores = special_scores.round(decimals=3)
special_care = paddle.any(special_scores > 0, axis=1)
special_adjustment = special_care * 0.01
- special_adjustment = special_adjustment.unsqueeze(1).expand(
- [-1, cos_dist.shape[1]])
+ special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]])
- concept_scores = (cos_dist - self.concept_embeds_weights
- ) + special_adjustment
+ concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
# concept_scores = concept_scores.round(decimals=3)
has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1)
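
The forward_fastdeploy branch above reduces to a simple thresholding of cosine similarities. A minimal sketch with illustrative shapes and a hypothetical 0.5 threshold; in the checker itself the weights are pretrained buffers:

    import paddle

    # Illustrative inputs: 2 images scored against 17 concepts.
    cos_dist = paddle.rand([2, 17])                   # cosine similarity per image/concept
    concept_embeds_weights = paddle.full([17], 0.5)   # per-concept thresholds (buffers in the checker)
    special_adjustment = paddle.zeros([2, 17])        # extra 0.01 margin when a "special care" concept fired

    concept_scores = (cos_dist - concept_embeds_weights) + special_adjustment
    has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1)  # one flag per image
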
diff --git a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
index acd0aad93d9ee..d06ace3696225 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
@@ -48,14 +48,14 @@ def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- num_inference_steps: int=50,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- **kwargs, ) -> Union[Tuple, ImagePipelineOutput]:
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 50,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[Tuple, ImagePipelineOutput]:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -82,8 +82,7 @@ def __call__(
model = self.unet
# sample x_0 ~ N(0, sigma_0^2 * I)
- sample = (randn_tensor(
- shape, generator=generator) * self.scheduler.init_noise_sigma)
+ sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma
self.scheduler.set_timesteps(num_inference_steps)
@@ -94,31 +93,28 @@ def __call__(
# 1. Select temporarily increased noise level sigma_hat
# 2. Add new noise to move from sample_i to sample_hat
- sample_hat, sigma_hat = self.scheduler.add_noise_to_input(
- sample, sigma, generator=generator)
+ sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator)
# 3. Predict the noise residual given the noise magnitude `sigma_hat`
# The model inputs and output are adjusted by following eq. (213) in [1].
- model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2,
- sigma_hat / 2).sample
+ model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample
# 4. Evaluate dx/dt at sigma_hat
# 5. Take Euler step from sigma to sigma_prev
- step_output = self.scheduler.step(model_output, sigma_hat,
- sigma_prev, sample_hat)
+ step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat)
if sigma_prev != 0:
# 6. Apply 2nd order correction
# The model inputs and output are adjusted by following eq. (213) in [1].
- model_output = (sigma_prev / 2) * model(
- (step_output.prev_sample + 1) / 2, sigma_prev / 2).sample
+ model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample
step_output = self.scheduler.step_correct(
model_output,
sigma_hat,
sigma_prev,
sample_hat,
step_output.prev_sample,
- step_output["derivative"], )
+ step_output["derivative"],
+ )
sample = step_output.prev_sample
sample = (sample / 2 + 0.5).clip(0, 1)
@@ -127,6 +123,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py
index 2ab0f9892a8b6..649c39a7ecdad 100644
--- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py
@@ -19,8 +19,12 @@
import numpy as np
import paddle
-from ...utils import (BaseOutput, OptionalDependencyNotAvailable,
- is_paddle_available, is_paddlenlp_available)
+from ...utils import (
+ BaseOutput,
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
@dataclass
diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
index de047ee797c85..8ecc3b2759f33 100644
--- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
+++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -47,8 +47,7 @@
"""
-def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5],
- std=[0.5, 0.5, 0.5]) -> List[np.ndarray]:
+def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) -> List[np.ndarray]:
# This code is copied from https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78
# reshape to ncfhw
mean = paddle.to_tensor(mean).reshape((1, -1, 1, 1, 1))
@@ -85,29 +84,32 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet3DConditionModel,
- scheduler: KarrasDiffusionSchedulers, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet3DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ ):
super().__init__()
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
- scheduler=scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ scheduler=scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
"""
Encodes the prompt into text encoder hidden states.
@@ -145,32 +147,30 @@ def _encode_prompt(
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
)
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
- prompt_embeds = self.text_encoder(
- text_input_ids, attention_mask=attention_mask)
+ prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- (bs_embed * num_images_per_prompt, seq_len, -1))
+ prompt_embeds = prompt_embeds.reshape((bs_embed * num_images_per_prompt, seq_len, -1))
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
uncond_tokens: List[str]
@@ -191,48 +191,41 @@ def _encode_prompt(
            # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens,
- self.tokenizer)
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ return_tensors="pd",
+ )
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
- negative_prompt_embeds = self.text_encoder(
- uncond_input.input_ids, attention_mask=attention_mask)
+ negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.cast(
- self.text_encoder.dtype)
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- (batch_size * num_images_per_prompt, seq_len, -1))
+ negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1))
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
batch_size, channels, num_frames, height, width = latents.shape
- latents = latents.transpose([0, 2, 1, 3, 4]).reshape(
- (batch_size * num_frames, channels, height, width))
+ latents = latents.transpose([0, 2, 1, 3, 4]).reshape((batch_size * num_frames, channels, height, width))
image = self.vae.decode(latents).sample
- video = (image[None, :]
- .reshape((batch_size, num_frames, -1) + tuple(image.shape[2:]))
- .transpose([0, 2, 1, 3, 4]))
+ video = (
+ image[None, :].reshape((batch_size, num_frames, -1) + tuple(image.shape[2:])).transpose([0, 2, 1, 3, 4])
+ )
video = video.cast("float32")
return video
@@ -241,33 +234,33 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
- if (callback_steps is None or callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+ if (
+ callback_steps is None
+ or callback_steps is not None
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
)
@@ -279,11 +272,8 @@ def check_inputs(
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
@@ -295,21 +285,23 @@ def check_inputs(
)
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- num_frames,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
num_frames,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
@@ -323,25 +315,25 @@ def prepare_latents(
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_frames: int=16,
- num_inference_steps: int=50,
- guidance_scale: float=9.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="np",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: int=1,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None, ):
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_frames: int = 16,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 9.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "np",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -423,7 +415,8 @@ def __call__(
callback_steps,
negative_prompt,
prompt_embeds,
- negative_prompt_embeds, )
+ negative_prompt_embeds,
+ )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -442,7 +435,8 @@ def __call__(
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -459,48 +453,38 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(
- chunks=2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# reshape latents
bsz, channel, frames, width, height = latents.shape
- latents = latents.transpose([0, 2, 1, 3, 4]).reshape(
- (bsz * frames, channel, width, height))
- noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape(
- (bsz * frames, channel, width, height))
+ latents = latents.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height))
+ noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height))
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
- latents = (latents[None, :].reshape(
- (bsz, frames, channel, width, height))
- .transpose([0, 2, 1, 3, 4]))
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = latents[None, :].reshape((bsz, frames, channel, width, height)).transpose([0, 2, 1, 3, 4])
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -510,5 +494,5 @@ def __call__(
else:
video = tensor2vid(video_tensor)
if not return_dict:
- return (video, )
+ return (video,)
return TextToVideoSDPipelineOutput(frames=video)
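
The fold/unfold reshapes in the denoising loop above are the part most easily misread: video latents are stored as (batch, channels, frames, h, w), while the scheduler step runs on (batch*frames, channels, h, w). A minimal sketch with small illustrative dimensions; the scheduler step itself is elided:

    import paddle

    bsz, channel, frames, height, width = 1, 4, 16, 32, 32
    latents = paddle.randn([bsz, channel, frames, height, width])

    # fold the frame axis into the batch axis: (b, c, f, h, w) -> (b*f, c, h, w)
    folded = latents.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, height, width))
    # ... scheduler.step(...) would operate on `folded` here ...
    # unfold back: (b*f, c, h, w) -> (b, c, f, h, w)
    unfolded = folded[None, :].reshape((bsz, frames, channel, height, width)).transpose([0, 2, 1, 3, 4])
    assert unfolded.shape == latents.shape
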
diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
index 5f9ccbe235000..106382dceb106 100644
--- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
+++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -20,31 +20,26 @@
import paddle
import paddle.nn.functional as F
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel
from ppdiffusers.pipelines.stable_diffusion import (
- StableDiffusionPipeline, StableDiffusionSafetyChecker)
+ StableDiffusionPipeline,
+ StableDiffusionSafetyChecker,
+)
from ppdiffusers.schedulers import KarrasDiffusionSchedulers
from ppdiffusers.utils import BaseOutput
def rearrange_0(tensor, f):
F, C, H, W = tensor.shape
- tensor = paddle.transpose(
- x=paddle.reshape(
- x=tensor, shape=(F // f, f, C, H, W)),
- perm=(0, 2, 1, 3, 4))
+ tensor = paddle.transpose(x=paddle.reshape(x=tensor, shape=(F // f, f, C, H, W)), perm=(0, 2, 1, 3, 4))
return tensor
def rearrange_1(tensor):
B, C, F, H, W = tensor.shape
- return paddle.reshape(
- x=paddle.transpose(
- x=tensor, perm=(0, 2, 1, 3, 4)),
- shape=(B * F, C, H, W))
+ return paddle.reshape(x=paddle.transpose(x=tensor, perm=(0, 2, 1, 3, 4)), shape=(B * F, C, H, W))
def rearrange_3(tensor, f):
@@ -70,21 +65,15 @@ class CrossFrameAttnProcessor:
def __init__(self, batch_size=2):
self.batch_size = batch_size
- def __call__(self,
- attn,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None):
+ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
is_cross_attention = encoder_hidden_states is not None
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(
- encoder_hidden_states)
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
@@ -144,10 +133,10 @@ def warp_single_latent(latent, reference_flow):
if isinstance(latent.dtype, paddle.dtype):
dtype = latent.dtype
elif isinstance(latent.dtype, str) and latent.dtype not in [
- "cpu",
- "cuda",
- "ipu",
- "xpu",
+ "cpu",
+ "cuda",
+ "ipu",
+ "xpu",
]:
dtype = latent.dtype
elif isinstance(latent.dtype, paddle.Tensor):
@@ -161,13 +150,11 @@ def warp_single_latent(latent, reference_flow):
coords_t0 = coords_t0 * 2.0 - 1.0
coords_t0 = F.interpolate(x=coords_t0, size=(h, w), mode="bilinear")
coords_t0 = paddle.transpose(x=coords_t0, perm=(0, 2, 3, 1))
- warped = F.grid_sample(
- x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection")
+ warped = F.grid_sample(x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection")
return warped
-def create_motion_field(motion_field_strength_x, motion_field_strength_y,
- frame_ids, dtype):
+def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, dtype):
"""
Create translation motion field
@@ -184,15 +171,12 @@ def create_motion_field(motion_field_strength_x, motion_field_strength_y,
seq_length = len(frame_ids)
reference_flow = paddle.zeros(shape=(seq_length, 2, 512, 512), dtype=dtype)
for fr_idx in range(seq_length):
- reference_flow[(fr_idx), (0), :, :] = (motion_field_strength_x *
- frame_ids[fr_idx])
- reference_flow[(fr_idx), (1), :, :] = (motion_field_strength_y *
- frame_ids[fr_idx])
+ reference_flow[(fr_idx), (0), :, :] = motion_field_strength_x * frame_ids[fr_idx]
+ reference_flow[(fr_idx), (1), :, :] = motion_field_strength_y * frame_ids[fr_idx]
return reference_flow
-def create_motion_field_and_warp_latents(
- motion_field_strength_x, motion_field_strength_y, frame_ids, latents):
+def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_strength_y, frame_ids, latents):
"""
Creates translation motion and warps the latents accordingly
@@ -210,11 +194,11 @@ def create_motion_field_and_warp_latents(
motion_field_strength_x=motion_field_strength_x,
motion_field_strength_y=motion_field_strength_y,
frame_ids=frame_ids,
- dtype=latents.dtype, )
+ dtype=latents.dtype,
+ )
warped_latents = latents.clone().detach()
for i in range(len(warped_latents)):
- warped_latents[i] = warp_single_latent(latents[i][None],
- motion_field[i][None])
+ warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None])
return warped_latents
@@ -244,15 +228,16 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
"""
def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool=True, ):
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
super().__init__(
vae,
text_encoder,
@@ -261,7 +246,8 @@ def __init__(
scheduler,
safety_checker,
feature_extractor,
- requires_safety_checker, )
+ requires_safety_checker,
+ )
self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
def forward_loop(self, x_t0, t0, t1, generator):
@@ -277,24 +263,23 @@ def forward_loop(self, x_t0, t0, t1, generator):
Returns:
x_t1: forward process applied to x_t0 from time t0 to t1.
"""
- eps = paddle.randn(
- shape=x_t0.shape, generator=generator, dtype=x_t0.dtype)
+ eps = paddle.randn(shape=x_t0.shape, generator=generator, dtype=x_t0.dtype)
alpha_vec = paddle.prod(x=self.scheduler.alphas[t0:t1])
- x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 -
- alpha_vec) * eps
+ x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 - alpha_vec) * eps
return x_t1
def backward_loop(
- self,
- latents,
- timesteps,
- prompt_embeds,
- guidance_scale,
- callback,
- callback_steps,
- num_warmup_steps,
- extra_step_kwargs,
- cross_attention_kwargs=None, ):
+ self,
+ latents,
+ timesteps,
+ prompt_embeds,
+ guidance_scale,
+ callback,
+ callback_steps,
+ num_warmup_steps,
+ extra_step_kwargs,
+ cross_attention_kwargs=None,
+ ):
"""
Perform backward process given list of time steps
@@ -326,32 +311,27 @@ def backward_loop(
with self.progress_bar(total=num_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat(x=[latents] * 2) if
- do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
# perform guidance
if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(
- chunks=2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -359,27 +339,27 @@ def backward_loop(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- video_length: Optional[int]=8,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_videos_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- motion_field_strength_x: float=12,
- motion_field_strength_y: float=12,
- output_type: Optional[str]="tensor",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- t0: int=44,
- t1: int=47, ):
+ self,
+ prompt: Union[str, List[str]],
+ video_length: Optional[int] = 8,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_videos_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ motion_field_strength_x: float = 12,
+ motion_field_strength_y: float = 12,
+ output_type: Optional[str] = "tensor",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ t0: int = 44,
+ t1: int = 47,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -471,12 +451,14 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# Encode input prompt
- prompt_embeds = self._encode_prompt(prompt, num_videos_per_prompt,
- do_classifier_free_guidance,
- negative_prompt)
+ prompt_embeds = self._encode_prompt(
+ prompt, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
# Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, )
+ self.scheduler.set_timesteps(
+ num_inference_steps,
+ )
timesteps = self.scheduler.timesteps
# Prepare latent variables
@@ -488,35 +470,37 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
# Perform the first backward process up to time T_1
x_1_t1 = self.backward_loop(
- timesteps=timesteps[:-t1 - 1],
+ timesteps=timesteps[: -t1 - 1],
prompt_embeds=prompt_embeds,
latents=latents,
guidance_scale=guidance_scale,
callback=callback,
callback_steps=callback_steps,
extra_step_kwargs=extra_step_kwargs,
- num_warmup_steps=num_warmup_steps, )
+ num_warmup_steps=num_warmup_steps,
+ )
scheduler_copy = copy.deepcopy(self.scheduler)
# Perform the second backward process up to time T_0
x_1_t0 = self.backward_loop(
- timesteps=timesteps[-t1 - 1:-t0 - 1],
+ timesteps=timesteps[-t1 - 1 : -t0 - 1],
prompt_embeds=prompt_embeds,
latents=x_1_t1,
guidance_scale=guidance_scale,
callback=callback,
callback_steps=callback_steps,
extra_step_kwargs=extra_step_kwargs,
- num_warmup_steps=0, )
+ num_warmup_steps=0,
+ )
# Propagate first frame latents at time T_0 to remaining frames
x_2k_t0 = x_1_t0.tile(repeat_times=[video_length - 1, 1, 1, 1])
@@ -526,31 +510,34 @@ def __call__(
motion_field_strength_x=motion_field_strength_x,
motion_field_strength_y=motion_field_strength_y,
latents=x_2k_t0,
- frame_ids=frame_ids[1:], )
+ frame_ids=frame_ids[1:],
+ )
# Perform forward process up to time T_1
x_2k_t1 = self.forward_loop(
x_t0=x_2k_t0,
t0=timesteps[-t0 - 1].item(),
t1=timesteps[-t1 - 1].item(),
- generator=generator, )
+ generator=generator,
+ )
# Perform backward process from time T_1 to 0
x_1k_t1 = paddle.concat(x=[x_1_t1, x_2k_t1])
b, l, d = prompt_embeds.shape
- prompt_embeds = (prompt_embeds[:, (None)]
- .tile(repeat_times=[1, video_length, 1, 1])
- .reshape([b * video_length, l, d]))
+ prompt_embeds = (
+ prompt_embeds[:, (None)].tile(repeat_times=[1, video_length, 1, 1]).reshape([b * video_length, l, d])
+ )
self.scheduler = scheduler_copy
x_1k_0 = self.backward_loop(
- timesteps=timesteps[-t1 - 1:],
+ timesteps=timesteps[-t1 - 1 :],
prompt_embeds=prompt_embeds,
latents=x_1k_t1,
guidance_scale=guidance_scale,
callback=callback,
callback_steps=callback_steps,
extra_step_kwargs=extra_step_kwargs,
- num_warmup_steps=0, )
+ num_warmup_steps=0,
+ )
latents = x_1k_0
paddle.device.cuda.empty_cache()
if output_type == "latent":
@@ -558,9 +545,7 @@ def __call__(
has_nsfw_concept = None
else:
image = self.decode_latents(latents)
- image, has_nsfw_concept = self.run_safety_checker(
- image, prompt_embeds.dtype)
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
return image, has_nsfw_concept
- return TextToVideoPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept)
+ return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
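
The create_motion_field helper reformatted above just builds a per-frame translation flow. A minimal sketch, using a hypothetical make_motion_field name and illustrative strengths of 12 to mirror the defaults in __call__ above:

    import paddle

    def make_motion_field(strength_x, strength_y, frame_ids, dtype="float32"):
        # One 2-channel (dx, dy) flow map of size 512x512 per frame; displacement grows with the frame index.
        flow = paddle.zeros(shape=(len(frame_ids), 2, 512, 512), dtype=dtype)
        for idx, frame_id in enumerate(frame_ids):
            flow[idx, 0, :, :] = strength_x * frame_id
            flow[idx, 1, :, :] = strength_y * frame_id
        return flow

    field = make_motion_field(12, 12, frame_ids=list(range(1, 8)))  # flows for frames 1..7
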
diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py
index 90e39132e944b..4fa798729384f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py
@@ -12,15 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available,
- is_paddlenlp_available)
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
try:
if not (is_paddlenlp_available() and is_paddle_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_and_paddlenlp_objects import (
- UnCLIPImageVariationPipeline, UnCLIPPipeline)
+ UnCLIPImageVariationPipeline,
+ UnCLIPPipeline,
+ )
else:
from .pipeline_unclip import UnCLIPPipeline
from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline
diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py
index 4c591a6c434cb..9f9d905244ac2 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py
@@ -75,17 +75,18 @@ class UnCLIPPipeline(DiffusionPipeline):
super_res_scheduler: UnCLIPScheduler
def __init__(
- self,
- prior: PriorTransformer,
- decoder: UNet2DConditionModel,
- text_encoder: CLIPTextModelWithProjection,
- tokenizer: CLIPTokenizer,
- text_proj: UnCLIPTextProjModel,
- super_res_first: UNet2DModel,
- super_res_last: UNet2DModel,
- prior_scheduler: UnCLIPScheduler,
- decoder_scheduler: UnCLIPScheduler,
- super_res_scheduler: UnCLIPScheduler, ):
+ self,
+ prior: PriorTransformer,
+ decoder: UNet2DConditionModel,
+ text_encoder: CLIPTextModelWithProjection,
+ tokenizer: CLIPTokenizer,
+ text_proj: UnCLIPTextProjModel,
+ super_res_first: UNet2DModel,
+ super_res_last: UNet2DModel,
+ prior_scheduler: UnCLIPScheduler,
+ decoder_scheduler: UnCLIPScheduler,
+ super_res_scheduler: UnCLIPScheduler,
+ ):
super().__init__()
self.register_modules(
@@ -98,27 +99,27 @@ def __init__(
super_res_last=super_res_last,
prior_scheduler=prior_scheduler,
decoder_scheduler=decoder_scheduler,
- super_res_scheduler=super_res_scheduler, )
+ super_res_scheduler=super_res_scheduler,
+ )
def prepare_latents(self, shape, dtype, generator, latents, scheduler):
if latents is None:
latents = randn_tensor(shape, generator=generator, dtype=dtype)
else:
if latents.shape != list(shape):
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
latents = latents * scheduler.init_noise_sigma
return latents
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None,
- text_attention_mask: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+ text_attention_mask: Optional[paddle.Tensor] = None,
+ ):
if text_model_output is None:
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
@@ -128,23 +129,24 @@ def _encode_prompt(
max_length=self.tokenizer.model_max_length,
truncation=True,
return_attention_mask=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
text_mask = text_inputs.attention_mask
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids,
- untruncated_ids):
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :
- self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
text_encoder_output = self.text_encoder(text_input_ids)
@@ -155,27 +157,26 @@ def _encode_prompt(
batch_size = text_model_output[0].shape[0]
prompt_embeds, text_encoder_hidden_states = (
text_model_output[0],
- text_model_output[1], )
+ text_model_output[1],
+ )
text_mask = text_attention_mask
# duplicate text embeddings for each generation per prompt
seq_len = prompt_embeds.shape[1]
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt])
- prompt_embeds = prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len])
# duplicate text_encoder_hidden_states for each generation per prompt
seq_len = text_encoder_hidden_states.shape[1]
- text_encoder_hidden_states = text_encoder_hidden_states.tile(
- [1, num_images_per_prompt, 1])
+ text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1])
text_encoder_hidden_states = text_encoder_hidden_states.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ [batch_size * num_images_per_prompt, seq_len, -1]
+ )
# duplicate text_mask for each generation per prompt
seq_len = text_mask.shape[1]
text_mask = text_mask.tile([1, num_images_per_prompt])
- text_mask = text_mask.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len])
# prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0)
# text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0)
@@ -190,47 +191,38 @@ def _encode_prompt(
max_length=self.tokenizer.model_max_length,
return_attention_mask=True,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_text_mask = uncond_input.attention_mask
- negative_prompt_embeds_text_encoder_output = self.text_encoder(
- uncond_input.input_ids)
+ negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids)
- negative_prompt_embeds = (
- negative_prompt_embeds_text_encoder_output.text_embeds)
- uncond_text_encoder_hidden_states = (
- negative_prompt_embeds_text_encoder_output.last_hidden_state)
+ negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+ uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len])
seq_len = uncond_text_encoder_hidden_states.shape[1]
- uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile(
- [1, num_images_per_prompt, 1])
- uncond_text_encoder_hidden_states = (
- uncond_text_encoder_hidden_states.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1]))
+ uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1])
+ uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape(
+ [batch_size * num_images_per_prompt, seq_len, -1]
+ )
# duplicate uncond_text_mask for each generation per prompt
seq_len = uncond_text_mask.shape[1]
uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt])
- uncond_text_mask = uncond_text_mask.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len])
# uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0)
# done duplicates
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
- text_encoder_hidden_states = paddle.concat([
- uncond_text_encoder_hidden_states, text_encoder_hidden_states
- ])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
+ text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
text_mask = paddle.concat([uncond_text_mask, text_mask])
@@ -238,23 +230,23 @@ def _encode_prompt(
@paddle.no_grad()
def __call__(
- self,
- prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: int=1,
- prior_num_inference_steps: int=25,
- decoder_num_inference_steps: int=25,
- super_res_num_inference_steps: int=7,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prior_latents: Optional[paddle.Tensor]=None,
- decoder_latents: Optional[paddle.Tensor]=None,
- super_res_latents: Optional[paddle.Tensor]=None,
- text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None,
- text_attention_mask: Optional[paddle.Tensor]=None,
- prior_guidance_scale: float=4.0,
- decoder_guidance_scale: float=8.0,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ):
+ self,
+ prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: int = 1,
+ prior_num_inference_steps: int = 25,
+ decoder_num_inference_steps: int = 25,
+ super_res_num_inference_steps: int = 7,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prior_latents: Optional[paddle.Tensor] = None,
+ decoder_latents: Optional[paddle.Tensor] = None,
+ super_res_latents: Optional[paddle.Tensor] = None,
+ text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
+ text_attention_mask: Optional[paddle.Tensor] = None,
+ prior_guidance_scale: float = 4.0,
+ decoder_guidance_scale: float = 8.0,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -312,23 +304,21 @@ def __call__(
elif isinstance(prompt, list):
batch_size = len(prompt)
else:
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
else:
batch_size = text_model_output[0].shape[0]
batch_size = batch_size * num_images_per_prompt
- do_classifier_free_guidance = (prior_guidance_scale > 1.0 or
- decoder_guidance_scale > 1.0)
+ do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0
prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
text_model_output,
- text_attention_mask, )
+ text_attention_mask,
+ )
# prior
@@ -342,30 +332,29 @@ def __call__(
prompt_embeds.dtype,
generator,
prior_latents,
- self.prior_scheduler, )
+ self.prior_scheduler,
+ )
for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([prior_latents] * 2)
- if do_classifier_free_guidance else
- prior_latents)
+ latent_model_input = paddle.concat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents
predicted_image_embedding = self.prior(
latent_model_input,
timestep=t,
proj_embedding=prompt_embeds,
encoder_hidden_states=text_encoder_hidden_states,
- attention_mask=text_mask, ).predicted_image_embedding
+ attention_mask=text_mask,
+ ).predicted_image_embedding
if do_classifier_free_guidance:
(
predicted_image_embedding_uncond,
predicted_image_embedding_text,
) = predicted_image_embedding.chunk(2)
- predicted_image_embedding = (
- predicted_image_embedding_uncond + prior_guidance_scale *
- (predicted_image_embedding_text -
- predicted_image_embedding_uncond))
+ predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
+ predicted_image_embedding_text - predicted_image_embedding_uncond
+ )
if i + 1 == prior_timesteps_tensor.shape[0]:
prev_timestep = None
@@ -377,7 +366,8 @@ def __call__(
timestep=t,
sample=prior_latents,
generator=generator,
- prev_timestep=prev_timestep, ).prev_sample
+ prev_timestep=prev_timestep,
+ ).prev_sample
prior_latents = self.prior.post_process_latents(prior_latents)
@@ -390,13 +380,15 @@ def __call__(
image_embeddings=image_embeddings,
prompt_embeds=prompt_embeds,
text_encoder_hidden_states=text_encoder_hidden_states,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
decoder_text_mask = F.pad(
text_mask.unsqueeze(0),
(self.text_proj.clip_extra_context_tokens, 0),
value=1,
- data_format="NCL", ).squeeze(0)
+ data_format="NCL",
+ ).squeeze(0)
self.decoder_scheduler.set_timesteps(decoder_num_inference_steps)
decoder_timesteps_tensor = self.decoder_scheduler.timesteps
@@ -410,20 +402,22 @@ def __call__(
text_encoder_hidden_states.dtype,
generator,
decoder_latents,
- self.decoder_scheduler, )
+ self.decoder_scheduler,
+ )
for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([decoder_latents] * 2)
- if do_classifier_free_guidance else
- decoder_latents)
+ latent_model_input = (
+ paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents
+ )
noise_pred = self.decoder(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=text_encoder_hidden_states,
class_labels=additive_clip_time_embeddings,
- attention_mask=decoder_text_mask, ).sample
+ attention_mask=decoder_text_mask,
+ ).sample
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
@@ -431,20 +425,19 @@ def __call__(
noise_pred_uncond, _ = noise_pred_uncond.split(
[
latent_model_input.shape[1],
- noise_pred_uncond.shape[1] -
- latent_model_input.shape[1],
+ noise_pred_uncond.shape[1] - latent_model_input.shape[1],
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
latent_model_input.shape[1],
noise_pred_text.shape[1] - latent_model_input.shape[1],
],
- axis=1, )
- noise_pred = noise_pred_uncond + decoder_guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
if i + 1 == decoder_timesteps_tensor.shape[0]:
prev_timestep = None
@@ -457,7 +450,8 @@ def __call__(
t,
decoder_latents,
prev_timestep=prev_timestep,
- generator=generator, ).prev_sample
+ generator=generator,
+ ).prev_sample
decoder_latents = decoder_latents.clip(-1, 1)
@@ -479,7 +473,8 @@ def __call__(
image_small.dtype,
generator,
super_res_latents,
- self.super_res_scheduler, )
+ self.super_res_scheduler,
+ )
interpolate_antialias = {}
if "antialias" in inspect.signature(F.interpolate).parameters:
@@ -490,7 +485,8 @@ def __call__(
size=[height, width],
mode="bicubic",
align_corners=False,
- **interpolate_antialias, )
+ **interpolate_antialias,
+ )
for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
# no classifier free guidance
@@ -501,15 +497,14 @@ def __call__(
unet = self.super_res_first
latent_model_input = paddle.concat(
- [
- super_res_latents,
- image_upscaled.cast(super_res_latents.dtype)
- ],
- axis=1, )
+ [super_res_latents, image_upscaled.cast(super_res_latents.dtype)],
+ axis=1,
+ )
noise_pred = unet(
sample=latent_model_input,
- timestep=t, ).sample
+ timestep=t,
+ ).sample
if i + 1 == super_res_timesteps_tensor.shape[0]:
prev_timestep = None
@@ -522,7 +517,8 @@ def __call__(
t,
super_res_latents,
prev_timestep=prev_timestep,
- generator=generator, ).prev_sample
+ generator=generator,
+ ).prev_sample
image = super_res_latents
# done super res
@@ -537,6 +533,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
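The reformatted UnCLIPPipeline keeps the prior → decoder → super-resolution flow of the __call__ signature above. A minimal text-to-image sketch using only parameters from that signature; the checkpoint id is an assumption, not part of this diff:

from ppdiffusers import UnCLIPPipeline

pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")  # assumed checkpoint id
result = pipe(
    prompt="a photo of an astronaut riding a horse",
    prior_num_inference_steps=25,
    decoder_num_inference_steps=25,
    super_res_num_inference_steps=7,
    prior_guidance_scale=4.0,
    decoder_guidance_scale=8.0,
)
result.images[0].save("unclip_sample.png")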
diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py
index ada35969b9c65..f303633b838ee 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py
@@ -18,9 +18,12 @@
import paddle
import paddle.nn.functional as F
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor,
- CLIPTextModelWithProjection, CLIPTokenizer,
- CLIPVisionModelWithProjection)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
from ...models import UNet2DConditionModel, UNet2DModel
from ...pipelines import DiffusionPipeline, ImagePipelineOutput
@@ -78,17 +81,18 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline):
super_res_scheduler: UnCLIPScheduler
def __init__(
- self,
- decoder: UNet2DConditionModel,
- text_encoder: CLIPTextModelWithProjection,
- tokenizer: CLIPTokenizer,
- text_proj: UnCLIPTextProjModel,
- feature_extractor: CLIPImageProcessor,
- image_encoder: CLIPVisionModelWithProjection,
- super_res_first: UNet2DModel,
- super_res_last: UNet2DModel,
- decoder_scheduler: UnCLIPScheduler,
- super_res_scheduler: UnCLIPScheduler, ):
+ self,
+ decoder: UNet2DConditionModel,
+ text_encoder: CLIPTextModelWithProjection,
+ tokenizer: CLIPTokenizer,
+ text_proj: UnCLIPTextProjModel,
+ feature_extractor: CLIPImageProcessor,
+ image_encoder: CLIPVisionModelWithProjection,
+ super_res_first: UNet2DModel,
+ super_res_last: UNet2DModel,
+ decoder_scheduler: UnCLIPScheduler,
+ super_res_scheduler: UnCLIPScheduler,
+ ):
super().__init__()
self.register_modules(
@@ -101,7 +105,8 @@ def __init__(
super_res_first=super_res_first,
super_res_last=super_res_last,
decoder_scheduler=decoder_scheduler,
- super_res_scheduler=super_res_scheduler, )
+ super_res_scheduler=super_res_scheduler,
+ )
# Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
def prepare_latents(self, shape, dtype, generator, latents, scheduler):
@@ -109,15 +114,12 @@ def prepare_latents(self, shape, dtype, generator, latents, scheduler):
latents = randn_tensor(shape, generator=generator, dtype=dtype)
else:
if latents.shape != list(shape):
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
latents = latents * scheduler.init_noise_sigma
return latents
- def _encode_prompt(self, prompt, num_images_per_prompt,
- do_classifier_free_guidance):
+ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance):
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
@@ -126,7 +128,8 @@ def _encode_prompt(self, prompt, num_images_per_prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
return_attention_mask=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
text_mask = text_inputs.attention_mask
text_encoder_output = self.text_encoder(text_input_ids)
@@ -137,21 +140,19 @@ def _encode_prompt(self, prompt, num_images_per_prompt,
# duplicate text embeddings for each generation per prompt
seq_len = prompt_embeds.shape[1]
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt])
- prompt_embeds = prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len])
# duplicate text_encoder_hidden_states for each generation per prompt
seq_len = text_encoder_hidden_states.shape[1]
- text_encoder_hidden_states = text_encoder_hidden_states.tile(
- [1, num_images_per_prompt, 1])
+ text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1])
text_encoder_hidden_states = text_encoder_hidden_states.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ [batch_size * num_images_per_prompt, seq_len, -1]
+ )
# duplicate text_mask for each generation per prompt
seq_len = text_mask.shape[1]
text_mask = text_mask.tile([1, num_images_per_prompt])
- text_mask = text_mask.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len])
# prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0)
# text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0)
@@ -167,91 +168,81 @@ def _encode_prompt(self, prompt, num_images_per_prompt,
max_length=max_length,
return_attention_mask=True,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
uncond_text_mask = uncond_input.attention_mask
- negative_prompt_embeds_text_encoder_output = self.text_encoder(
- uncond_input.input_ids)
+ negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids)
- negative_prompt_embeds = (
- negative_prompt_embeds_text_encoder_output.text_embeds)
- uncond_text_encoder_hidden_states = (
- negative_prompt_embeds_text_encoder_output.last_hidden_state)
+ negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+ uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len])
seq_len = uncond_text_encoder_hidden_states.shape[1]
- uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile(
- [1, num_images_per_prompt, 1])
- uncond_text_encoder_hidden_states = (
- uncond_text_encoder_hidden_states.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1]))
+ uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1])
+ uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape(
+ [batch_size * num_images_per_prompt, seq_len, -1]
+ )
# duplicate uncond_text_mask for each generation per prompt
seq_len = uncond_text_mask.shape[1]
uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt])
- uncond_text_mask = uncond_text_mask.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len])
# uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0)
# done duplicates
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
- text_encoder_hidden_states = paddle.concat([
- uncond_text_encoder_hidden_states, text_encoder_hidden_states
- ])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
+ text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
text_mask = paddle.concat([uncond_text_mask, text_mask])
return prompt_embeds, text_encoder_hidden_states, text_mask
def _encode_image(
- self,
- image,
- num_images_per_prompt,
- image_embeddings: Optional[paddle.Tensor]=None, ):
+ self,
+ image,
+ num_images_per_prompt,
+ image_embeddings: Optional[paddle.Tensor] = None,
+ ):
dtype = self.image_encoder.dtype
if image_embeddings is None:
if not isinstance(image, paddle.Tensor):
- image = self.feature_extractor(
- images=image, return_tensors="pd").pixel_values
+ image = self.feature_extractor(images=image, return_tensors="pd").pixel_values
image = image.cast(dtype)
image_embeddings = self.image_encoder(image).image_embeds
batch_size, seq_len = image_embeddings.shape
image_embeddings = image_embeddings.tile([1, num_images_per_prompt])
- image_embeddings = image_embeddings.reshape(
- [batch_size * num_images_per_prompt, seq_len])
+ image_embeddings = image_embeddings.reshape([batch_size * num_images_per_prompt, seq_len])
# image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, axis=0)
return image_embeddings
@paddle.no_grad()
def __call__(
- self,
- image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image],
- paddle.Tensor]]=None,
- num_images_per_prompt: int=1,
- decoder_num_inference_steps: int=25,
- super_res_num_inference_steps: int=7,
- generator: Optional[paddle.Generator]=None,
- decoder_latents: Optional[paddle.Tensor]=None,
- super_res_latents: Optional[paddle.Tensor]=None,
- image_embeddings: Optional[paddle.Tensor]=None,
- decoder_guidance_scale: float=8.0,
- output_type: Optional[str]="pil",
- return_dict: bool=True, ):
+ self,
+ image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor]] = None,
+ num_images_per_prompt: int = 1,
+ decoder_num_inference_steps: int = 25,
+ super_res_num_inference_steps: int = 7,
+ generator: Optional[paddle.Generator] = None,
+ decoder_latents: Optional[paddle.Tensor] = None,
+ super_res_latents: Optional[paddle.Tensor] = None,
+ image_embeddings: Optional[paddle.Tensor] = None,
+ decoder_guidance_scale: float = 8.0,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ):
"""
Function invoked when calling the pipeline for generation.
@@ -307,23 +298,25 @@ def __call__(
do_classifier_free_guidance = decoder_guidance_scale > 1.0
prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
- prompt, num_images_per_prompt, do_classifier_free_guidance)
+ prompt, num_images_per_prompt, do_classifier_free_guidance
+ )
- image_embeddings = self._encode_image(image, num_images_per_prompt,
- image_embeddings)
+ image_embeddings = self._encode_image(image, num_images_per_prompt, image_embeddings)
# decoder
text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj(
image_embeddings=image_embeddings,
prompt_embeds=prompt_embeds,
text_encoder_hidden_states=text_encoder_hidden_states,
- do_classifier_free_guidance=do_classifier_free_guidance, )
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
decoder_text_mask = F.pad(
text_mask.unsqueeze(0),
(self.text_proj.clip_extra_context_tokens, 0),
value=1,
- data_format="NCL", ).squeeze(0)
+ data_format="NCL",
+ ).squeeze(0)
self.decoder_scheduler.set_timesteps(decoder_num_inference_steps)
decoder_timesteps_tensor = self.decoder_scheduler.timesteps
@@ -338,20 +331,22 @@ def __call__(
text_encoder_hidden_states.dtype,
generator,
decoder_latents,
- self.decoder_scheduler, )
+ self.decoder_scheduler,
+ )
for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([decoder_latents] * 2)
- if do_classifier_free_guidance else
- decoder_latents)
+ latent_model_input = (
+ paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents
+ )
noise_pred = self.decoder(
sample=latent_model_input,
timestep=t,
encoder_hidden_states=text_encoder_hidden_states,
class_labels=additive_clip_time_embeddings,
- attention_mask=decoder_text_mask, ).sample
+ attention_mask=decoder_text_mask,
+ ).sample
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
@@ -359,20 +354,19 @@ def __call__(
noise_pred_uncond, _ = noise_pred_uncond.split(
[
latent_model_input.shape[1],
- noise_pred_uncond.shape[1] -
- latent_model_input.shape[1],
+ noise_pred_uncond.shape[1] - latent_model_input.shape[1],
],
- axis=1, )
+ axis=1,
+ )
noise_pred_text, predicted_variance = noise_pred_text.split(
[
latent_model_input.shape[1],
noise_pred_text.shape[1] - latent_model_input.shape[1],
],
- axis=1, )
- noise_pred = noise_pred_uncond + decoder_guidance_scale * (
- noise_pred_text - noise_pred_uncond)
- noise_pred = paddle.concat(
- [noise_pred, predicted_variance], axis=1)
+ axis=1,
+ )
+ noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
+ noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1)
if i + 1 == decoder_timesteps_tensor.shape[0]:
prev_timestep = None
@@ -385,7 +379,8 @@ def __call__(
t,
decoder_latents,
prev_timestep=prev_timestep,
- generator=generator, ).prev_sample
+ generator=generator,
+ ).prev_sample
decoder_latents = decoder_latents.clip(-1, 1)
@@ -408,7 +403,8 @@ def __call__(
image_small.dtype,
generator,
super_res_latents,
- self.super_res_scheduler, )
+ self.super_res_scheduler,
+ )
interpolate_antialias = {}
if "antialias" in inspect.signature(F.interpolate).parameters:
@@ -419,7 +415,8 @@ def __call__(
size=[height, width],
mode="bicubic",
align_corners=False,
- **interpolate_antialias, )
+ **interpolate_antialias,
+ )
for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
# no classifier free guidance
@@ -430,15 +427,14 @@ def __call__(
unet = self.super_res_first
latent_model_input = paddle.concat(
- [
- super_res_latents,
- image_upscaled.cast(super_res_latents.dtype)
- ],
- axis=1, )
+ [super_res_latents, image_upscaled.cast(super_res_latents.dtype)],
+ axis=1,
+ )
noise_pred = unet(
sample=latent_model_input,
- timestep=t, ).sample
+ timestep=t,
+ ).sample
if i + 1 == super_res_timesteps_tensor.shape[0]:
prev_timestep = None
@@ -451,7 +447,8 @@ def __call__(
t,
super_res_latents,
prev_timestep=prev_timestep,
- generator=generator, ).prev_sample
+ generator=generator,
+ ).prev_sample
image = super_res_latents
@@ -467,6 +464,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
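UnCLIPImageVariationPipeline takes a PIL image (or a batch, or precomputed image_embeddings) in place of a text prompt; the decoder and super-resolution stages are otherwise the same. A hedged usage sketch (the checkpoint id is assumed):

from PIL import Image
from ppdiffusers import UnCLIPImageVariationPipeline

pipe = UnCLIPImageVariationPipeline.from_pretrained(
    "kakaobrain/karlo-v1-alpha-image-variations"  # assumed checkpoint id
)
init_image = Image.open("input.png").convert("RGB")
out = pipe(
    image=init_image,
    num_images_per_prompt=2,
    decoder_num_inference_steps=25,
    super_res_num_inference_steps=7,
    decoder_guidance_scale=8.0,
)
out.images[0].save("variation_0.png")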
diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py
index 3ce07c27f08b6..69b442fa526ee 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py
@@ -29,53 +29,52 @@ class UnCLIPTextProjModel(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- *,
- clip_extra_context_tokens: int=4,
- clip_embeddings_dim: int=768,
- time_embed_dim: int,
- cross_attention_dim, ):
+ self,
+ *,
+ clip_extra_context_tokens: int = 4,
+ clip_embeddings_dim: int = 768,
+ time_embed_dim: int,
+ cross_attention_dim,
+ ):
super().__init__()
self.learned_classifier_free_guidance_embeddings = self.create_parameter(
- (clip_embeddings_dim, ),
+ (clip_embeddings_dim,),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(0.0), )
+ default_initializer=nn.initializer.Constant(0.0),
+ )
# parameters for additional clip time embeddings
self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim)
- self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(
- clip_embeddings_dim, time_embed_dim)
+ self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim)
# parameters for encoder hidden states
self.clip_extra_context_tokens = clip_extra_context_tokens
self.clip_extra_context_tokens_proj = nn.Linear(
- clip_embeddings_dim,
- self.clip_extra_context_tokens * cross_attention_dim)
- self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim,
- cross_attention_dim)
+ clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim
+ )
+ self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, cross_attention_dim)
self.text_encoder_hidden_states_norm = nn.LayerNorm(cross_attention_dim)
def forward(
- self,
- *,
- image_embeddings,
- prompt_embeds,
- text_encoder_hidden_states,
- do_classifier_free_guidance, ):
+ self,
+ *,
+ image_embeddings,
+ prompt_embeds,
+ text_encoder_hidden_states,
+ do_classifier_free_guidance,
+ ):
image_embeddings = image_embeddings.cast(self.dtype)
if do_classifier_free_guidance:
# Add the classifier free guidance embeddings to the image embeddings
image_embeddings_batch_size = image_embeddings.shape[0]
- classifier_free_guidance_embeddings = (
- self.learned_classifier_free_guidance_embeddings.unsqueeze(0))
- classifier_free_guidance_embeddings = (
- classifier_free_guidance_embeddings.expand(
- [image_embeddings_batch_size, -1]))
- image_embeddings = paddle.concat(
- [classifier_free_guidance_embeddings, image_embeddings], axis=0)
+ classifier_free_guidance_embeddings = self.learned_classifier_free_guidance_embeddings.unsqueeze(0)
+ classifier_free_guidance_embeddings = classifier_free_guidance_embeddings.expand(
+ [image_embeddings_batch_size, -1]
+ )
+ image_embeddings = paddle.concat([classifier_free_guidance_embeddings, image_embeddings], axis=0)
# The image embeddings batch size and the text embeddings batch size are equal
assert image_embeddings.shape[0] == prompt_embeds.shape[0]
@@ -85,26 +84,17 @@ def forward(
# "Specifically, we modify the architecture described in Nichol et al. (2021) by projecting and
# adding CLIP embeddings to the existing timestep embedding, ...
time_projected_prompt_embeds = self.embedding_proj(prompt_embeds)
- time_projected_image_embeddings = (
- self.clip_image_embeddings_project_to_time_embeddings(
- image_embeddings))
- additive_clip_time_embeddings = (
- time_projected_image_embeddings + time_projected_prompt_embeds)
+ time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings)
+ additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds
# ... and by projecting CLIP embeddings into four
# extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder"
- clip_extra_context_tokens = self.clip_extra_context_tokens_proj(
- image_embeddings)
- clip_extra_context_tokens = clip_extra_context_tokens.reshape(
- [batch_size, -1, self.clip_extra_context_tokens])
- clip_extra_context_tokens = clip_extra_context_tokens.transpose(
- [0, 2, 1])
-
- text_encoder_hidden_states = self.encoder_hidden_states_proj(
- text_encoder_hidden_states)
- text_encoder_hidden_states = self.text_encoder_hidden_states_norm(
- text_encoder_hidden_states)
- text_encoder_hidden_states = paddle.concat(
- [clip_extra_context_tokens, text_encoder_hidden_states], axis=1)
+ clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings)
+ clip_extra_context_tokens = clip_extra_context_tokens.reshape([batch_size, -1, self.clip_extra_context_tokens])
+ clip_extra_context_tokens = clip_extra_context_tokens.transpose([0, 2, 1])
+
+ text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states)
+ text_encoder_hidden_states = self.text_encoder_hidden_states_norm(text_encoder_hidden_states)
+ text_encoder_hidden_states = paddle.concat([clip_extra_context_tokens, text_encoder_hidden_states], axis=1)
return text_encoder_hidden_states, additive_clip_time_embeddings
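UnCLIPTextProjModel.forward projects the CLIP image embedding into an additive timestep embedding and into clip_extra_context_tokens extra tokens that are prepended to the projected text-encoder states. A shape-only sketch with made-up dimensions (real values come from the pretrained config):

import paddle
from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel

proj = UnCLIPTextProjModel(
    clip_extra_context_tokens=4,
    clip_embeddings_dim=768,
    time_embed_dim=1536,
    cross_attention_dim=1280,
)
image_embeddings = paddle.randn([2, 768])
prompt_embeds = paddle.randn([2, 768])
text_encoder_hidden_states = paddle.randn([2, 77, 768])

hidden_states, additive_time_embeddings = proj(
    image_embeddings=image_embeddings,
    prompt_embeds=prompt_embeds,
    text_encoder_hidden_states=text_encoder_hidden_states,
    do_classifier_free_guidance=False,
)
print(hidden_states.shape)             # [2, 77 + 4, 1280]: 4 extra CLIP context tokens prepended
print(additive_time_embeddings.shape)  # [2, 1536]: added to the UNet timestep embedding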
diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py
index 769e211a22e88..d0e447e0ef36e 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py
@@ -18,9 +18,13 @@
import numpy as np
import PIL
-from ...utils import (BaseOutput, OptionalDependencyNotAvailable,
- is_einops_available, is_paddle_available,
- is_paddlenlp_available)
+from ...utils import (
+ BaseOutput,
+ OptionalDependencyNotAvailable,
+ is_einops_available,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
@dataclass
@@ -40,12 +44,12 @@ class ImageTextPipelineOutput(BaseOutput):
try:
- if not (is_paddlenlp_available() and is_paddle_available() and
- is_einops_available()):
+ if not (is_paddlenlp_available() and is_paddle_available() and is_einops_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
- from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import \
- UniDiffuserPipeline
+ from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import (
+ UniDiffuserPipeline,
+ )
from ...utils.dummy_paddle_and_paddlenlp_objects import CaptionDecoder
else:
from .caption_decoder import CaptionDecoder
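UniDiffuser additionally depends on einops, which is why the guard above checks three backends. A small pre-flight check before building the pipeline, assuming these helpers are importable from ppdiffusers.utils exactly as they are imported here:

from ppdiffusers.utils import (
    is_einops_available,
    is_paddle_available,
    is_paddlenlp_available,
)

if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()):
    raise RuntimeError("UniDiffuserPipeline needs paddle, paddlenlp and einops installed.")

from ppdiffusers.pipelines.unidiffuser import ImageTextPipelineOutput, UniDiffuserPipeline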
diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py
index 5fd8b8659eb9a..81f5e5a0b5212 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py
@@ -27,19 +27,20 @@
class CaptionDecoder(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- prefix_length: int=77,
- hidden_dim: int=64,
- vocab_size: int=50258,
- hidden_size: int=768,
- num_hidden_layers: int=12,
- intermediate_size: int=3072,
- hidden_act: int="gelu",
- hidden_dropout_prob: int=0.1,
- attention_probs_dropout_prob: int=0.1,
- max_position_embeddings: int=1024,
- initializer_range: int=0.02,
- eos_token_id: int=50257, ):
+ self,
+ prefix_length: int = 77,
+ hidden_dim: int = 64,
+ vocab_size: int = 50258,
+ hidden_size: int = 768,
+ num_hidden_layers: int = 12,
+ intermediate_size: int = 3072,
+ hidden_act: str = "gelu",
+ hidden_dropout_prob: float = 0.1,
+ attention_probs_dropout_prob: float = 0.1,
+ max_position_embeddings: int = 1024,
+ initializer_range: float = 0.02,
+ eos_token_id: int = 50257,
+ ):
super(CaptionDecoder, self).__init__()
self.prefix_length = prefix_length
config = GPTConfig(
@@ -52,25 +53,24 @@ def __init__(
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
- eos_token_id=eos_token_id, )
+ eos_token_id=eos_token_id,
+ )
self.gpt = GPTLMHeadModel(config)
self.hidden_dim = hidden_dim
- self.encode_prefix = (nn.Linear(hidden_size, hidden_dim)
- if hidden_dim is not None else nn.Identity())
- self.decode_prefix = (nn.Linear(hidden_dim, hidden_size)
- if hidden_dim is not None else nn.Identity())
+ self.encode_prefix = nn.Linear(hidden_size, hidden_dim) if hidden_dim is not None else nn.Identity()
+ self.decode_prefix = nn.Linear(hidden_dim, hidden_size) if hidden_dim is not None else nn.Identity()
def get_dummy_token(self, batch_size: int) -> paddle.Tensor:
- return paddle.zeros(
- [batch_size, self.prefix_length], dtype=paddle.int64)
+ return paddle.zeros([batch_size, self.prefix_length], dtype=paddle.int64)
def forward(
- self,
- tokens: paddle.Tensor,
- prefix: paddle.Tensor,
- attention_mask: Optional[paddle.Tensor]=None,
- labels: Optional[paddle.Tensor]=None, ):
+ self,
+ tokens: paddle.Tensor,
+ prefix: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ labels: Optional[paddle.Tensor] = None,
+ ):
embedding_text = self.gpt.gpt.embeddings.word_embeddings(tokens)
hidden = self.encode_prefix(prefix)
prefix = self.decode_prefix(hidden)
@@ -79,9 +79,7 @@ def forward(
if labels is not None:
dummy_token = self.get_dummy_token(tokens.shape[0])
labels = paddle.concat((dummy_token, tokens), axis=1)
- out = self.gpt(inputs_embeds=embedding_cat,
- labels=labels,
- attention_mask=attention_mask)
+ out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask)
if self.hidden_dim:
return out, hidden
@@ -98,24 +96,21 @@ def generate_captions(self, tokenizer, features, use_beam_search=True):
for feature in features:
feature = self.decode_prefix(feature) # back to the clip feature
if use_beam_search:
- generated_captions.append(
- self.generate_beam(
- tokenizer=tokenizer, embedding=feature)[0])
+ generated_captions.append(self.generate_beam(tokenizer=tokenizer, embedding=feature)[0])
else:
- generated_captions.append(
- self.generate2(
- tokenizer=tokenizer, embedding=feature))
+ generated_captions.append(self.generate2(tokenizer=tokenizer, embedding=feature))
return generated_captions
@paddle.no_grad()
def generate_beam(
- self,
- tokenizer,
- prompt=None,
- embedding=None,
- beam_size: int=5,
- entry_length: int=67, # maximum number of words
- temperature: float=1.0, ):
+ self,
+ tokenizer,
+ prompt=None,
+ embedding=None,
+ beam_size: int = 5,
+ entry_length: int = 67, # maximum number of words
+ temperature: float = 1.0,
+ ):
stop_token_index = self.gpt.config.eos_token_id
tokens = None
scores = None
@@ -132,14 +127,12 @@ def generate_beam(
for i in range(entry_length):
logits = self.gpt(inputs_embeds=generated)
- logits = logits[:, -1, :] / (temperature
- if temperature > 0 else 1.0)
+ logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
logits = F.softmax(logits, axis=-1).log()
if scores is None:
scores, next_tokens = logits.topk(beam_size, -1)
generated = generated.expand([beam_size, *generated.shape[1:]])
- next_tokens, scores = next_tokens.transpose(
- [1, 0]), scores.squeeze(0)
+ next_tokens, scores = next_tokens.transpose([1, 0]), scores.squeeze(0)
if tokens is None:
tokens = next_tokens
else:
@@ -151,8 +144,7 @@ def generate_beam(
scores_sum = scores[:, None] + logits
seq_lengths[~is_stopped] += 1
scores_sum_average = scores_sum / seq_lengths[:, None]
- scores_sum_average, next_tokens = scores_sum_average.reshape(
- [-1]).topk(beam_size, -1)
+ scores_sum_average, next_tokens = scores_sum_average.reshape([-1]).topk(beam_size, -1)
next_tokens_source = next_tokens // scores_sum.shape[1]
seq_lengths = seq_lengths[next_tokens_source]
next_tokens = next_tokens % scores_sum.shape[1]
@@ -165,19 +157,18 @@ def generate_beam(
is_stopped = is_stopped[next_tokens_source]
is_stopped = paddle.cast(is_stopped, "bool")
- next_token_embed = self.gpt.get_input_embeddings()(
- next_tokens.squeeze()).reshape([generated.shape[0], 1, -1])
+ next_token_embed = self.gpt.get_input_embeddings()(next_tokens.squeeze()).reshape(
+ [generated.shape[0], 1, -1]
+ )
generated = paddle.concat((generated, next_token_embed), axis=1)
- is_stopped = paddle.bitwise_or(
- is_stopped, next_tokens.equal(stop_token_index).squeeze())
+ is_stopped = paddle.bitwise_or(is_stopped, next_tokens.equal(stop_token_index).squeeze())
if is_stopped.all():
break
scores = scores / seq_lengths
output_list = tokens.cpu().numpy()
output_texts = [
- tokenizer.decode(
- output[:int(length)], skip_special_tokens=True)
+ tokenizer.decode(output[: int(length)], skip_special_tokens=True)
for output, length in zip(output_list, seq_lengths)
]
order = scores.argsort(descending=True)
@@ -186,15 +177,16 @@ def generate_beam(
@paddle.no_grad()
def generate2(
- self,
- tokenizer,
- tokens=None,
- prompt=None,
- embedding=None,
- entry_count: int=1,
- entry_length: int=67, # maximum number of words
- top_p: float=0.8,
- temperature: float=1.0, ):
+ self,
+ tokenizer,
+ tokens=None,
+ prompt=None,
+ embedding=None,
+ entry_count: int = 1,
+ entry_length: int = 67, # maximum number of words
+ top_p: float = 0.8,
+ temperature: float = 1.0,
+ ):
generated_list = []
stop_token_index = self.gpt.config.eos_token_id
filter_value = -float("Inf")
@@ -210,16 +202,12 @@ def generate2(
for entry_idx in range(entry_length):
logits = self.gpt(inputs_embeds=generated)
- logits = logits[:, -1, :] / (temperature
- if temperature > 0 else 1.0)
+ logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
sorted_logits = paddle.sort(logits, descending=True)
sorted_indices = paddle.argsort(logits, descending=True)
- cumulative_probs = paddle.cumsum(
- F.softmax(
- sorted_logits, axis=-1), axis=-1)
+ cumulative_probs = paddle.cumsum(F.softmax(sorted_logits, axis=-1), axis=-1)
sorted_indices_to_remove = cumulative_probs > top_p
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
- ..., :-1].clone()
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
indices_to_remove = sorted_indices[sorted_indices_to_remove]
@@ -235,8 +223,7 @@ def generate2(
break
output_list = list(tokens.squeeze().cpu().numpy())
- output_text = tokenizer.decode(
- output_list, skip_special_tokens=True)
+ output_text = tokenizer.decode(output_list, skip_special_tokens=True)
generated_list.append(output_text)
return generated_list[0]
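CaptionDecoder maps the reduced text latents back to GPT hidden size via decode_prefix and then decodes tokens either with beam search (generate_beam) or nucleus sampling (generate2). A shape-only sketch with random weights — the resulting caption is meaningless, and both the tokenizer name and the feature layout are assumptions; in practice the decoder and its features come from the UniDiffuser checkpoint:

import paddle
from paddlenlp.transformers import GPTTokenizer
from ppdiffusers.pipelines.unidiffuser.caption_decoder import CaptionDecoder

tokenizer = GPTTokenizer.from_pretrained("gpt2-en")  # assumed PaddleNLP tokenizer name
decoder = CaptionDecoder(prefix_length=77, hidden_dim=64, hidden_size=768)

# one sample of prefix-length 77 text latents in the reduced hidden_dim space (assumed layout)
features = [paddle.randn([1, 77, 64])]
captions = decoder.generate_captions(tokenizer, features, use_beam_search=True)
print(captions)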
diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
index 17bab677a8e47..c025b3e06973e 100644
--- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
+++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
@@ -19,9 +19,13 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel,
- CLIPTokenizer,
- CLIPVisionModelWithProjection, GPTTokenizer)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+ GPTTokenizer,
+)
from PIL import Image
from ...models import AutoencoderKL, UViTModel
@@ -37,15 +41,15 @@
def center_crop(width, height, img):
resample = {"box": Image.BOX, "lanczos": Image.LANCZOS}["lanczos"]
crop = np.min(img.shape[:2])
- img = img[(img.shape[0] - crop) // 2:(img.shape[0] + crop) // 2, (img.shape[
- 1] - crop) // 2:(img.shape[1] + crop) // 2, ] # center crop
+ img = img[
+ (img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2,
+ (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2,
+ ] # center crop
try:
img = Image.fromarray(img, "RGB")
except:
img = Image.fromarray(img)
- img = img.resize(
- (width, height),
- resample) # resize the center crop from [crop, crop] to [width, height]
+ img = img.resize((width, height), resample) # resize the center crop from [crop, crop] to [width, height]
return np.array(img).astype(np.uint8)
@@ -62,16 +66,17 @@ class UniDiffuserPipeline(DiffusionPipeline):
scheduler: DPMSolverUniDiffuserScheduler
def __init__(
- self,
- image_encoder: CLIPVisionModelWithProjection,
- image_feature_extractor: CLIPImageProcessor,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UViTModel,
- vae: AutoencoderKL,
- caption_decoder: CaptionDecoder,
- caption_tokenizer: GPTTokenizer,
- scheduler: DPMSolverUniDiffuserScheduler, ):
+ self,
+ image_encoder: CLIPVisionModelWithProjection,
+ image_feature_extractor: CLIPImageProcessor,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UViTModel,
+ vae: AutoencoderKL,
+ caption_decoder: CaptionDecoder,
+ caption_tokenizer: GPTTokenizer,
+ scheduler: DPMSolverUniDiffuserScheduler,
+ ):
super().__init__()
self.register_modules(
image_encoder=image_encoder,
@@ -82,51 +87,48 @@ def __init__(
vae=vae,
caption_decoder=caption_decoder,
caption_tokenizer=caption_tokenizer,
- scheduler=scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ scheduler=scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.num_channels_latents = vae.latent_channels # 4
self.image_encoder_clip_img_dim = image_encoder.config.projection_dim # 512
self.text_encoder_seq_len = tokenizer.model_max_length # 77
- self.text_encoder_text_dim = (
- text_encoder.config.hidden_size //
- text_encoder.config.num_attention_heads) # 64
+ self.text_encoder_text_dim = text_encoder.config.hidden_size // text_encoder.config.num_attention_heads # 64
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -139,10 +141,10 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
- def _infer_batch_size(self, mode, image, prompt, prompt_embeds,
- num_samples):
+ def _infer_batch_size(self, mode, image, prompt, prompt_embeds, num_samples):
if mode in ["t2i", "t2i2t"]:
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -169,20 +171,16 @@ def _split(self, x, height, width):
latent_width = width // self.vae_scale_factor
img_vae_dim = self.num_channels_latents * latent_height * latent_width
- img_vae, img_clip = x.split(
- [img_vae_dim, self.image_encoder_clip_img_dim], axis=1)
+ img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_clip_img_dim], axis=1)
img_vae = einops.rearrange(
img_vae,
"B (C H W) -> B C H W",
C=self.num_channels_latents,
H=latent_height,
- W=latent_width, )
- img_clip = einops.rearrange(
- img_clip,
- "B (L D) -> B L D",
- L=1,
- D=self.image_encoder_clip_img_dim)
+ W=latent_width,
+ )
+ img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim)
return img_vae, img_clip
def _combine(self, img_vae, img_clip):
@@ -205,24 +203,21 @@ def _split_joint(self, x, height, width):
img_vae_dim = self.num_channels_latents * latent_height * latent_width
text_dim = self.text_encoder_seq_len * self.text_encoder_text_dim
- img_vae, img_clip, text = x.split(
- [img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1)
+ img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1)
img_vae = einops.rearrange(
img_vae,
"B (C H W) -> B C H W",
C=self.num_channels_latents,
H=latent_height,
- W=latent_width, )
- img_clip = einops.rearrange(
- img_clip,
- "B (L D) -> B L D",
- L=1,
- D=self.image_encoder_clip_img_dim)
+ W=latent_width,
+ )
+ img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim)
text = einops.rearrange(
text,
"B (L D) -> B L D",
L=self.text_encoder_seq_len,
- D=self.text_encoder_text_dim, )
+ D=self.text_encoder_text_dim,
+ )
return img_vae, img_clip, text
def _combine_joint(self, img_vae, img_clip, text):
@@ -238,34 +233,29 @@ def _combine_joint(self, img_vae, img_clip, text):
# Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def encode_text_latents(
- self,
- prompt,
- num_images_per_prompt,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ negative_prompt=None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ ):
if prompt_embeds is None:
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
prompt_embeds = self.text_encoder(text_inputs.input_ids)[0]
return prompt_embeds
# Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents
- def encode_image_vae_latents(self,
- image,
- batch_size,
- num_images_per_prompt,
- dtype,
- generator=None):
+ def encode_image_vae_latents(self, image, batch_size, num_images_per_prompt, dtype, generator=None):
if not isinstance(image, paddle.Tensor):
- raise ValueError(
- f"`image` has to be of type `paddle.Tensor`, but is {type(image)}"
- )
+ raise ValueError(f"`image` has to be of type `paddle.Tensor`, but is {type(image)}")
image = image.cast(dtype)
batch_size = batch_size * num_images_per_prompt
@@ -278,17 +268,14 @@ def encode_image_vae_latents(self,
# vae encode
if isinstance(generator, list):
image_latents = [
- self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i])
- * self.vae.scaling_factor for i in range(batch_size)
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) * self.vae.scaling_factor
+ for i in range(batch_size)
]
image_latents = paddle.concat(image_latents, axis=0)
else:
- image_latents = (
- self.vae.encode(image).latent_dist.sample(generator) *
- self.vae.scaling_factor)
+ image_latents = self.vae.encode(image).latent_dist.sample(generator) * self.vae.scaling_factor
- if (batch_size > image_latents.shape[0] and
- batch_size % image_latents.shape[0] != 0):
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
)
@@ -299,22 +286,20 @@ def encode_image_vae_latents(self,
# Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents
def encode_image_clip_latents(
- self,
- image,
- batch_size,
- num_images_per_prompt,
- dtype, ):
+ self,
+ image,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ ):
batch_size = batch_size * num_images_per_prompt
# clip encode
- inputs = self.image_feature_extractor(
- images=Image.fromarray(image), return_tensors="pd").pixel_values
+ inputs = self.image_feature_extractor(images=Image.fromarray(image), return_tensors="pd").pixel_values
# TODO junnyu, support float16 we need cast dtype
- image_latents = self.image_encoder(
- inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1)
+ image_latents = self.image_encoder(inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1)
- if (batch_size > image_latents.shape[0] and
- batch_size % image_latents.shape[0] != 0):
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
)
@@ -333,13 +318,7 @@ def decode_image_latents(self, latents):
return image
# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
- def prepare_text_latents(self,
- batch_size,
- seq_len,
- hidden_size,
- dtype,
- generator,
- latents=None):
+ def prepare_text_latents(self, batch_size, seq_len, hidden_size, dtype, generator, latents=None):
# Prepare text latents for the CLIP embedded prompt.
shape = [batch_size, seq_len, hidden_size]
if isinstance(generator, list) and len(generator) != batch_size:
@@ -357,14 +336,15 @@ def prepare_text_latents(self,
# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_image_vae_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
# Prepare latents for the VAE embedded image.
shape = [
batch_size,
@@ -386,12 +366,7 @@ def prepare_image_vae_latents(
return latents
# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
- def prepare_image_clip_latents(self,
- batch_size,
- clip_img_dim,
- dtype,
- generator,
- latents=None):
+ def prepare_image_clip_latents(self, batch_size, clip_img_dim, dtype, generator, latents=None):
# Prepare latents for the CLIP embedded image.
shape = [batch_size, 1, clip_img_dim]
if isinstance(generator, list) and len(generator) != batch_size:
@@ -408,66 +383,61 @@ def prepare_image_clip_latents(self,
return latents
def get_noise_pred(
- self,
- mode,
- latents,
- t,
- img_vae,
- img_clip,
- prompt_embeds,
- N,
- guidance_scale,
- height,
- width,
- data_type=1,
- generator=None, ):
+ self,
+ mode,
+ latents,
+ t,
+ img_vae,
+ img_clip,
+ prompt_embeds,
+ N,
+ guidance_scale,
+ height,
+ width,
+ data_type=1,
+ generator=None,
+ ):
dtype = self.unet.dtype
if mode == "joint":
- img_vae_latents, img_clip_latents, text_latents = self._split_joint(
- latents, height, width)
+ img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width)
img_vae_out, img_clip_out, text_out = self.unet(
img=img_vae_latents,
clip_img=img_clip_latents,
text=text_latents,
t_img=t,
t_text=t,
- data_type=paddle.zeros_like(
- t, dtype=paddle.int32) + data_type, )
+ data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type,
+ )
x_out = self._combine_joint(img_vae_out, img_clip_out, text_out)
if guidance_scale == 0.0:
return x_out
- img_vae_T = randn_tensor(
- img_vae.shape, generator=generator, dtype=dtype)
- img_clip_T = randn_tensor(
- img_clip.shape, generator=generator, dtype=dtype)
+ img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype)
+ img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype)
_, _, text_out_uncond = self.unet(
img=img_vae_T,
clip_img=img_clip_T,
text=text_latents,
t_img=paddle.ones_like(t) * N,
t_text=t,
- data_type=paddle.zeros_like(
- t, dtype=paddle.int32) + data_type, )
- text_T = randn_tensor(
- prompt_embeds.shape, generator=generator, dtype=dtype)
+ data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type,
+ )
+ text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype)
img_vae_out_uncond, img_clip_out_uncond, _ = self.unet(
img=img_vae_latents,
clip_img=img_clip_latents,
text=text_T,
t_img=t,
t_text=paddle.ones_like(t) * N,
- data_type=paddle.zeros_like(
- t, dtype=paddle.int32) + data_type, )
- x_out_uncond = self._combine_joint(
- img_vae_out_uncond, img_clip_out_uncond, text_out_uncond)
+ data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type,
+ )
+ x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond)
return x_out + guidance_scale * (x_out - x_out_uncond)
elif mode == "t2i":
- img_vae_latents, img_clip_latents = self._split(latents, height,
- width)
+ img_vae_latents, img_clip_latents = self._split(latents, height, width)
t_text = paddle.zeros([t.shape[0]], dtype=paddle.int32)
img_vae_out, img_clip_out, text_out = self.unet(
img=img_vae_latents,
@@ -475,25 +445,23 @@ def get_noise_pred(
text=prompt_embeds,
t_img=t,
t_text=t_text,
- data_type=paddle.zeros_like(
- t_text, dtype=paddle.int32) + data_type, )
+ data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type,
+ )
img_out = self._combine(img_vae_out, img_clip_out)
if guidance_scale == 0.0:
return img_out
- text_T = randn_tensor(
- prompt_embeds.shape, generator=generator, dtype=dtype)
+ text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype)
img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet(
img=img_vae_latents,
clip_img=img_clip_latents,
text=text_T,
t_img=t,
t_text=paddle.ones_like(t) * N,
- data_type=paddle.zeros_like(
- t_text, dtype=paddle.int32) + data_type, )
- img_out_uncond = self._combine(img_vae_out_uncond,
- img_clip_out_uncond)
+ data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type,
+ )
+ img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond)
return img_out + guidance_scale * (img_out - img_out_uncond)
@@ -505,23 +473,21 @@ def get_noise_pred(
text=latents,
t_img=t_img,
t_text=t,
- data_type=paddle.zeros_like(
- t_img, dtype=paddle.int32) + data_type, )
+ data_type=paddle.zeros_like(t_img, dtype=paddle.int32) + data_type,
+ )
if guidance_scale == 0.0:
return text_out
- img_vae_T = randn_tensor(
- img_vae.shape, generator=generator, dtype=dtype)
- img_clip_T = randn_tensor(
- img_clip.shape, generator=generator, dtype=dtype)
+ img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype)
+ img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype)
img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet(
img=img_vae_T,
clip_img=img_clip_T,
text=latents,
t_img=paddle.ones_like(t) * N,
t_text=t,
- data_type=paddle.zeros_like(
- t, dtype=paddle.int32) + data_type, )
+ data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type,
+ )
return text_out + guidance_scale * (text_out - text_out_uncond)
elif mode == "t":
@@ -531,13 +497,12 @@ def get_noise_pred(
text=latents,
t_img=paddle.ones_like(t) * N,
t_text=t,
- data_type=paddle.zeros_like(
- t, dtype=paddle.int32) + data_type, )
+ data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type,
+ )
return text_out
elif mode == "i":
- img_vae_latents, img_clip_latents = self._split(latents, height,
- width)
+ img_vae_latents, img_clip_latents = self._split(latents, height, width)
t_text = paddle.ones_like(t) * N
img_vae_out, img_clip_out, text_out = self.unet(
img=img_vae_latents,
@@ -545,8 +510,8 @@ def get_noise_pred(
text=prompt_embeds,
t_img=t,
t_text=t_text,
- data_type=paddle.zeros_like(
- t_text, dtype=paddle.int32) + data_type, )
+ data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type,
+ )
img_out = self._combine(img_vae_out, img_clip_out)
return img_out
@@ -557,36 +522,34 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def _denoising_sample_fn(
- self,
- mode,
- image_vae_latents,
- image_clip_latents,
- prompt_embeds,
- num_inference_steps,
- extra_step_kwargs,
- guidance_scale,
- height,
- width,
- callback,
- callback_steps, ):
+ self,
+ mode,
+ image_vae_latents,
+ image_clip_latents,
+ prompt_embeds,
+ num_inference_steps,
+ extra_step_kwargs,
+ guidance_scale,
+ height,
+ width,
+ callback,
+ callback_steps,
+ ):
# Prepare latent variables
if mode == "joint":
- latents = self._combine_joint(image_vae_latents, image_clip_latents,
- prompt_embeds)
+ latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds)
elif mode in ["t2i", "i"]:
latents = self._combine(image_vae_latents, image_clip_latents)
elif mode in ["i2t", "t"]:
@@ -599,8 +562,7 @@ def _denoising_sample_fn(
timesteps = self.scheduler.timesteps
N = self.scheduler.config.num_train_timesteps
- num_warmup_steps = len(
- timesteps) - num_inference_steps * self.scheduler.order
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
noise_pred = self.get_noise_pred(
@@ -613,27 +575,23 @@ def _denoising_sample_fn(
N,
guidance_scale,
height,
- width, )
+ width,
+ )
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or (
- (i + 1) > num_warmup_steps and
- (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
if mode == "joint":
- image_vae_latents, image_clip_latents, text_latents = self._split_joint(
- latents, height, width)
+ image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width)
return image_vae_latents, image_clip_latents, text_latents
elif mode in ["t2i", "i"]:
- image_vae_latents, image_clip_latents = self._split(latents, height,
- width)
+ image_vae_latents, image_clip_latents = self._split(latents, height, width)
return image_vae_latents, image_clip_latents
elif mode in ["i2t", "t"]:
text_latents = latents
@@ -641,32 +599,32 @@ def _denoising_sample_fn(
@paddle.no_grad()
def __call__(
- self,
- mode: str="t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t
- image: Optional[Union[paddle.Tensor, PIL.Image.Image]]=None,
- prompt: Optional[Union[str, List[str]]]=None,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.0,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- num_prompts_per_image: Optional[int]=1,
- num_samples: int=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- prompt_latents: Optional[paddle.Tensor]=None,
- vae_latents: Optional[paddle.Tensor]=None,
- clip_latents: Optional[paddle.Tensor]=None,
- prompt_embeds: Optional[paddle.Tensor]=None,
- negative_prompt_embeds: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- use_beam_search: Optional[bool]=True,
- **kwargs, ):
+ self,
+ mode: str = "t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t
+ image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None,
+ prompt: Optional[Union[str, List[str]]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ num_prompts_per_image: Optional[int] = 1,
+ num_samples: int = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ prompt_latents: Optional[paddle.Tensor] = None,
+ vae_latents: Optional[paddle.Tensor] = None,
+ clip_latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ use_beam_search: Optional[bool] = True,
+ **kwargs,
+ ):
# 0. Default height and width to unet
height = height or self.unet.config.img_size * self.vae_scale_factor
width = width or self.unet.config.img_size * self.vae_scale_factor
@@ -679,8 +637,7 @@ def __call__(
self.check_inputs([prompt], height, width, callback_steps)
# 2. Define call parameters
- batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds,
- num_samples)
+ batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds, num_samples)
# 3. Encode input prompt if available; otherwise prepare text latents
if mode in ["t2i", "t2i2t"]:
@@ -691,7 +648,8 @@ def __call__(
num_images_per_prompt,
negative_prompt,
prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds, )
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
# Encode contexts to lower text dim, 768 -> 64
prompt_embeds = self.unet.encode_prefix(prompt_embeds)
else:
@@ -700,10 +658,10 @@ def __call__(
batch_size,
self.text_encoder_seq_len,
self.text_encoder_text_dim,
- paddle.
- float32, # Placeholder, need to determine correct thing to do for dtype
+ paddle.float32, # Placeholder, need to determine correct thing to do for dtype
generator,
- prompt_latents, )
+ prompt_latents,
+ )
# 4. Encode input image if available; otherwise prepare image latents
if mode in ["i2t", "i2t2i"]:
@@ -716,7 +674,8 @@ def __call__(
image_crop,
batch_size,
num_prompts_per_image, # not num_images_per_prompt
- prompt_embeds.dtype, )
+ prompt_embeds.dtype,
+ )
# Encode image using VAE
image_vae = (image_crop / 127.5 - 1.0).astype(np.float32)
image_vae = einops.rearrange(image_vae, "h w c -> 1 c h w")
@@ -725,7 +684,8 @@ def __call__(
batch_size,
num_prompts_per_image, # not num_images_per_prompt
prompt_embeds.dtype,
- generator, )
+ generator,
+ )
else:
# 4.2. Prepare image latent variables, if necessary
@@ -735,7 +695,8 @@ def __call__(
self.image_encoder_clip_img_dim,
prompt_embeds.dtype,
generator,
- clip_latents, )
+ clip_latents,
+ )
# Prepare image VAE latents
image_vae_latents = self.prepare_image_vae_latents(
batch_size * num_images_per_prompt,
@@ -744,7 +705,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- vae_latents, )
+ vae_latents,
+ )
# 5. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -762,7 +724,8 @@ def __call__(
height,
width,
callback,
- callback_steps, )
+ callback_steps,
+ )
elif mode in ["i2t2i"]:
# 'i2t2i' should do 'i2t' first
outs = self._denoising_sample_fn(
@@ -776,7 +739,8 @@ def __call__(
height,
width,
callback,
- callback_steps, )
+ callback_steps,
+ )
elif mode in ["t2i2t"]:
# 't2i2t' should do 't2i' first
outs = self._denoising_sample_fn(
@@ -790,7 +754,8 @@ def __call__(
height,
width,
callback,
- callback_steps, )
+ callback_steps,
+ )
else:
raise ValueError
@@ -800,9 +765,8 @@ def __call__(
image_vae_latents, image_clip_latents, text_latents = outs
gen_image = self.decode_image_latents(image_vae_latents)
gen_text = self.caption_decoder.generate_captions(
- self.caption_tokenizer,
- text_latents,
- use_beam_search=use_beam_search)
+ self.caption_tokenizer, text_latents, use_beam_search=use_beam_search
+ )
elif mode in ["t2i", "i", "t2i2t"]:
image_vae_latents, image_clip_latents = outs
@@ -814,10 +778,10 @@ def __call__(
batch_size,
self.text_encoder_seq_len,
self.text_encoder_text_dim,
- paddle.
- float32, # Placeholder, need to determine correct thing to do for dtype
+ paddle.float32, # Placeholder, need to determine correct thing to do for dtype
generator,
- prompt_latents, )
+ prompt_latents,
+ )
text_latents = self._denoising_sample_fn(
"i2t",
image_vae_latents,
@@ -829,11 +793,13 @@ def __call__(
height,
width,
callback,
- callback_steps, )
+ callback_steps,
+ )
gen_text = self.caption_decoder.generate_captions(
self.caption_tokenizer,
text_latents,
- use_beam_search=use_beam_search, )
+ use_beam_search=use_beam_search,
+ )
elif mode in ["i2t", "t", "i2t2i"]:
text_latents = outs
@@ -841,7 +807,8 @@ def __call__(
gen_text = self.caption_decoder.generate_captions(
self.caption_tokenizer,
text_latents,
- use_beam_search=use_beam_search, )
+ use_beam_search=use_beam_search,
+ )
else:
# 'i2t2i' should do 't2i' later
# Prepare image CLIP latents
@@ -850,7 +817,8 @@ def __call__(
self.image_encoder_clip_img_dim,
prompt_embeds.dtype,
generator,
- clip_latents, )
+ clip_latents,
+ )
# Prepare image VAE latents
image_vae_latents = self.prepare_image_vae_latents(
batch_size * num_images_per_prompt,
@@ -859,7 +827,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- vae_latents, )
+ vae_latents,
+ )
image_vae_latents, image_clip_latents = self._denoising_sample_fn(
"t2i",
image_vae_latents,
@@ -871,7 +840,8 @@ def __call__(
height,
width,
callback,
- callback_steps, )
+ callback_steps,
+ )
gen_image = self.decode_image_latents(image_vae_latents)
# 8. Convert gen_image to PIL, gen_text has no else processing
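A note on the guided modes shown in the hunks above: in `get_noise_pred`, the `joint`, `t2i`, and `i2t` branches each compute a conditional prediction and a second prediction whose counterpart inputs are replaced by Gaussian noise at timestep `N`, then combine the two with the same extrapolation rule. The following is a minimal sketch of that rule in isolation; it is not part of the patch, and the function name and dummy tensors are purely illustrative.

import paddle


def apply_guidance(cond_out, uncond_out, guidance_scale):
    # guidance_scale == 0.0 short-circuits to the conditional output, just as the
    # branches above return early; otherwise extrapolate away from the
    # noise-conditioned ("unconditional") prediction.
    if guidance_scale == 0.0:
        return cond_out
    return cond_out + guidance_scale * (cond_out - uncond_out)


# Tiny usage check with dummy tensors: 1 + 7 * (1 - 0) == 8 elementwise.
cond = paddle.ones([1, 4])
uncond = paddle.zeros([1, 4])
print(apply_guidance(cond, uncond, 7.0))
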
diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py
index ac2ddb173413d..309b32b2d1129 100644
--- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py
@@ -13,8 +13,11 @@
# limitations under the License.
# flake8: noqa
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available,
- is_paddlenlp_available)
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
try:
if not (is_paddlenlp_available() and is_paddle_available()):
@@ -22,14 +25,19 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_and_paddlenlp_objects import (
VersatileDiffusionDualGuidedPipeline,
- VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline,
- VersatileDiffusionTextToImagePipeline)
+ VersatileDiffusionImageVariationPipeline,
+ VersatileDiffusionPipeline,
+ VersatileDiffusionTextToImagePipeline,
+ )
else:
from .modeling_text_unet import UNetFlatConditionModel
from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
- from .pipeline_versatile_diffusion_dual_guided import \
- VersatileDiffusionDualGuidedPipeline
- from .pipeline_versatile_diffusion_image_variation import \
- VersatileDiffusionImageVariationPipeline
- from .pipeline_versatile_diffusion_text_to_image import \
- VersatileDiffusionTextToImagePipeline
+ from .pipeline_versatile_diffusion_dual_guided import (
+ VersatileDiffusionDualGuidedPipeline,
+ )
+ from .pipeline_versatile_diffusion_image_variation import (
+ VersatileDiffusionImageVariationPipeline,
+ )
+ from .pipeline_versatile_diffusion_text_to_image import (
+ VersatileDiffusionTextToImagePipeline,
+ )
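The `__init__.py` change above keeps the optional-dependency guard: when paddle or paddlenlp is unavailable, the dummy classes are exported in place of the real pipelines, so the intent is that importing the package still succeeds and a clear error only surfaces once a pipeline is actually used. Roughly, and with a hypothetical stub class standing in for the generated `dummy_paddle_and_paddlenlp_objects`, the pattern looks like this sketch (not part of the patch).

# Illustrative fallback pattern; the real module uses is_paddle_available(),
# is_paddlenlp_available(), and OptionalDependencyNotAvailable instead.
try:
    import paddle  # noqa: F401
    import paddlenlp  # noqa: F401

    _BACKENDS_AVAILABLE = True
except ImportError:
    _BACKENDS_AVAILABLE = False

if _BACKENDS_AVAILABLE:
    pass  # in the actual module, the real pipeline classes are imported here
else:
    class VersatileDiffusionPipeline:  # hypothetical stand-in that fails loudly on use
        def __init__(self, *args, **kwargs):
            raise ImportError("VersatileDiffusionPipeline requires paddle and paddlenlp")
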
diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index 69099f5186cf6..377ab850f1e93 100644
--- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -24,11 +24,13 @@
from ...configuration_utils import ConfigMixin, register_to_config
from ...models import ModelMixin
from ...models.attention import Attention
-from ...models.attention_processor import (AttentionProcessor,
- AttnAddedKVProcessor, AttnProcessor)
+from ...models.attention_processor import (
+ AttentionProcessor,
+ AttnAddedKVProcessor,
+ AttnProcessor,
+)
from ...models.dual_transformer_2d import DualTransformer2DModel
-from ...models.embeddings import (GaussianFourierProjection, TimestepEmbedding,
- Timesteps)
+from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
from ...models.transformer_2d import Transformer2DModel
from ...models.unet_2d_condition import UNet2DConditionOutput
from ...utils import NEG_INF, deprecate, logging
@@ -37,30 +39,29 @@
def get_down_block(
- down_block_type,
- num_layers,
- in_channels,
- out_channels,
- temb_channels,
- add_downsample,
- resnet_eps,
- resnet_act_fn,
- attn_num_head_channels,
- resnet_groups=None,
- cross_attention_dim=None,
- downsample_padding=None,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False,
- resnet_time_scale_shift="default",
- resnet_skip_time_act=False, # HF missing in v0.16.1
- resnet_out_scale_factor=1.0, # HF missing in v0.16.1
- cross_attention_norm=None, # HF missing in v0.16.1
- resnet_pre_temb_non_linearity: bool=False, ):
- down_block_type = (down_block_type[7:]
- if down_block_type.startswith("UNetRes") else
- down_block_type)
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ downsample_padding=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False, # HF missing in v0.16.1
+ resnet_out_scale_factor=1.0, # HF missing in v0.16.1
+ cross_attention_norm=None, # HF missing in v0.16.1
+ resnet_pre_temb_non_linearity: bool = False,
+):
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
if down_block_type == "DownBlockFlat":
return DownBlockFlat(
num_layers=num_layers,
@@ -73,12 +74,11 @@ def get_down_block(
resnet_groups=resnet_groups,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif down_block_type == "CrossAttnDownBlockFlat":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for CrossAttnDownBlockFlat"
- )
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat")
return CrossAttnDownBlockFlat(
num_layers=num_layers,
in_channels=in_channels,
@@ -95,34 +95,35 @@ def get_down_block(
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
raise ValueError(f"{down_block_type} is not supported.")
def get_up_block(
- up_block_type,
- num_layers,
- in_channels,
- out_channels,
- prev_output_channel,
- temb_channels,
- add_upsample,
- resnet_eps,
- resnet_act_fn,
- attn_num_head_channels,
- resnet_groups=None,
- cross_attention_dim=None,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False,
- resnet_time_scale_shift="default",
- resnet_skip_time_act=False, # HF missing in v0.16.1
- resnet_out_scale_factor=1.0, # HF missing in v0.16.1
- cross_attention_norm=None, # HF missing in v0.16.1
- resnet_pre_temb_non_linearity: bool=False, ):
- up_block_type = (up_block_type[7:]
- if up_block_type.startswith("UNetRes") else up_block_type)
+ up_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ prev_output_channel,
+ temb_channels,
+ add_upsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False, # HF missing in v0.16.1
+ resnet_out_scale_factor=1.0, # HF missing in v0.16.1
+ cross_attention_norm=None, # HF missing in v0.16.1
+ resnet_pre_temb_non_linearity: bool = False,
+):
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
if up_block_type == "UpBlockFlat":
return UpBlockFlat(
num_layers=num_layers,
@@ -135,11 +136,11 @@ def get_up_block(
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif up_block_type == "CrossAttnUpBlockFlat":
if cross_attention_dim is None:
- raise ValueError(
- "cross_attention_dim must be specified for CrossAttnUpBlockFlat")
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockFlat")
return CrossAttnUpBlockFlat(
num_layers=num_layers,
in_channels=in_channels,
@@ -156,7 +157,8 @@ def get_up_block(
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
raise ValueError(f"{up_block_type} is not supported.")
@@ -236,54 +238,57 @@ class conditioning with `class_embed_type` equal to `None`.
@register_to_config
def __init__(
- self,
- sample_size: Optional[int]=None,
- in_channels: int=4,
- out_channels: int=4,
- center_input_sample: bool=False,
- flip_sin_to_cos: bool=True,
- freq_shift: int=0,
- down_block_types: Tuple[str]=(
- "CrossAttnDownBlockFlat",
- "CrossAttnDownBlockFlat",
- "CrossAttnDownBlockFlat",
- "DownBlockFlat", ),
- mid_block_type: Optional[str]="UNetMidBlockFlatCrossAttn",
- up_block_types: Tuple[str]=(
- "UpBlockFlat",
- "CrossAttnUpBlockFlat",
- "CrossAttnUpBlockFlat",
- "CrossAttnUpBlockFlat", ),
- only_cross_attention: Union[bool, Tuple[bool]]=False,
- block_out_channels: Tuple[int]=(320, 640, 1280, 1280),
- layers_per_block: Union[int, Tuple[int]]=2,
- downsample_padding: int=1,
- mid_block_scale_factor: float=1,
- act_fn: str="silu",
- norm_num_groups: Optional[int]=32,
- norm_eps: float=1e-5,
- cross_attention_dim: Union[int, Tuple[int]]=1280,
- encoder_hid_dim: Optional[int]=None,
- attention_head_dim: Union[int, Tuple[int]]=8,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- class_embed_type: Optional[str]=None,
- num_class_embeds: Optional[int]=None,
- upcast_attention: bool=False,
- resnet_time_scale_shift: str="default",
- resnet_skip_time_act: bool=False,
- resnet_out_scale_factor: int=1.0,
- time_embedding_type: str="positional", # fourier, positional
- time_embedding_act_fn: Optional[str]=None,
- timestep_post_act: Optional[str]=None,
- time_cond_proj_dim: Optional[int]=None,
- conv_in_kernel: int=3,
- conv_out_kernel: int=3,
- projection_class_embeddings_input_dim: Optional[int]=None,
- class_embeddings_concat: bool=False,
- mid_block_only_cross_attention: Optional[bool]=None,
- cross_attention_norm: Optional[str]=None,
- resnet_pre_temb_non_linearity: Optional[bool]=False, ):
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 4,
+ out_channels: int = 4,
+ center_input_sample: bool = False,
+ flip_sin_to_cos: bool = True,
+ freq_shift: int = 0,
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlockFlat",
+ "CrossAttnDownBlockFlat",
+ "CrossAttnDownBlockFlat",
+ "DownBlockFlat",
+ ),
+ mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn",
+ up_block_types: Tuple[str] = (
+ "UpBlockFlat",
+ "CrossAttnUpBlockFlat",
+ "CrossAttnUpBlockFlat",
+ "CrossAttnUpBlockFlat",
+ ),
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: Union[int, Tuple[int]] = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: Optional[int] = 32,
+ norm_eps: float = 1e-5,
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
+ encoder_hid_dim: Optional[int] = None,
+ attention_head_dim: Union[int, Tuple[int]] = 8,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ class_embed_type: Optional[str] = None,
+ num_class_embeds: Optional[int] = None,
+ upcast_attention: bool = False,
+ resnet_time_scale_shift: str = "default",
+ resnet_skip_time_act: bool = False,
+ resnet_out_scale_factor: int = 1.0,
+ time_embedding_type: str = "positional", # fourier, positional
+ time_embedding_act_fn: Optional[str] = None,
+ timestep_post_act: Optional[str] = None,
+ time_cond_proj_dim: Optional[int] = None,
+ conv_in_kernel: int = 3,
+ conv_out_kernel: int = 3,
+ projection_class_embeddings_input_dim: Optional[int] = None,
+ class_embeddings_concat: bool = False,
+ mid_block_only_cross_attention: Optional[bool] = None,
+ cross_attention_norm: Optional[str] = None,
+ resnet_pre_temb_non_linearity: Optional[bool] = False,
+ ):
super().__init__()
self.sample_size = sample_size
@@ -292,7 +297,8 @@ def __init__(
if len(down_block_types) != len(up_block_types):
raise ValueError(
"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`:"
- f" {down_block_types}. `up_block_types`: {up_block_types}.")
+ f" {down_block_types}. `up_block_types`: {up_block_types}."
+ )
if len(block_out_channels) != len(down_block_types):
raise ValueError(
@@ -300,35 +306,28 @@ def __init__(
f" {block_out_channels}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- only_cross_attention,
- bool) and len(only_cross_attention) != len(down_block_types):
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
raise ValueError(
"Must provide the same number of `only_cross_attention` as `down_block_types`."
f" `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- attention_head_dim,
- int) and len(attention_head_dim) != len(down_block_types):
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
raise ValueError(
"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`:"
f" {attention_head_dim}. `down_block_types`: {down_block_types}."
)
- if isinstance(
- cross_attention_dim,
- list) and len(cross_attention_dim) != len(down_block_types):
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
raise ValueError(
"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`:"
f" {cross_attention_dim}. `down_block_types`: {down_block_types}."
)
- if not isinstance(
- layers_per_block,
- int) and len(layers_per_block) != len(down_block_types):
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
raise ValueError(
"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`:"
- f" {layers_per_block}. `down_block_types`: {down_block_types}.")
+ f" {layers_per_block}. `down_block_types`: {down_block_types}."
+ )
# input
conv_in_padding = (conv_in_kernel - 1) // 2
@@ -336,26 +335,25 @@ def __init__(
in_channels,
block_out_channels[0],
kernel_size=conv_in_kernel,
- padding=conv_in_padding, )
+ padding=conv_in_padding,
+ )
# time
if time_embedding_type == "fourier":
time_embed_dim = block_out_channels[0] * 2
if time_embed_dim % 2 != 0:
- raise ValueError(
- f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}."
- )
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
self.time_proj = GaussianFourierProjection(
time_embed_dim // 2,
set_W_to_weight=False,
log=False,
- flip_sin_to_cos=flip_sin_to_cos, )
+ flip_sin_to_cos=flip_sin_to_cos,
+ )
timestep_input_dim = time_embed_dim
elif time_embedding_type == "positional":
time_embed_dim = block_out_channels[0] * 4
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos,
- freq_shift)
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
timestep_input_dim = block_out_channels[0]
else:
raise ValueError(
@@ -367,20 +365,18 @@ def __init__(
time_embed_dim,
act_fn=act_fn,
post_act_fn=timestep_post_act,
- cond_proj_dim=time_cond_proj_dim, )
+ cond_proj_dim=time_cond_proj_dim,
+ )
if encoder_hid_dim is not None:
- self.encoder_hid_proj = nn.Linear(encoder_hid_dim,
- cross_attention_dim)
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
else:
self.encoder_hid_proj = None
# class embedding
if class_embed_type is None and num_class_embeds is not None:
- self.class_embedding = nn.Embedding(num_class_embeds,
- time_embed_dim)
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
elif class_embed_type == "timestep":
- self.class_embedding = TimestepEmbedding(timestep_input_dim,
- time_embed_dim)
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
elif class_embed_type == "identity":
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
elif class_embed_type == "projection":
@@ -395,15 +391,13 @@ def __init__(
# Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
# When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
# As a result, `TimestepEmbedding` can be passed arbitrary vectors.
- self.class_embedding = TimestepEmbedding(
- projection_class_embeddings_input_dim, time_embed_dim)
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
elif class_embed_type == "simple_projection":
if projection_class_embeddings_input_dim is None:
raise ValueError(
"`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
)
- self.class_embedding = nn.Linear(
- projection_class_embeddings_input_dim, time_embed_dim)
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
else:
self.class_embedding = None
@@ -418,8 +412,7 @@ def __init__(
elif time_embedding_act_fn == "gelu":
self.time_embed_act = nn.GELU()
else:
- raise ValueError(
- f"Unsupported activation function: {time_embedding_act_fn}")
+ raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}")
self.down_blocks = nn.LayerList([])
self.up_blocks = nn.LayerList([])
@@ -440,18 +433,16 @@ def __init__(
if mid_block_only_cross_attention is None:
mid_block_only_cross_attention = only_cross_attention
- only_cross_attention = [only_cross_attention] * len(
- down_block_types)
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
if mid_block_only_cross_attention is None:
mid_block_only_cross_attention = False
if isinstance(attention_head_dim, int):
- attention_head_dim = (attention_head_dim, ) * len(down_block_types)
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
if isinstance(cross_attention_dim, int):
- cross_attention_dim = (
- cross_attention_dim, ) * len(down_block_types)
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
if isinstance(layers_per_block, int):
layers_per_block = [layers_per_block] * len(down_block_types)
@@ -492,7 +483,8 @@ def __init__(
resnet_skip_time_act=resnet_skip_time_act,
resnet_out_scale_factor=resnet_out_scale_factor,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
self.down_blocks.append(down_block)
# mid
@@ -510,7 +502,8 @@ def __init__(
dual_cross_attention=dual_cross_attention,
use_linear_projection=use_linear_projection,
upcast_attention=upcast_attention,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn":
self.mid_block = UNetMidBlockFlatSimpleCrossAttn(
in_channels=block_out_channels[-1],
@@ -525,7 +518,8 @@ def __init__(
skip_time_act=resnet_skip_time_act,
only_cross_attention=mid_block_only_cross_attention,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
elif mid_block_type is None:
self.mid_block = None
else:
@@ -547,8 +541,7 @@ def __init__(
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
- input_channel = reversed_block_out_channels[min(
- i + 1, len(block_out_channels) - 1)]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
# add upsample block for all BUT final layer
if not is_final_block:
@@ -578,7 +571,8 @@ def __init__(
resnet_skip_time_act=resnet_skip_time_act,
resnet_out_scale_factor=resnet_out_scale_factor,
cross_attention_norm=cross_attention_norm,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
self.up_blocks.append(up_block)
prev_output_channel = output_channel
@@ -587,7 +581,8 @@ def __init__(
self.conv_norm_out = nn.GroupNorm(
num_channels=block_out_channels[0],
num_groups=norm_num_groups,
- epsilon=norm_eps, )
+ epsilon=norm_eps,
+ )
self.conv_act = nn.Silu()
else:
self.conv_norm_out = None
@@ -598,16 +593,20 @@ def __init__(
block_out_channels[0],
out_channels,
kernel_size=conv_out_kernel,
- padding=conv_out_padding, )
+ padding=conv_out_padding,
+ )
@property
def in_channels(self):
deprecate(
"in_channels",
"1.0.0",
- ("Accessing `in_channels` directly via unet.in_channels is deprecated. Please use"
- " `unet.config.in_channels` instead"),
- standard_warn=False, )
+ (
+ "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use"
+ " `unet.config.in_channels` instead"
+ ),
+ standard_warn=False,
+ )
return self.config.in_channels
@property
@@ -620,16 +619,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
# set recursively
processors = {}
- def fn_recursive_add_processors(
- name: str,
- module: nn.Layer,
- processors: Dict[str, AttentionProcessor]):
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
if hasattr(module, "set_processor"):
processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
- fn_recursive_add_processors(f"{name}.{sub_name}", child,
- processors)
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
@@ -638,9 +633,7 @@ def fn_recursive_add_processors(
return processors
- def set_attn_processor(self,
- processor: Union[AttentionProcessor, Dict[
- str, AttentionProcessor]]):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Parameters:
`processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
@@ -665,8 +658,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
- fn_recursive_attn_processor(f"{name}.{sub_name}", child,
- processor)
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
@@ -714,8 +706,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
# make smallest slice possible
slice_size = num_sliceable_layers * [1]
- slice_size = (num_sliceable_layers * [slice_size]
- if not isinstance(slice_size, list) else slice_size)
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
if len(slice_size) != len(sliceable_head_dims):
raise ValueError(
@@ -727,14 +718,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
size = slice_size[i]
dim = sliceable_head_dims[i]
if size is not None and size > dim:
- raise ValueError(
- f"size {size} has to be smaller or equal to {dim}.")
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
# Recursively walk through all the children.
# Any children which exposes the set_attention_slice method
# gets the message
- def fn_recursive_set_attention_slice(module: nn.Layer,
- slice_size: List[int]):
+ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
if hasattr(module, "set_attention_slice"):
module.set_attention_slice(slice_size.pop())
@@ -747,24 +736,24 @@ def fn_recursive_set_attention_slice(module: nn.Layer,
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(
- module,
- (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat,
- UpBlockFlat), ):
+ module,
+ (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, UpBlockFlat),
+ ):
module.gradient_checkpointing = value
def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- class_labels: Optional[paddle.Tensor]=None,
- timestep_cond: Optional[paddle.Tensor]=None,
- attention_mask: Optional[paddle.Tensor]=None,
- cross_attention_kwargs: Optional[Dict[str, Any]]=None,
- down_block_additional_residuals: Optional[Tuple[
- paddle.Tensor]]=None,
- mid_block_additional_residual: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ) -> Union[UNet2DConditionOutput, Tuple]:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[UNet2DConditionOutput, Tuple]:
r"""
Args:
sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor
@@ -795,8 +784,7 @@ def forward(
upsample_size = None
if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
- logger.info(
- "Forward upsample size to force interpolation output size.")
+ logger.info("Forward upsample size to force interpolation output size.")
forward_upsample_size = True
# prepare attention_mask
@@ -816,7 +804,11 @@ def forward(
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timesteps = timesteps.expand([sample.shape[0], ])
+ timesteps = timesteps.expand(
+ [
+ sample.shape[0],
+ ]
+ )
t_emb = self.time_proj(timesteps)
# timesteps does not contain any weights and will always return f32 tensors
@@ -828,8 +820,7 @@ def forward(
if self.class_embedding is not None:
if class_labels is None:
- raise ValueError(
- "class_labels should be provided when num_class_embeds > 0")
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
# maybe cast it to float16
class_labels = class_labels.cast(self.dtype)
@@ -861,20 +852,15 @@ def forward(
# 3. down
- is_controlnet = (mid_block_additional_residual is not None and
- down_block_additional_residuals is not None)
- is_adapter = (mid_block_additional_residual is None and
- down_block_additional_residuals is not None)
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+ is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
- down_block_res_samples = (sample, )
+ down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
- if (hasattr(downsample_block, "has_cross_attention") and
- downsample_block.has_cross_attention):
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
additional_kwargs = {}
if is_adapter and len(down_block_additional_residuals) > 0:
- additional_kwargs[
- "additional_residuals"] = down_block_additional_residuals.pop(
- 0)
+ additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0)
sample, res_samples = downsample_block(
hidden_states=sample,
@@ -882,10 +868,10 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
- **additional_kwargs, )
+ **additional_kwargs,
+ )
else:
- sample, res_samples = downsample_block(
- hidden_states=sample, temb=emb)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
if is_adapter and len(down_block_additional_residuals) > 0:
sample += down_block_additional_residuals.pop(0)
@@ -896,10 +882,10 @@ def forward(
new_down_block_res_samples = ()
for down_block_res_sample, down_block_additional_residual in zip(
- down_block_res_samples, down_block_additional_residuals):
- down_block_res_sample = (
- down_block_res_sample + down_block_additional_residual)
- new_down_block_res_samples += (down_block_res_sample, )
+ down_block_res_samples, down_block_additional_residuals
+ ):
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
+ new_down_block_res_samples += (down_block_res_sample,)
down_block_res_samples = new_down_block_res_samples
# 4. mid
@@ -909,7 +895,8 @@ def forward(
emb,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs, )
+ cross_attention_kwargs=cross_attention_kwargs,
+ )
if is_controlnet:
sample = sample + mid_block_additional_residual
@@ -918,17 +905,15 @@ def forward(
for i, upsample_block in enumerate(self.up_blocks):
is_final_block = i == len(self.up_blocks) - 1
- res_samples = down_block_res_samples[-len(upsample_block.resnets):]
- down_block_res_samples = down_block_res_samples[:-len(
- upsample_block.resnets)]
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
# if we have not reached the final block and need to forward the
# upsample size, we do it here
if not is_final_block and forward_upsample_size:
upsample_size = down_block_res_samples[-1].shape[2:]
- if (hasattr(upsample_block, "has_cross_attention") and
- upsample_block.has_cross_attention):
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
sample = upsample_block(
hidden_states=sample,
temb=emb,
@@ -936,13 +921,15 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
upsample_size=upsample_size,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
else:
sample = upsample_block(
hidden_states=sample,
temb=emb,
res_hidden_states_tuple=res_samples,
- upsample_size=upsample_size, )
+ upsample_size=upsample_size,
+ )
# 6. post-process
if self.conv_norm_out:
sample = self.conv_norm_out(sample)
@@ -950,72 +937,60 @@ def forward(
sample = self.conv_out(sample)
if not return_dict:
- return (sample, )
+ return (sample,)
return UNet2DConditionOutput(sample=sample)
class LinearMultiDim(nn.Linear):
- def __init__(self,
- in_features,
- out_features=None,
- second_dim=4,
- *args,
- **kwargs):
- in_features = ([in_features, second_dim, 1]
- if isinstance(in_features, int) else list(in_features))
+ def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs):
+ in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features)
if out_features is None:
out_features = in_features
- out_features = ([out_features, second_dim, 1] if
- isinstance(out_features, int) else list(out_features))
+ out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features)
self.in_features_multidim = in_features
self.out_features_multidim = out_features
self.n_dim = len(self.in_features_multidim)
- super().__init__(
- np.array(in_features).prod(), np.array(out_features).prod())
+ super().__init__(np.array(in_features).prod(), np.array(out_features).prod())
self.in_features = self.weight.shape[0]
def forward(self, input_tensor, *args, **kwargs):
shape = input_tensor.shape
- input_tensor = input_tensor.reshape(
- [*shape[0:-self.n_dim], self.in_features])
+ input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_features])
output_tensor = super().forward(input_tensor)
- output_tensor = output_tensor.reshape(
- [*shape[0:-self.n_dim], *self.out_features_multidim])
+ output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_features_multidim])
return output_tensor
class ResnetBlockFlat(nn.Layer):
def __init__(
- self,
- *,
- in_channels,
- out_channels=None,
- dropout: float=0.0,
- temb_channels: int=512,
- groups: int=32,
- groups_out=None,
- pre_norm: bool=True,
- eps: float=1e-6,
- time_embedding_norm: str="default",
- use_in_shortcut=None,
- second_dim: int=4,
- pre_temb_non_linearity: bool=False,
- **kwargs, ):
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ dropout: float = 0.0,
+ temb_channels: int = 512,
+ groups: int = 32,
+ groups_out=None,
+ pre_norm: bool = True,
+ eps: float = 1e-6,
+ time_embedding_norm: str = "default",
+ use_in_shortcut=None,
+ second_dim: int = 4,
+ pre_temb_non_linearity: bool = False,
+ **kwargs,
+ ):
super().__init__()
self.pre_temb_non_linearity = pre_temb_non_linearity
self.pre_norm = pre_norm
self.pre_norm = True
- in_channels = ([in_channels, second_dim, 1]
- if isinstance(in_channels, int) else list(in_channels))
+ in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels)
self.in_channels_prod = np.array(in_channels).prod()
self.channels_multidim = in_channels
if out_channels is not None:
- out_channels = ([out_channels, second_dim, 1]
- if isinstance(out_channels, int) else
- list(out_channels))
+ out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels)
out_channels_prod = np.array(out_channels).prod()
self.out_channels_multidim = out_channels
else:
@@ -1026,26 +1001,23 @@ def __init__(
if groups_out is None:
groups_out = groups
- self.norm1 = nn.GroupNorm(
- num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps)
- self.conv1 = nn.Conv2D(
- self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0)
+ self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps)
+ self.conv1 = nn.Conv2D(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0)
if temb_channels is not None:
self.time_emb_proj = nn.Linear(temb_channels, out_channels_prod)
else:
self.time_emb_proj = None
- self.norm2 = nn.GroupNorm(
- num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps)
+ self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps)
self.dropout = nn.Dropout(dropout)
- self.conv2 = nn.Conv2D(
- out_channels_prod, out_channels_prod, kernel_size=1, padding=0)
+ self.conv2 = nn.Conv2D(out_channels_prod, out_channels_prod, kernel_size=1, padding=0)
self.nonlinearity = nn.Silu()
- self.use_in_shortcut = (self.in_channels_prod != out_channels_prod
- if use_in_shortcut is None else use_in_shortcut)
+ self.use_in_shortcut = (
+ self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut
+ )
self.conv_shortcut = None
if self.use_in_shortcut:
@@ -1054,14 +1026,14 @@ def __init__(
out_channels_prod,
kernel_size=1,
stride=1,
- padding=0, )
+ padding=0,
+ )
self.n_dim = len(self.channels_multidim)
def forward(self, input_tensor, temb=None):
shape = input_tensor.shape
- input_tensor = input_tensor.reshape(
- [*shape[0:-self.n_dim], self.in_channels_prod, 1, 1])
+ input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_channels_prod, 1, 1])
input_tensor = input_tensor.reshape([-1, self.in_channels_prod, 1, 1])
hidden_states = input_tensor
@@ -1072,8 +1044,7 @@ def forward(self, input_tensor, temb=None):
if temb is not None and self.time_emb_proj is not None:
if not self.pre_temb_non_linearity:
- temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None,
- None]
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
else:
temb = self.time_emb_proj(temb)[:, :, None, None]
hidden_states = hidden_states + temb
@@ -1089,9 +1060,8 @@ def forward(self, input_tensor, temb=None):
output_tensor = input_tensor + hidden_states
- output_tensor = output_tensor.reshape([*shape[0:-self.n_dim], -1])
- output_tensor = output_tensor.reshape(
- [*shape[0:-self.n_dim], *self.out_channels_multidim])
+ output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], -1])
+ output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_channels_multidim])
return output_tensor
@@ -1099,21 +1069,22 @@ def forward(self, input_tensor, temb=None):
# Copied from ppdiffusers.models.unet_2d_blocks.DownBlock2D with DownBlock2D->DownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim
class DownBlockFlat(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_downsample: bool=True,
- downsample_padding: int=1,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_downsample: bool = True,
+ downsample_padding: int = 1,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
@@ -1131,19 +1102,24 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- LinearMultiDim(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ LinearMultiDim(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
@@ -1153,8 +1129,7 @@ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
- if (self.training and self.gradient_checkpointing and
- not hidden_states.stop_gradient):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
def create_custom_forward(module):
def custom_forward(*inputs):
@@ -1162,18 +1137,17 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
@@ -1181,27 +1155,28 @@ def custom_forward(*inputs):
# Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnDownBlock2D with CrossAttnDownBlock2D->CrossAttnDownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim
class CrossAttnDownBlockFlat(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- cross_attention_dim: int=1280,
- output_scale_factor: float=1.0,
- downsample_padding: int=1,
- add_downsample: bool=True,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- only_cross_attention: bool=False,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ cross_attention_dim: int = 1280,
+ output_scale_factor: float = 1.0,
+ downsample_padding: int = 1,
+ add_downsample: bool = True,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -1223,7 +1198,9 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
if not dual_cross_attention:
attentions.append(
Transformer2DModel(
@@ -1235,7 +1212,9 @@ def __init__(
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
else:
attentions.append(
DualTransformer2DModel(
@@ -1244,32 +1223,38 @@ def __init__(
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_downsample:
- self.downsamplers = nn.LayerList([
- LinearMultiDim(
- out_channels,
- use_conv=True,
- out_channels=out_channels,
- padding=downsample_padding,
- name="op", )
- ])
+ self.downsamplers = nn.LayerList(
+ [
+ LinearMultiDim(
+ out_channels,
+ use_conv=True,
+ out_channels=out_channels,
+ padding=downsample_padding,
+ name="op",
+ )
+ ]
+ )
else:
self.downsamplers = None
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None,
- additional_residuals=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ additional_residuals=None,
+ ):
# TODO(Patrick, William) - attention mask is not used
output_states = ()
@@ -1285,22 +1270,22 @@ def custom_forward(*inputs):
return custom_forward
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(
- attn, return_dict=False),
+ create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
- cross_attention_kwargs, ) # [0]
+ cross_attention_kwargs,
+ ) # [0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
if additional_residuals is not None:
hidden_states += additional_residuals
@@ -1309,7 +1294,7 @@ def custom_forward(*inputs):
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states, )
+ output_states += (hidden_states,)
return hidden_states, output_states
@@ -1317,27 +1302,27 @@ def custom_forward(*inputs):
# Copied from ppdiffusers.models.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim
class UpBlockFlat(nn.Layer):
def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -1352,31 +1337,25 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- LinearMultiDim(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
self.gradient_checkpointing = False
- def forward(self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- upsample_size=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
if self.training and self.gradient_checkpointing:
@@ -1386,8 +1365,7 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
hidden_states = resnet(hidden_states, temb)
@@ -1401,27 +1379,28 @@ def custom_forward(*inputs):
# Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim
class CrossAttnUpBlockFlat(nn.Layer):
def __init__(
- self,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- cross_attention_dim: int=1280,
- output_scale_factor: float=1.0,
- add_upsample: bool=True,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- only_cross_attention: bool=False,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ out_channels: int,
+ prev_output_channel: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ cross_attention_dim: int = 1280,
+ output_scale_factor: float = 1.0,
+ add_upsample: bool = True,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
resnets = []
attentions = []
@@ -1430,8 +1409,7 @@ def __init__(
self.attn_num_head_channels = attn_num_head_channels
for i in range(num_layers):
- res_skip_channels = in_channels if (
- i == num_layers - 1) else out_channels
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
resnets.append(
@@ -1446,7 +1424,9 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
if not dual_cross_attention:
attentions.append(
Transformer2DModel(
@@ -1458,7 +1438,9 @@ def __init__(
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
else:
attentions.append(
DualTransformer2DModel(
@@ -1467,36 +1449,35 @@ def __init__(
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
if add_upsample:
- self.upsamplers = nn.LayerList([
- LinearMultiDim(
- out_channels, use_conv=True, out_channels=out_channels)
- ])
+ self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
self.gradient_checkpointing = False
def forward(
- self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None, ):
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ cross_attention_kwargs=None,
+ upsample_size=None,
+ attention_mask=None,
+ ):
# TODO(Patrick, William) - attention mask is not used
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
- hidden_states = paddle.concat(
- [hidden_states, res_hidden_states], axis=1)
+ hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
if self.training and self.gradient_checkpointing:
@@ -1509,20 +1490,20 @@ def custom_forward(*inputs):
return custom_forward
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
hidden_states = recompute(
- create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(
- attn, return_dict=False),
+ create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
- cross_attention_kwargs, ) # [0]
+ cross_attention_kwargs,
+ ) # [0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@@ -1534,29 +1515,29 @@ def custom_forward(*inputs):
# Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat
class UNetMidBlockFlatCrossAttn(nn.Layer):
def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- cross_attention_dim: int=1280,
- dual_cross_attention: bool=False,
- use_linear_projection: bool=False,
- upcast_attention: bool=False,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ cross_attention_dim: int = 1280,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ upcast_attention: bool = False,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.has_cross_attention = True
self.attn_num_head_channels = attn_num_head_channels
- resnet_groups = (resnet_groups if resnet_groups is not None else
- min(in_channels // 4, 32))
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
# there is always at least one resnet
resnets = [
@@ -1571,7 +1552,8 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
]
attentions = []
@@ -1586,7 +1568,9 @@ def __init__(
cross_attention_dim=cross_attention_dim,
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
- upcast_attention=upcast_attention, ))
+ upcast_attention=upcast_attention,
+ )
+ )
else:
attentions.append(
DualTransformer2DModel(
@@ -1595,7 +1579,9 @@ def __init__(
in_channels=in_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups, ))
+ norm_num_groups=resnet_groups,
+ )
+ )
resnets.append(
ResnetBlockFlat(
in_channels=in_channels,
@@ -1608,24 +1594,28 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs, ).sample
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
hidden_states = resnet(hidden_states, temb)
return hidden_states
@@ -1634,30 +1624,30 @@ def forward(
# Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DSimpleCrossAttn with UNetMidBlock2DSimpleCrossAttn->UNetMidBlockFlatSimpleCrossAttn, ResnetBlock2D->ResnetBlockFlat
class UNetMidBlockFlatSimpleCrossAttn(nn.Layer):
def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float=0.0,
- num_layers: int=1,
- resnet_eps: float=1e-6,
- resnet_time_scale_shift: str="default",
- resnet_act_fn: str="swish",
- resnet_groups: int=32,
- resnet_pre_norm: bool=True,
- attn_num_head_channels: int=1,
- output_scale_factor: float=1.0,
- cross_attention_dim: int=1280,
- skip_time_act=False,
- only_cross_attention=False,
- cross_attention_norm=None,
- resnet_pre_temb_non_linearity: bool=False, ):
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels: int = 1,
+ output_scale_factor: float = 1.0,
+ cross_attention_dim: int = 1280,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
+ resnet_pre_temb_non_linearity: bool = False,
+ ):
super().__init__()
self.has_cross_attention = True
self.attn_num_head_channels = attn_num_head_channels
- resnet_groups = (resnet_groups if resnet_groups is not None else
- min(in_channels // 4, 32))
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
self.num_heads = in_channels // self.attn_num_head_channels
@@ -1674,7 +1664,8 @@ def __init__(
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, )
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
]
attentions = []
@@ -1696,7 +1687,9 @@ def __init__(
upcast_softmax=True,
only_cross_attention=only_cross_attention,
cross_attention_norm=cross_attention_norm,
- processor=processor, ))
+ processor=processor,
+ )
+ )
resnets.append(
ResnetBlockFlat(
in_channels=in_channels,
@@ -1710,20 +1703,22 @@ def __init__(
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
skip_time_act=skip_time_act,
- pre_temb_non_linearity=resnet_pre_temb_non_linearity, ))
+ pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+ )
+ )
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None, ):
- cross_attention_kwargs = (cross_attention_kwargs if
- cross_attention_kwargs is not None else {})
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
# attn
@@ -1731,7 +1726,8 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
- **cross_attention_kwargs, )
+ **cross_attention_kwargs,
+ )
# resnet
hidden_states = resnet(hidden_states, temb)
diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
index c09df819c2b79..43a40201892a1 100644
--- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
@@ -18,21 +18,27 @@
import paddle
import PIL.Image
-from paddlenlp.transformers import (CLIPImageProcessor,
- CLIPTextModelWithProjection, CLIPTokenizer,
- CLIPVisionModelWithProjection)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging
from ..pipeline_utils import DiffusionPipeline
from .modeling_text_unet import UNetFlatConditionModel
-from .pipeline_versatile_diffusion_dual_guided import \
- VersatileDiffusionDualGuidedPipeline
-from .pipeline_versatile_diffusion_image_variation import \
- VersatileDiffusionImageVariationPipeline
-from .pipeline_versatile_diffusion_text_to_image import \
- VersatileDiffusionTextToImagePipeline
+from .pipeline_versatile_diffusion_dual_guided import (
+ VersatileDiffusionDualGuidedPipeline,
+)
+from .pipeline_versatile_diffusion_image_variation import (
+ VersatileDiffusionImageVariationPipeline,
+)
+from .pipeline_versatile_diffusion_text_to_image import (
+ VersatileDiffusionTextToImagePipeline,
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -77,15 +83,16 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
scheduler: KarrasDiffusionSchedulers
def __init__(
- self,
- tokenizer: CLIPTokenizer,
- image_feature_extractor: CLIPImageProcessor,
- text_encoder: CLIPTextModelWithProjection,
- image_encoder: CLIPVisionModelWithProjection,
- image_unet: UNet2DConditionModel,
- text_unet: UNet2DConditionModel,
- vae: AutoencoderKL,
- scheduler: KarrasDiffusionSchedulers, ):
+ self,
+ tokenizer: CLIPTokenizer,
+ image_feature_extractor: CLIPImageProcessor,
+ text_encoder: CLIPTextModelWithProjection,
+ image_encoder: CLIPVisionModelWithProjection,
+ image_unet: UNet2DConditionModel,
+ text_unet: UNet2DConditionModel,
+ vae: AutoencoderKL,
+ scheduler: KarrasDiffusionSchedulers,
+ ):
super().__init__()
self.register_modules(
@@ -96,27 +103,28 @@ def __init__(
image_unet=image_unet,
text_unet=text_unet,
vae=vae,
- scheduler=scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ scheduler=scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
@paddle.no_grad()
def image_variation(
- self,
- image: Union[paddle.Tensor, PIL.Image.Image],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ image: Union[paddle.Tensor, PIL.Image.Image],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -194,13 +202,8 @@ def image_variation(
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
- expected_components = inspect.signature(
- VersatileDiffusionImageVariationPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
return VersatileDiffusionImageVariationPipeline(**components)(
image=image,
height=height,
@@ -215,26 +218,27 @@ def image_variation(
output_type=output_type,
return_dict=return_dict,
callback=callback,
- callback_steps=callback_steps, )
+ callback_steps=callback_steps,
+ )
@paddle.no_grad()
def text_to_image(
- self,
- prompt: Union[str, List[str]],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -303,13 +307,8 @@ def text_to_image(
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
- expected_components = inspect.signature(
- VersatileDiffusionTextToImagePipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = VersatileDiffusionTextToImagePipeline(**components)
output = temp_pipeline(
prompt=prompt,
@@ -325,7 +324,8 @@ def text_to_image(
output_type=output_type,
return_dict=return_dict,
callback=callback,
- callback_steps=callback_steps, )
+ callback_steps=callback_steps,
+ )
# swap the attention blocks back to the original state
temp_pipeline._swap_unet_attention_blocks()
@@ -333,23 +333,23 @@ def text_to_image(
@paddle.no_grad()
def dual_guided(
- self,
- prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
- image: Union[str, List[str]],
- text_to_image_strength: float=0.5,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ):
+ self,
+ prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
+ image: Union[str, List[str]],
+ text_to_image_strength: float = 0.5,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -431,13 +431,8 @@ def dual_guided(
returning a tuple, the first element is a list with the generated images.
"""
- expected_components = inspect.signature(
- VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys()
- components = {
- name: component
- for name, component in self.components.items()
- if name in expected_components
- }
+ expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys()
+ components = {name: component for name, component in self.components.items() if name in expected_components}
temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components)
output = temp_pipeline(
prompt=prompt,
@@ -454,7 +449,8 @@ def dual_guided(
output_type=output_type,
return_dict=return_dict,
callback=callback,
- callback_steps=callback_steps, )
+ callback_steps=callback_steps,
+ )
temp_pipeline._revert_dual_attention()
return output
diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index a47088e2f9411..faf4c4f7232ed 100644
--- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -19,12 +19,19 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor,
- CLIPTextModelWithProjection, CLIPTokenizer,
- CLIPVisionModelWithProjection)
-
-from ...models import (AutoencoderKL, DualTransformer2DModel,
- Transformer2DModel, UNet2DConditionModel)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
+
+from ...models import (
+ AutoencoderKL,
+ DualTransformer2DModel,
+ Transformer2DModel,
+ UNet2DConditionModel,
+)
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging, randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -74,15 +81,16 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
_optional_components = ["text_unet"]
def __init__(
- self,
- tokenizer: CLIPTokenizer,
- image_feature_extractor: CLIPImageProcessor,
- text_encoder: CLIPTextModelWithProjection,
- image_encoder: CLIPVisionModelWithProjection,
- image_unet: UNet2DConditionModel,
- text_unet: UNetFlatConditionModel,
- vae: AutoencoderKL,
- scheduler: KarrasDiffusionSchedulers, ):
+ self,
+ tokenizer: CLIPTokenizer,
+ image_feature_extractor: CLIPImageProcessor,
+ text_encoder: CLIPTextModelWithProjection,
+ image_encoder: CLIPVisionModelWithProjection,
+ image_unet: UNet2DConditionModel,
+ text_unet: UNetFlatConditionModel,
+ vae: AutoencoderKL,
+ scheduler: KarrasDiffusionSchedulers,
+ ):
super().__init__()
self.register_modules(
tokenizer=tokenizer,
@@ -92,12 +100,13 @@ def __init__(
image_unet=image_unet,
text_unet=text_unet,
vae=vae,
- scheduler=scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ scheduler=scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
if self.text_unet is not None and (
- "dual_cross_attention" not in self.image_unet.config or
- not self.image_unet.config.dual_cross_attention):
+ "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention
+ ):
# if loading from a universal checkpoint rather than a saved dual-guided pipeline
self._convert_to_dual_attention()
@@ -114,10 +123,8 @@ def _convert_to_dual_attention(self):
parent_name, index = name.rsplit(".", 1)
index = int(index)
- image_transformer = self.image_unet.get_sublayer(parent_name)[
- index]
- text_transformer = self.text_unet.get_sublayer(parent_name)[
- index]
+ image_transformer = self.image_unet.get_sublayer(parent_name)[index]
+ text_transformer = self.text_unet.get_sublayer(parent_name)[index]
config = image_transformer.config
dual_transformer = DualTransformer2DModel(
@@ -132,12 +139,12 @@ def _convert_to_dual_attention(self):
sample_size=config.sample_size,
num_vector_embeds=config.num_vector_embeds,
activation_fn=config.activation_fn,
- num_embeds_ada_norm=config.num_embeds_ada_norm, )
+ num_embeds_ada_norm=config.num_embeds_ada_norm,
+ )
dual_transformer.transformers[0] = image_transformer
dual_transformer.transformers[1] = text_transformer
- self.image_unet.get_sublayer(parent_name)[
- index] = dual_transformer
+ self.image_unet.get_sublayer(parent_name)[index] = dual_transformer
self.image_unet.register_to_config(dual_cross_attention=True)
def _revert_dual_attention(self):
@@ -149,12 +156,10 @@ def _revert_dual_attention(self):
if isinstance(module, DualTransformer2DModel):
parent_name, index = name.rsplit(".", 1)
index = int(index)
- self.image_unet.get_sublayer(parent_name)[
- index] = module.transformers[0]
+ self.image_unet.get_sublayer(parent_name)[index] = module.transformers[0]
self.image_unet.register_to_config(dual_cross_attention=False)
- def _encode_text_prompt(self, prompt, num_images_per_prompt,
- do_classifier_free_guidance):
+ def _encode_text_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance):
r"""
Encodes the prompt into text encoder hidden states.
@@ -168,11 +173,9 @@ def _encode_text_prompt(self, prompt, num_images_per_prompt,
"""
def normalize_embeddings(encoder_output):
- embeds = paddle.matmul(encoder_output.last_hidden_state,
- self.text_encoder.text_projection)
+ embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection)
embeds_pooled = encoder_output.text_embeds
- embeds = embeds / paddle.norm(
- embeds_pooled.unsqueeze(1), axis=-1, keepdim=True)
+ embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True)
return embeds
batch_size = len(prompt)
@@ -182,35 +185,35 @@ def normalize_embeddings(encoder_output):
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids, untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = normalize_embeddings(prompt_embeds)
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
@@ -221,37 +224,33 @@ def normalize_embeddings(encoder_output):
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
- negative_prompt_embeds = normalize_embeddings(
- negative_prompt_embeds)
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
- def _encode_image_prompt(self, prompt, num_images_per_prompt,
- do_classifier_free_guidance):
+ def _encode_image_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance):
r"""
Encodes the prompt into vision encoder hidden states.
@@ -265,8 +264,7 @@ def _encode_image_prompt(self, prompt, num_images_per_prompt,
"""
def normalize_embeddings(encoder_output):
- embeds = self.image_encoder.vision_model.ln_post(
- encoder_output.last_hidden_state)
+ embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state)
embeds = paddle.matmul(embeds, self.image_encoder.vision_projection)
embeds_pooled = embeds[:, 0:1]
embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True)
@@ -275,8 +273,7 @@ def normalize_embeddings(encoder_output):
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
- image_input = self.image_feature_extractor(
- images=prompt, return_tensors="pd")
+ image_input = self.image_feature_extractor(images=prompt, return_tensors="pd")
pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype)
image_embeddings = self.image_encoder(pixel_values)
image_embeddings = normalize_embeddings(image_embeddings)
@@ -284,32 +281,25 @@ def normalize_embeddings(encoder_output):
# duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape
image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1])
- image_embeddings = image_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
- uncond_images = self.image_feature_extractor(
- images=uncond_images, return_tensors="pd")
- pixel_values = uncond_images.pixel_values.cast(
- self.image_encoder.dtype)
+ uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd")
+ pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype)
negative_prompt_embeds = self.image_encoder(pixel_values)
- negative_prompt_embeds = normalize_embeddings(
- negative_prompt_embeds)
+ negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and conditional embeddings into a single batch
# to avoid doing two forward passes
- image_embeddings = paddle.concat(
- [negative_prompt_embeds, image_embeddings])
+ image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings])
return image_embeddings
@@ -329,60 +319,51 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(self, prompt, image, height, width, callback_steps):
- if (not isinstance(prompt, str) and
- not isinstance(prompt, PIL.Image.Image) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}"
- )
- if (not isinstance(image, str) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
- raise ValueError(
- f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}"
- )
+ if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list):
+ raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}")
+ if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list):
+ raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}")
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = (
batch_size,
num_channels_latents,
height // self.vae_scale_factor,
- width // self.vae_scale_factor, )
+ width // self.vae_scale_factor,
+ )
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -396,44 +377,39 @@ def prepare_latents(
latents = latents * self.scheduler.init_noise_sigma
return latents
- def set_transformer_params(self,
- mix_ratio: float=0.5,
- condition_types: Tuple=("text", "image")):
+ def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")):
for name, module in self.image_unet.named_sublayers(include_self=True):
if isinstance(module, DualTransformer2DModel):
module.mix_ratio = mix_ratio
for i, type in enumerate(condition_types):
if type == "text":
- module.condition_lengths[
- i] = self.text_encoder.config.max_position_embeddings
- module.transformer_index_for_condition[
- i] = 1 # use the second (text) transformer
+ module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings
+ module.transformer_index_for_condition[i] = 1 # use the second (text) transformer
else:
module.condition_lengths[i] = 257
- module.transformer_index_for_condition[
- i] = 0 # use the first (image) transformer
+ module.transformer_index_for_condition[i] = 0 # use the first (image) transformer
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
- image: Union[str, List[str]],
- text_to_image_strength: float=0.5,
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
+ image: Union[str, List[str]],
+ text_to_image_strength: float = 0.5,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -532,12 +508,9 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompts
- prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt,
- do_classifier_free_guidance)
- image_embeddings = self._encode_image_prompt(
- image, num_images_per_prompt, do_classifier_free_guidance)
- dual_prompt_embeddings = paddle.concat(
- [prompt_embeds, image_embeddings], axis=1)
+ prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance)
+ image_embeddings = self._encode_image_prompt(image, num_images_per_prompt, do_classifier_free_guidance)
+ dual_prompt_embeddings = paddle.concat([prompt_embeds, image_embeddings], axis=1)
prompt_types = ("text", "image")
# 4. Prepare timesteps
@@ -553,7 +526,8 @@ def __call__(
width,
dual_prompt_embeddings.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -564,26 +538,19 @@ def __call__(
# 8. Denoising loop
for i, t in enumerate(self.progress_bar(timesteps)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.image_unet(
- latent_model_input,
- t,
- encoder_hidden_states=dual_prompt_embeddings).sample
+ noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -597,6 +564,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index 668f748dfa42a..fc9d645fc7991 100644
--- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -19,8 +19,7 @@
import numpy as np
import paddle
import PIL
-from paddlenlp.transformers import (CLIPImageProcessor,
- CLIPVisionModelWithProjection)
+from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -57,27 +56,30 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
scheduler: KarrasDiffusionSchedulers
def __init__(
- self,
- image_feature_extractor: CLIPImageProcessor,
- image_encoder: CLIPVisionModelWithProjection,
- image_unet: UNet2DConditionModel,
- vae: AutoencoderKL,
- scheduler: KarrasDiffusionSchedulers, ):
+ self,
+ image_feature_extractor: CLIPImageProcessor,
+ image_encoder: CLIPVisionModelWithProjection,
+ image_unet: UNet2DConditionModel,
+ vae: AutoencoderKL,
+ scheduler: KarrasDiffusionSchedulers,
+ ):
super().__init__()
self.register_modules(
image_feature_extractor=image_feature_extractor,
image_encoder=image_encoder,
image_unet=image_unet,
vae=vae,
- scheduler=scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ scheduler=scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
def _encode_image_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -94,8 +96,7 @@ def _encode_image_prompt(
"""
def normalize_embeddings(encoder_output):
- embeds = self.image_encoder.vision_model.ln_post(
- encoder_output.last_hidden_state)
+ embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state)
embeds = paddle.matmul(embeds, self.image_encoder.vision_projection)
embeds_pooled = embeds[:, 0:1]
embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True)
@@ -107,8 +108,7 @@ def normalize_embeddings(encoder_output):
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
- image_input = self.image_feature_extractor(
- images=prompt, return_tensors="pd")
+ image_input = self.image_feature_extractor(images=prompt, return_tensors="pd")
pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype)
image_embeddings = self.image_encoder(pixel_values)
image_embeddings = normalize_embeddings(image_embeddings)
@@ -116,8 +116,7 @@ def normalize_embeddings(encoder_output):
# duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape
image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1])
- image_embeddings = image_embeddings.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
@@ -127,37 +126,33 @@ def normalize_embeddings(encoder_output):
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, PIL.Image.Image):
uncond_images = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_images = negative_prompt
- uncond_images = self.image_feature_extractor(
- images=uncond_images, return_tensors="pd")
- pixel_values = uncond_images.pixel_values.cast(
- self.image_encoder.dtype)
+ uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd")
+ pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype)
negative_prompt_embeds = self.image_encoder(pixel_values)
- negative_prompt_embeds = normalize_embeddings(
- negative_prompt_embeds)
+ negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and conditional embeddings into a single batch
# to avoid doing two forward passes
- image_embeddings = paddle.concat(
- [negative_prompt_embeds, image_embeddings])
+ image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings])
return image_embeddings
@@ -177,50 +172,51 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
def check_inputs(self, image, height, width, callback_steps):
- if (not isinstance(image, paddle.Tensor) and
- not isinstance(image, PIL.Image.Image) and
- not isinstance(image, list)):
+ if (
+ not isinstance(image, paddle.Tensor)
+ and not isinstance(image, PIL.Image.Image)
+ and not isinstance(image, list)
+ ):
raise ValueError(
"`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
- f" {type(image)}")
+ f" {type(image)}"
+ )
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -242,23 +238,23 @@ def prepare_latents(
@paddle.no_grad()
def __call__(
- self,
- image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -352,8 +348,8 @@ def __call__(
# 3. Encode input prompt
image_embeddings = self._encode_image_prompt(
- image, num_images_per_prompt, do_classifier_free_guidance,
- negative_prompt)
+ image, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -368,7 +364,8 @@ def __call__(
width,
image_embeddings.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -376,25 +373,19 @@ def __call__(
# 7. Denoising loop
for i, t in enumerate(self.progress_bar(timesteps)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.image_unet(
- latent_model_input, t,
- encoder_hidden_states=image_embeddings).sample
+ noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -408,6 +399,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index 1524df9f993ed..0d4999c94b24c 100644
--- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -17,8 +17,11 @@
from typing import Callable, List, Optional, Union
import paddle
-from paddlenlp.transformers import (CLIPImageProcessor,
- CLIPTextModelWithProjection, CLIPTokenizer)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+)
from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -67,13 +70,14 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
_optional_components = ["text_unet"]
def __init__(
- self,
- tokenizer: CLIPTokenizer,
- text_encoder: CLIPTextModelWithProjection,
- image_unet: UNet2DConditionModel,
- text_unet: UNetFlatConditionModel,
- vae: AutoencoderKL,
- scheduler: KarrasDiffusionSchedulers, ):
+ self,
+ tokenizer: CLIPTokenizer,
+ text_encoder: CLIPTextModelWithProjection,
+ image_unet: UNet2DConditionModel,
+ text_unet: UNetFlatConditionModel,
+ vae: AutoencoderKL,
+ scheduler: KarrasDiffusionSchedulers,
+ ):
super().__init__()
self.register_modules(
tokenizer=tokenizer,
@@ -81,8 +85,9 @@ def __init__(
image_unet=image_unet,
text_unet=text_unet,
vae=vae,
- scheduler=scheduler, )
- self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
+ scheduler=scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
if self.text_unet is not None:
self._swap_unet_attention_blocks()
@@ -97,19 +102,22 @@ def _swap_unet_attention_blocks(self):
index = int(index)
(
self.image_unet.get_sublayer(parent_name)[index],
- self.text_unet.get_sublayer(parent_name)[index], ) = (
- self.text_unet.get_sublayer(parent_name)[index],
- self.image_unet.get_sublayer(parent_name)[index], )
+ self.text_unet.get_sublayer(parent_name)[index],
+ ) = (
+ self.text_unet.get_sublayer(parent_name)[index],
+ self.image_unet.get_sublayer(parent_name)[index],
+ )
def remove_unused_weights(self):
self.register_modules(text_unet=None)
def _encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt, ):
+ self,
+ prompt,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -126,11 +134,9 @@ def _encode_prompt(
"""
def normalize_embeddings(encoder_output):
- embeds = paddle.matmul(encoder_output.last_hidden_state,
- self.text_encoder.text_projection)
+ embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection)
embeds_pooled = encoder_output.text_embeds
- embeds = embeds / paddle.norm(
- embeds_pooled.unsqueeze(1), axis=-1, keepdim=True)
+ embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True)
return embeds
batch_size = len(prompt) if isinstance(prompt, list) else 1
@@ -140,35 +146,35 @@ def normalize_embeddings(encoder_output):
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pd").input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1] and not paddle.equal_all(text_input_ids, untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
- attention_mask=attention_mask, )
+ attention_mask=attention_mask,
+ )
prompt_embeds = normalize_embeddings(prompt_embeds)
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
@@ -178,14 +184,16 @@ def normalize_embeddings(encoder_output):
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}.")
+ f" {type(prompt)}."
+ )
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`.")
+ " the batch size of `prompt`."
+ )
else:
uncond_tokens = negative_prompt
@@ -195,32 +203,29 @@ def normalize_embeddings(encoder_output):
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
- if (hasattr(self.text_encoder.config, "use_attention_mask") and
- self.text_encoder.config.use_attention_mask):
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids,
- attention_mask=attention_mask, )
- negative_prompt_embeds = normalize_embeddings(
- negative_prompt_embeds)
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds)
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
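
For orientation, the tile/reshape pattern above duplicates every prompt embedding `num_images_per_prompt` times and then stacks the unconditional and conditional embeddings into a single batch, so the UNet runs once per denoising step instead of twice. A minimal NumPy sketch of that batching (shapes and names here are illustrative assumptions, not the pipeline's API):

import numpy as np

def batch_prompt_embeds(prompt_embeds, negative_prompt_embeds, num_images_per_prompt):
    # prompt_embeds: (batch, seq_len, dim); repeat each prompt for every requested image
    bs, seq_len, dim = prompt_embeds.shape
    cond = np.tile(prompt_embeds, (1, num_images_per_prompt, 1)).reshape(bs * num_images_per_prompt, seq_len, dim)
    uncond = np.tile(negative_prompt_embeds, (1, num_images_per_prompt, 1)).reshape(bs * num_images_per_prompt, seq_len, dim)
    # unconditional embeddings first, matching the chunk(2) order used later in the denoising loop
    return np.concatenate([uncond, cond], axis=0)

embeds = batch_prompt_embeds(np.zeros((2, 77, 768)), np.zeros((2, 77, 768)), num_images_per_prompt=3)
assert embeds.shape == (12, 77, 768)
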
@@ -240,54 +245,50 @@ def prepare_extra_step_kwargs(self, generator, eta):
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
- accepts_eta = "eta" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
- accepts_generator = "generator" in set(
- inspect.signature(self.scheduler.step).parameters.keys())
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None, ):
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
if height % 8 != 0 or width % 8 != 0:
- raise ValueError(
- f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
- )
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two.")
+ " only forward one of the two."
+ )
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
- elif prompt is not None and (not isinstance(prompt, str) and
- not isinstance(prompt, list)):
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
@@ -300,18 +301,20 @@ def check_inputs(
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}.")
+ f" {negative_prompt_embeds.shape}."
+ )
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- generator,
- latents=None, ):
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ generator,
+ latents=None,
+ ):
shape = [
batch_size,
num_channels_latents,
@@ -333,23 +336,23 @@ def prepare_latents(
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- height: Optional[int]=None,
- width: Optional[int]=None,
- num_inference_steps: int=50,
- guidance_scale: float=7.5,
- negative_prompt: Optional[Union[str, List[str]]]=None,
- num_images_per_prompt: Optional[int]=1,
- eta: float=0.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1,
- **kwargs, ):
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
r"""
Function invoked when calling the pipeline for generation.
@@ -434,9 +437,9 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompt
- prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt)
+ prompt_embeds = self._encode_prompt(
+ prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -451,7 +454,8 @@ def __call__(
width,
prompt_embeds.dtype,
generator,
- latents, )
+ latents,
+ )
# 6. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -459,25 +463,19 @@ def __call__(
# 7. Denoising loop
for i, t in enumerate(self.progress_bar(timesteps)):
# expand the latents if we are doing classifier free guidance
- latent_model_input = (paddle.concat([latents] * 2)
- if do_classifier_free_guidance else latents)
- latent_model_input = self.scheduler.scale_model_input(
- latent_model_input, t)
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.image_unet(
- latent_model_input, t,
- encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (
- noise_pred_text - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents,
- **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -491,6 +489,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
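
The denoising loop above performs classifier-free guidance by splitting the batched UNet output and combining the two halves as eps_uncond + guidance_scale * (eps_text - eps_uncond). A small self-contained sketch of just that arithmetic (NumPy stand-ins; the real loop feeds the UNet prediction into the scheduler as shown in the diff):

import numpy as np

def classifier_free_guidance(noise_pred, guidance_scale):
    # noise_pred stacks [unconditional, conditional] predictions along the batch axis
    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2, axis=0)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

pred = np.stack([np.full((4, 4), 0.1), np.full((4, 4), 0.5)])  # toy uncond / cond predictions
guided = classifier_free_guidance(pred, guidance_scale=7.5)
assert np.allclose(guided, 0.1 + 7.5 * (0.5 - 0.1))
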
diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py
index 4a1b00a7eb0fa..f7426c40427c0 100644
--- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py
+++ b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py
@@ -17,5 +17,7 @@
from ...utils import is_paddle_available, is_paddlenlp_available
if is_paddle_available() and is_paddlenlp_available():
- from .pipeline_vq_diffusion import (LearnedClassifierFreeSamplingEmbeddings,
- VQDiffusionPipeline)
+ from .pipeline_vq_diffusion import (
+ LearnedClassifierFreeSamplingEmbeddings,
+ VQDiffusionPipeline,
+ )
diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
index f8d1fc09518db..e97be223237f9 100644
--- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
@@ -42,23 +42,23 @@ class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- learnable: bool,
- hidden_size: Optional[int]=None,
- length: Optional[int]=None, ):
+ self,
+ learnable: bool,
+ hidden_size: Optional[int] = None,
+ length: Optional[int] = None,
+ ):
super().__init__()
self.learnable = learnable
if self.learnable:
- assert (hidden_size is not None
- ), "learnable=True requires `hidden_size` to be set"
+ assert hidden_size is not None, "learnable=True requires `hidden_size` to be set"
assert length is not None, "learnable=True requires `length` to be set"
embeddings = paddle.zeros([length, hidden_size])
self.embeddings = self.create_parameter(
- embeddings.shape,
- default_initializer=nn.initializer.Assign(embeddings))
+ embeddings.shape, default_initializer=nn.initializer.Assign(embeddings)
+ )
else:
self.embeddings = None
@@ -95,13 +95,13 @@ class VQDiffusionPipeline(DiffusionPipeline):
scheduler: VQDiffusionScheduler
def __init__(
- self,
- vqvae: VQModel,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- transformer: Transformer2DModel,
- scheduler: VQDiffusionScheduler,
- learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings,
+ self,
+ vqvae: VQModel,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ transformer: Transformer2DModel,
+ scheduler: VQDiffusionScheduler,
+ learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings,
):
super().__init__()
@@ -114,8 +114,7 @@ def __init__(
learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings,
)
- def _encode_prompt(self, prompt, num_images_per_prompt,
- do_classifier_free_guidance):
+ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance):
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
@@ -123,16 +122,17 @@ def _encode_prompt(self, prompt, num_images_per_prompt,
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_input_ids = text_inputs.input_ids
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
- removed_text = self.tokenizer.batch_decode(
- text_input_ids[:, self.tokenizer.model_max_length:])
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}")
- text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
prompt_embeds = self.text_encoder(text_input_ids)[0]
# NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
@@ -141,21 +141,17 @@ def _encode_prompt(self, prompt, num_images_per_prompt,
#
# CLIP normalizing the pooled output.
# https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
- prompt_embeds = prompt_embeds / prompt_embeds.norm(
- axis=-1, keepdim=True)
+ prompt_embeds = prompt_embeds / prompt_embeds.norm(axis=-1, keepdim=True)
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
- prompt_embeds = prompt_embeds.reshape(
- [bs_embed * num_images_per_prompt, seq_len, -1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
if self.learned_classifier_free_sampling_embeddings.learnable:
- negative_prompt_embeds = (
- self.learned_classifier_free_sampling_embeddings.embeddings)
- negative_prompt_embeds = negative_prompt_embeds.unsqueeze(
- 0).tile([batch_size, 1, 1])
+ negative_prompt_embeds = self.learned_classifier_free_sampling_embeddings.embeddings
+ negative_prompt_embeds = negative_prompt_embeds.unsqueeze(0).tile([batch_size, 1, 1])
else:
uncond_tokens = [""] * batch_size
@@ -165,45 +161,39 @@ def _encode_prompt(self, prompt, num_images_per_prompt,
padding="max_length",
max_length=max_length,
truncation=True,
- return_tensors="pd", )
- negative_prompt_embeds = self.text_encoder(
- uncond_input.input_ids)[0]
+ return_tensors="pd",
+ )
+ negative_prompt_embeds = self.text_encoder(uncond_input.input_ids)[0]
# See comment for normalizing text embeddings
- negative_prompt_embeds = (negative_prompt_embeds /
- negative_prompt_embeds.norm(
- axis=-1, keepdim=True))
+ negative_prompt_embeds = negative_prompt_embeds / negative_prompt_embeds.norm(axis=-1, keepdim=True)
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
- negative_prompt_embeds = negative_prompt_embeds.tile(
- [1, num_images_per_prompt, 1])
- negative_prompt_embeds = negative_prompt_embeds.reshape(
- [batch_size * num_images_per_prompt, seq_len, -1])
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- prompt_embeds = paddle.concat(
- [negative_prompt_embeds, prompt_embeds])
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
@paddle.no_grad()
def __call__(
- self,
- prompt: Union[str, List[str]],
- num_inference_steps: int=100,
- guidance_scale: float=5.0,
- truncation_rate: float=1.0,
- num_images_per_prompt: int=1,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- latents: Optional[paddle.Tensor]=None,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None,
- callback_steps: Optional[int]=1, ) -> Union[ImagePipelineOutput,
- Tuple]:
+ self,
+ prompt: Union[str, List[str]],
+ num_inference_steps: int = 100,
+ guidance_scale: float = 5.0,
+ truncation_rate: float = 1.0,
+ num_images_per_prompt: int = 1,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ) -> Union[ImagePipelineOutput, Tuple]:
"""
Function invoked when calling the pipeline for generation.
@@ -252,23 +242,21 @@ def __call__(
elif isinstance(prompt, list):
batch_size = len(prompt)
else:
- raise ValueError(
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
- )
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
batch_size = batch_size * num_images_per_prompt
do_classifier_free_guidance = guidance_scale > 1.0
- prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt,
- do_classifier_free_guidance)
+ prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance)
if (callback_steps is None) or (
- callback_steps is not None and
- (not isinstance(callback_steps, int) or callback_steps <= 0)):
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}.")
+ f" {type(callback_steps)}."
+ )
# get the initial completely masked latents unless the user supplied it
@@ -278,14 +266,12 @@ def __call__(
latents = paddle.full(latents_shape, mask_class, dtype="int64")
else:
if latents.shape != latents_shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
- )
- if (latents < 0).any() or (
- latents >= self.transformer.num_vector_embeds).any():
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+ if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any():
                raise ValueError(
                    "Unexpected latents value(s). All latents must be valid embedding indices, i.e. in the range 0,"
- f" {self.transformer.num_vector_embeds - 1} (inclusive).")
+ f" {self.transformer.num_vector_embeds - 1} (inclusive)."
+ )
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -296,20 +282,15 @@ def __call__(
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
# expand the sample if we are doing classifier free guidance
- latent_model_input = (paddle.concat([sample] * 2)
- if do_classifier_free_guidance else sample)
+ latent_model_input = paddle.concat([sample] * 2) if do_classifier_free_guidance else sample
# predict the un-noised image
# model_output == `log_p_x_0`
- model_output = self.transformer(
- latent_model_input,
- encoder_hidden_states=prompt_embeds,
- timestep=t).sample
+ model_output = self.transformer(latent_model_input, encoder_hidden_states=prompt_embeds, timestep=t).sample
if do_classifier_free_guidance:
model_output_uncond, model_output_text = model_output.chunk(2)
- model_output = model_output_uncond + guidance_scale * (
- model_output_text - model_output_uncond)
+ model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond)
model_output -= logsumexp(model_output, axis=1, keepdim=True)
model_output = self.truncate(model_output, truncation_rate)
@@ -318,9 +299,7 @@ def __call__(
model_output = model_output.clip(-70)
# compute the previous noisy sample x_t -> x_t-1
- sample = self.scheduler.step(
- model_output, timestep=t, sample=sample,
- generator=generator).prev_sample
+ sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample
# call the callback, if provided
if callback is not None and i % callback_steps == 0:
@@ -331,9 +310,9 @@ def __call__(
batch_size,
self.transformer.height,
self.transformer.width,
- embedding_channels, )
- embeddings = self.vqvae.quantize.get_codebook_entry(
- sample, shape=embeddings_shape)
+ embedding_channels,
+ )
+ embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape)
image = self.vqvae.decode(embeddings, force_not_quantize=True).sample
image = (image / 2 + 0.5).clip(0, 1)
@@ -343,34 +322,29 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, )
+ return (image,)
return ImagePipelineOutput(images=image)
- def truncate(self, log_p_x_0: paddle.Tensor,
- truncation_rate: float) -> paddle.Tensor:
+ def truncate(self, log_p_x_0: paddle.Tensor, truncation_rate: float) -> paddle.Tensor:
"""
        Truncates log_p_x_0 such that for each column vector, the total cumulative probability is `truncation_rate`. The
lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to zero.
"""
- sorted_log_p_x_0, indices = paddle.topk(
- log_p_x_0, k=log_p_x_0.shape[1], axis=1)
+ sorted_log_p_x_0, indices = paddle.topk(log_p_x_0, k=log_p_x_0.shape[1], axis=1)
sorted_p_x_0 = paddle.exp(sorted_log_p_x_0)
- keep_mask = (
- sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64")
+ keep_mask = (sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64")
# Ensure that at least the largest probability is not zeroed out
all_true = paddle.full_like(keep_mask[:, 0:1, :], 1)
keep_mask = paddle.concat((all_true, keep_mask), axis=1)
keep_mask = keep_mask[:, :-1, :]
- keep_mask = paddle.take_along_axis(
- keep_mask, indices.argsort(1),
- axis=1).cast("bool") # keep_mask.gather(indices.argsort(1), axis=1)
+ keep_mask = paddle.take_along_axis(keep_mask, indices.argsort(1), axis=1).cast(
+ "bool"
+ ) # keep_mask.gather(indices.argsort(1), axis=1)
rv = log_p_x_0.clone()
# rv[~keep_mask] = -INF # -inf = log(0)
- rv = paddle.where(
- keep_mask, rv, paddle.to_tensor(
- -INF, dtype="float32"))
+ rv = paddle.where(keep_mask, rv, paddle.to_tensor(-INF, dtype="float32"))
return rv
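
The `truncate` method above is a top-p style cutoff in log-probability space: probabilities are sorted, classes whose cumulative mass has already reached `truncation_rate` are set to log(0), and the single most probable class is always kept. A rough NumPy sketch of the same idea for one column vector (illustrative only; the pipeline operates on batched tensors via `paddle.topk` and `take_along_axis`):

import numpy as np

def truncate_log_probs(log_p, truncation_rate):
    # log_p: (num_classes,) log-probabilities for a single latent position
    order = np.argsort(-log_p)                               # most to least probable
    sorted_p = np.exp(log_p[order])
    cum_before = np.concatenate(([0.0], np.cumsum(sorted_p)[:-1]))
    keep = cum_before < truncation_rate                      # keep while the mass already kept is below the rate
    mask = np.zeros_like(keep)
    mask[order] = keep                                       # scatter keep decisions back to original order
    return np.where(mask, log_p, -np.inf)

out = truncate_log_probs(np.log(np.array([0.6, 0.3, 0.1])), truncation_rate=0.8)
assert np.isfinite(out[1]) and np.isinf(out[2])              # 0.6 and 0.3 survive, 0.1 is zeroed out
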
diff --git a/ppdiffusers/ppdiffusers/schedulers/__init__.py b/ppdiffusers/ppdiffusers/schedulers/__init__.py
index dd064c0187497..682e58fcc57df 100644
--- a/ppdiffusers/ppdiffusers/schedulers/__init__.py
+++ b/ppdiffusers/ppdiffusers/schedulers/__init__.py
@@ -13,8 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from ..utils import (OptionalDependencyNotAvailable, is_paddle_available,
- is_scipy_available)
+from ..utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_scipy_available,
+)
try:
if not is_paddle_available():
@@ -22,8 +25,9 @@
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_objects import * # noqa F403
else:
- from .preconfig.preconfig_scheduling_euler_ancestral_discrete import \
- PreconfigEulerAncestralDiscreteScheduler
+ from .preconfig.preconfig_scheduling_euler_ancestral_discrete import (
+ PreconfigEulerAncestralDiscreteScheduler,
+ )
from .scheduling_ddim import DDIMScheduler
from .scheduling_ddim_inverse import DDIMInverseScheduler
from .scheduling_ddpm import DDPMScheduler
@@ -31,13 +35,11 @@
from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
from .scheduling_dpmsolver_unidiffuser import DPMSolverUniDiffuserScheduler
- from .scheduling_euler_ancestral_discrete import \
- EulerAncestralDiscreteScheduler
+ from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
from .scheduling_euler_discrete import EulerDiscreteScheduler
from .scheduling_heun_discrete import HeunDiscreteScheduler
from .scheduling_ipndm import IPNDMScheduler
- from .scheduling_k_dpm_2_ancestral_discrete import \
- KDPM2AncestralDiscreteScheduler
+ from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler
from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler
from .scheduling_karras_ve import KarrasVeScheduler
from .scheduling_pndm import PNDMScheduler
@@ -55,6 +57,7 @@
except OptionalDependencyNotAvailable:
from ..utils.dummy_paddle_and_scipy_objects import * # noqa F403
else:
- from .preconfig.preconfig_scheduling_lms_discrete import \
- PreconfigLMSDiscreteScheduler
+ from .preconfig.preconfig_scheduling_lms_discrete import (
+ PreconfigLMSDiscreteScheduler,
+ )
from .scheduling_lms_discrete import LMSDiscreteScheduler
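
The guarded imports above follow the repository's optional-dependency pattern: when paddle or scipy is unavailable, dummy objects are imported instead, so the failure only surfaces if a scheduler that needs the dependency is actually used. A generic sketch of that guard, with stand-in names purely for illustration:

try:
    from scipy import integrate
except ImportError:
    # minimal stand-in that fails loudly only when the dependency is actually used
    class integrate:
        @staticmethod
        def quad(*args, **kwargs):
            raise ImportError("scipy is required to compute LMS coefficients")
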
diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py
index 0af0ad582bd99..ecff93753b32d 100644
--- a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py
+++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py
@@ -14,8 +14,11 @@
# limitations under the License.
# flake8: noqa
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available,
- is_scipy_available)
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_scipy_available,
+)
try:
if not is_paddle_available():
@@ -23,13 +26,13 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_objects import * # noqa F403
else:
- from .preconfig_scheduling_euler_ancestral_discrete import \
- PreconfigEulerAncestralDiscreteScheduler
+ from .preconfig_scheduling_euler_ancestral_discrete import (
+ PreconfigEulerAncestralDiscreteScheduler,
+ )
try:
if not (is_paddle_available() and is_scipy_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_paddle_and_scipy_objects import * # noqa F403
else:
- from .preconfig_scheduling_lms_discrete import \
- PreconfigLMSDiscreteScheduler
+ from .preconfig_scheduling_lms_discrete import PreconfigLMSDiscreteScheduler
diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py
index 53de9a57c4178..a925526d76b33 100644
--- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py
@@ -47,8 +47,7 @@ class PreconfigEulerAncestralDiscreteSchedulerOutput(BaseOutput):
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -108,38 +107,40 @@ class PreconfigEulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon",
- preconfig: bool=True, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ preconfig: bool = True,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -148,18 +149,15 @@ def __init__(
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
self.is_scale_input_called = False
self.preconfig = preconfig
self.step_index_offset = 0
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(
+ self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs
+ ) -> paddle.Tensor:
"""
Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
@@ -178,7 +176,7 @@ def scale_model_input(self,
if not self.preconfig:
sigma = self.sigmas[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
return sample
else:
if step_index > (len(self.latent_scales) - 1):
@@ -196,13 +194,8 @@ def set_timesteps(self, num_inference_steps: int):
self.num_inference_steps = num_inference_steps
self.step_index_offset = 0
- timesteps = np.linspace(
- 0,
- self.config.num_train_timesteps - 1,
- num_inference_steps,
- dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -213,23 +206,21 @@ def set_timesteps(self, num_inference_steps: int):
for step_index_i in range(len(self.timesteps)):
sigma_from = self.sigmas[step_index_i]
sigma_to = self.sigmas[step_index_i + 1]
- sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) /
- sigma_from**2)**0.5
- sigma_down = (sigma_to**2 - sigma_up**2)**0.5
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
self.sigma_up.append(sigma_up)
self.sigma_down.append(sigma_down)
- self.latent_scales = 1 / ((self.sigmas**2 + 1)**0.5)
+ self.latent_scales = 1 / ((self.sigmas**2 + 1) ** 0.5)
def step(
- self,
- model_output: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- sample: paddle.Tensor,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True,
- **kwargs, ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput,
- Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -251,7 +242,8 @@ def step(
if not self.is_scale_input_called:
logger.warning(
"The `scale_model_input` function should be called before `step` to ensure correct denoising. "
- "See `StableDiffusionPipeline` for a usage example.")
+ "See `StableDiffusionPipeline` for a usage example."
+ )
if kwargs.get("return_pred_original_sample") is not None:
return_pred_original_sample = kwargs["return_pred_original_sample"]
else:
@@ -270,11 +262,9 @@ def step(
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
- pred_original_sample = model_output * (-sigma / (
- sigma**2 + 1)**0.5) + (sample / (sigma**2 + 1))
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
elif self.config.prediction_type == "sample":
- raise NotImplementedError(
- "prediction_type not implemented yet: sample")
+ raise NotImplementedError("prediction_type not implemented yet: sample")
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
@@ -283,38 +273,37 @@ def step(
if not self.preconfig:
sigma_from = self.sigmas[step_index]
sigma_to = self.sigmas[step_index + 1]
- sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from
- **2)**0.5
- sigma_down = (sigma_to**2 - sigma_up**2)**0.5
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
else:
sigma_up = self.sigma_up[step_index]
sigma_down = self.sigma_down[step_index]
# 2. Convert to an ODE derivative
dt = sigma_down - sigma
prev_sample = sample + derivative * dt
- noise = randn_tensor(
- model_output.shape, dtype=model_output.dtype, generator=generator)
+ noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator)
prev_sample = prev_sample + noise * sigma_up
if not return_dict:
if not return_pred_original_sample:
- return (prev_sample, )
+ return (prev_sample,)
else:
return (prev_sample, pred_original_sample)
return PreconfigEulerAncestralDiscreteSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
+ )
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
self.sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [(schedule_timesteps == t).nonzero().item()
- for t in timesteps]
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
sigma = self.sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
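
The ancestral update above splits each sigma transition into a deterministic part (sigma_down) and an injected-noise part (sigma_up), with sigma_up**2 + sigma_down**2 == sigma_to**2, takes an Euler step to sigma_down, and then adds fresh noise scaled by sigma_up. A compact NumPy sketch of one such step for the epsilon prediction type (toy inputs; with `preconfig=True` the scheduler precomputes these sigmas in `set_timesteps`):

import numpy as np

def euler_ancestral_step(sample, model_output, sigma_from, sigma_to, rng):
    # split the transition: sigma_down is the deterministic target, sigma_up the injected noise level
    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
    pred_original_sample = sample - sigma_from * model_output   # "epsilon" prediction type
    derivative = (sample - pred_original_sample) / sigma_from   # ODE derivative
    prev_sample = sample + derivative * (sigma_down - sigma_from)
    return prev_sample + rng.standard_normal(sample.shape) * sigma_up

rng = np.random.default_rng(0)
x = rng.standard_normal((1, 4, 8, 8))
x_prev = euler_ancestral_step(x, rng.standard_normal(x.shape), sigma_from=14.6, sigma_to=9.7, rng=rng)
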
diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py
index dd6c73e2e7250..16f74fcb6860f 100644
--- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py
@@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -106,38 +106,40 @@ class PreconfigLMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon",
- preconfig=True, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ preconfig=True,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -146,18 +148,15 @@ def __init__(
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
self.derivatives = []
self.is_scale_input_called = False
self.preconfig = preconfig
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(
+ self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs
+ ) -> paddle.Tensor:
"""
Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm.
@@ -175,7 +174,7 @@ def scale_model_input(self,
self.is_scale_input_called = True
if not self.preconfig:
sigma = self.sigmas[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
return sample
else:
return sample * self.latent_scales[step_index]
@@ -195,16 +194,14 @@ def lms_derivative(tau):
for k in range(order):
if current_order == k:
continue
- prod *= (tau - self.sigmas[t - k]) / (
- self.sigmas[t - current_order] - self.sigmas[t - k])
+ prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
return prod
- integrated_coeff = integrate.quad(
- lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+ integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
return integrated_coeff
- def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4):
+ def set_timesteps(self, num_inference_steps: int, preconfig_order: int = 4):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -214,13 +211,8 @@ def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4):
"""
self.num_inference_steps = num_inference_steps
- timesteps = np.linspace(
- 0,
- self.config.num_train_timesteps - 1,
- num_inference_steps,
- dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -230,24 +222,22 @@ def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4):
if self.preconfig:
self.order = preconfig_order
self.lms_coeffs = []
- self.latent_scales = [
- 1.0 / ((sigma**2 + 1)**0.5) for sigma in self.sigmas
- ]
+ self.latent_scales = [1.0 / ((sigma**2 + 1) ** 0.5) for sigma in self.sigmas]
for step_index in range(self.num_inference_steps):
order = min(step_index + 1, preconfig_order)
- self.lms_coeffs.append([
- self.get_lms_coefficient(order, step_index, curr_order)
- for curr_order in range(order)
- ])
+ self.lms_coeffs.append(
+ [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)]
+ )
def step(
- self,
- model_output: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- sample: paddle.Tensor,
- order: int=4,
- return_dict: bool=True,
- **kwargs, ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ order: int = 4,
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -272,7 +262,8 @@ def step(
if not self.is_scale_input_called:
warnings.warn(
"The `scale_model_input` function should be called before `step` to ensure correct denoising. "
- "See `StableDiffusionPipeline` for a usage example.")
+ "See `StableDiffusionPipeline` for a usage example."
+ )
if kwargs.get("return_pred_original_sample") is not None:
return_pred_original_sample = kwargs["return_pred_original_sample"]
else:
@@ -292,8 +283,7 @@ def step(
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
- pred_original_sample = model_output * (-sigma / (
- sigma**2 + 1)**0.5) + (sample / (sigma**2 + 1))
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
else:
@@ -310,42 +300,37 @@ def step(
if not self.preconfig:
            # 3. If not preconfigured, compute the linear multistep coefficients.
order = min(step_index + 1, order)
- lms_coeffs = [
- self.get_lms_coefficient(order, step_index, curr_order)
- for curr_order in range(order)
- ]
+ lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)]
# 4. Compute previous sample based on the derivatives path
prev_sample = sample + sum(
- coeff * derivative
- for coeff, derivative in zip(lms_coeffs,
- reversed(self.derivatives)))
+ coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives))
+ )
else:
            # 3. If preconfigured, directly compute the previous sample based on the derivatives path
prev_sample = sample + sum(
coeff * derivative
- for coeff, derivative in zip(self.lms_coeffs[step_index],
- reversed(self.derivatives)))
+ for coeff, derivative in zip(self.lms_coeffs[step_index], reversed(self.derivatives))
+ )
if not return_dict:
if not return_pred_original_sample:
- return (prev_sample, )
+ return (prev_sample,)
else:
return (prev_sample, pred_original_sample)
- return PreconfigLMSDiscreteSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ return PreconfigLMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [(schedule_timesteps == t).nonzero().item()
- for t in timesteps]
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
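
The multistep update above weights the stored derivatives with coefficients obtained by integrating Lagrange basis polynomials over each sigma interval; with `preconfig=True` those integrals are computed once in `set_timesteps`. A standalone sketch of computing the coefficients and applying one update, assuming scipy is available (toy sigmas, illustrative only):

import numpy as np
from scipy import integrate

def lms_coefficient(sigmas, order, t, current_order):
    # integral over [sigma_t, sigma_{t+1}] of the Lagrange basis polynomial for `current_order`
    def lms_derivative(tau):
        prod = 1.0
        for k in range(order):
            if current_order == k:
                continue
            prod *= (tau - sigmas[t - k]) / (sigmas[t - current_order] - sigmas[t - k])
        return prod
    return integrate.quad(lms_derivative, sigmas[t], sigmas[t + 1], epsrel=1e-4)[0]

sigmas = np.array([14.6, 9.7, 6.2, 3.8, 0.0])
t, order = 2, 3                                               # third step, using the last three derivatives
coeffs = [lms_coefficient(sigmas, order, t, k) for k in range(order)]
derivatives = [np.ones((4, 4)) * d for d in (0.3, 0.2, 0.1)]  # oldest first, newest last
sample = np.zeros((4, 4))
prev_sample = sample + sum(c * d for c, d in zip(coeffs, reversed(derivatives)))
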
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py
index b4929d761f687..9bb46c472ca10 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py
@@ -48,8 +48,7 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -68,7 +67,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -131,38 +130,41 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- clip_sample: bool=True,
- set_alpha_to_one: bool=True,
- steps_offset: int=0,
- prediction_type: str="epsilon",
- thresholding: bool=False,
- dynamic_thresholding_ratio: float=0.995,
- clip_sample_range: float=1.0,
- sample_max_value: float=1.0, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ clip_sample: bool = True,
+ set_alpha_to_one: bool = True,
+ steps_offset: int = 0,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ clip_sample_range: float = 1.0,
+ sample_max_value: float = 1.0,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype="float32")
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype="float32")
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32")
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype="float32", )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype="float32",
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -171,20 +173,16 @@ def __init__(
# For the final step, there is no previous alphas_cumprod because we are already at 0
# `set_alpha_to_one` decides whether we set this parameter simply to one or
# whether we use the final alpha of the "non-previous" one.
- self.final_alpha_cumprod = (paddle.to_tensor(1.0) if set_alpha_to_one
- else self.alphas_cumprod[0])
+ self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0
# setable values
self.num_inference_steps = None
- self.timesteps = paddle.to_tensor(
- np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+ self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -200,13 +198,11 @@ def scale_model_input(self,
def _get_variance(self, timestep, prev_timestep):
alpha_prod_t = self.alphas_cumprod[timestep]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else self.final_alpha_cumprod)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
- variance = (beta_prod_t_prev / beta_prod_t) * (
- 1 - alpha_prod_t / alpha_prod_t_prev)
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
return variance
@@ -232,8 +228,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
- s = paddle.quantile(
- abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
+ s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
        # paddle.clip does not support min > max
if self.config.sample_max_value < 1:
s = paddle.ones_like(s) * self.config.sample_max_value
@@ -242,11 +237,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
s, min=1, max=self.config.sample_max_value
) # When clip to min=1, equivalent to standard clipping to [-1, 1]
- s = s.unsqueeze(
- 1) # (batch_size, 1) because clip will broadcast along axis=0
- sample = (
- paddle.clip(sample, -s, s) /
- s) # "we threshold xt0 to the range [-s, s] and then divide by s"
+ s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0
+ sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = paddle.reshape(sample, [batch_size, channels, height, width])
sample = paddle.cast(sample, dtype)
@@ -266,27 +258,28 @@ def set_timesteps(self, num_inference_steps: int):
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.num_train_timesteps`:"
f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
- f" maximal {self.config.num_train_timesteps} timesteps.")
+ f" maximal {self.config.num_train_timesteps} timesteps."
+ )
self.num_inference_steps = num_inference_steps
step_ratio = self.config.num_train_timesteps // self.num_inference_steps
# creates integer timesteps by multiplying by ratio
# casting to int to avoid issues when num_inference_step is power of 3
- timesteps = ((np.arange(0, num_inference_steps) * step_ratio)
- .round()[::-1].copy().astype(np.int64))
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
self.timesteps = paddle.to_tensor(timesteps)
self.timesteps += self.config.steps_offset
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- eta: float=0.0,
- use_clipped_model_output: bool=False,
- generator=None,
- variance_noise: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ) -> Union[DDIMSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ eta: float = 0.0,
+ use_clipped_model_output: bool = False,
+ generator=None,
+ variance_noise: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[DDIMSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -330,118 +323,104 @@ def step(
# - pred_prev_sample -> "x_t-1"
# 1. get previous step value (=t-1)
- prev_timestep = (timestep - self.config.num_train_timesteps //
- self.num_inference_steps)
+ prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
# 2. compute alphas, betas
alpha_prod_t = self.alphas_cumprod[timestep]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else self.final_alpha_cumprod)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
# 3. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
if self.config.prediction_type == "epsilon":
- pred_original_sample = (sample - beta_prod_t**
- (0.5) * model_output) / alpha_prod_t**(0.5)
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
pred_epsilon = model_output
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
- pred_epsilon = (sample - alpha_prod_t**
- (0.5) * pred_original_sample) / beta_prod_t**(0.5)
+ pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
elif self.config.prediction_type == "v_prediction":
- pred_original_sample = (alpha_prod_t**0.5) * sample - (
- beta_prod_t**0.5) * model_output
- pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**
- 0.5) * sample
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+ pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction`")
+ " `v_prediction`"
+ )
# 4. Clip or threshold "predicted x_0"
if self.config.thresholding:
pred_original_sample = self._threshold_sample(pred_original_sample)
elif self.config.clip_sample:
pred_original_sample = pred_original_sample.clip(
- -self.config.clip_sample_range, self.config.clip_sample_range)
+ -self.config.clip_sample_range, self.config.clip_sample_range
+ )
# 5. compute variance: "sigma_t(η)" -> see formula (16)
# σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
variance = self._get_variance(timestep, prev_timestep)
- std_dev_t = eta * variance**(0.5)
+ std_dev_t = eta * variance ** (0.5)
if use_clipped_model_output:
# the pred_epsilon is always re-derived from the clipped x_0 in Glide
- pred_epsilon = (sample - alpha_prod_t**
- (0.5) * pred_original_sample) / beta_prod_t**(0.5)
+ pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**(
- 0.5) * pred_epsilon
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
# 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- prev_sample = (alpha_prod_t_prev**
- (0.5) * pred_original_sample + pred_sample_direction)
+ prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
if eta > 0:
if variance_noise is not None and generator is not None:
raise ValueError(
"Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
- " `variance_noise` stays `None`.")
+ " `variance_noise` stays `None`."
+ )
if variance_noise is None:
- variance_noise = randn_tensor(
- model_output.shape,
- generator=generator,
- dtype=model_output.dtype)
+ variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype)
variance = std_dev_t * variance_noise
prev_sample = prev_sample + variance
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
- return DDIMSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(dtype=original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
- def get_velocity(self,
- sample: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor) -> paddle.Tensor:
+ def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as sample
alphas_cumprod = self.alphas_cumprod.cast(dtype=sample.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(sample.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
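Note: the scheduling_ddim.py hunks above are formatting-only; the update they implement is formula (12) of the DDIM paper (arXiv:2010.02502). A minimal NumPy sketch of that update, assuming prediction_type="epsilon" and no clipping or thresholding; the function name and toy inputs are illustrative, not part of the repository:

import numpy as np

def ddim_step(model_output, sample, alpha_prod_t, alpha_prod_t_prev, eta=0.0, noise=None):
    """One DDIM update x_t -> x_{t-1} for the epsilon prediction type."""
    beta_prod_t = 1.0 - alpha_prod_t
    beta_prod_t_prev = 1.0 - alpha_prod_t_prev
    # "predicted x_0" of formula (12)
    pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
    # sigma_t(eta) of formula (16)
    variance = (beta_prod_t_prev / beta_prod_t) * (1.0 - alpha_prod_t / alpha_prod_t_prev)
    std_dev_t = eta * variance**0.5
    # "direction pointing to x_t"
    pred_sample_direction = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output
    prev_sample = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction
    if eta > 0:
        noise = noise if noise is not None else np.random.randn(*np.shape(sample))
        prev_sample = prev_sample + std_dev_t * noise
    return prev_sample

With eta=0 the update is deterministic, which is what makes the CycleDiffusion-style inversion in the app above possible.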
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py
index a64c94d782e46..8dfd896087d08 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py
@@ -47,8 +47,7 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -119,45 +118,46 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- clip_sample: bool=True,
- set_alpha_to_zero: bool=True,
- steps_offset: int=0,
- prediction_type: str="epsilon",
- clip_sample_range: float=1.0,
- **kwargs, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ clip_sample: bool = True,
+ set_alpha_to_zero: bool = True,
+ steps_offset: int = 0,
+ prediction_type: str = "epsilon",
+ clip_sample_range: float = 1.0,
+ **kwargs,
+ ):
if kwargs.get("set_alpha_to_one", None) is not None:
- deprecation_message = "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead."
- deprecate(
- "set_alpha_to_one",
- "1.0.0",
- deprecation_message,
- standard_warn=False)
+ deprecation_message = (
+ "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead."
+ )
+ deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False)
set_alpha_to_zero = kwargs["set_alpha_to_one"]
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype="float32")
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype="float32")
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32")
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype="float32", )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype="float32",
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -167,20 +167,16 @@ def __init__(
# `set_alpha_to_zero` decides whether we set this parameter simply to zero
# in this case, self.step() just output the predicted noise
# or whether we use the final alpha of the "non-previous" one.
- self.final_alpha_cumprod = (paddle.to_tensor(0.0) if set_alpha_to_zero
- else self.alphas_cumprod[-1])
+ self.final_alpha_cumprod = paddle.to_tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1]
# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0
# setable values
self.num_inference_steps = None
- self.timesteps = paddle.to_tensor(
- np.arange(0, num_train_timesteps).copy().astype(np.int64))
+ self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps).copy().astype(np.int64))
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -207,75 +203,73 @@ def set_timesteps(self, num_inference_steps: int):
raise ValueError(
f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
- f" maximal {self.config.num_train_timesteps} timesteps.")
+ f" maximal {self.config.num_train_timesteps} timesteps."
+ )
self.num_inference_steps = num_inference_steps
step_ratio = self.config.num_train_timesteps // self.num_inference_steps
# creates integer timesteps by multiplying by ratio
# casting to int to avoid issues when num_inference_step is power of 3
- timesteps = ((np.arange(0, num_inference_steps) * step_ratio).round()
- .copy().astype(np.int64))
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64)
self.timesteps = paddle.to_tensor(timesteps)
self.timesteps += self.config.steps_offset
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- eta: float=0.0,
- use_clipped_model_output: bool=False,
- variance_noise: Optional[paddle.Tensor]=None,
- return_dict: bool=True, ) -> Union[DDIMSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ eta: float = 0.0,
+ use_clipped_model_output: bool = False,
+ variance_noise: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[DDIMSchedulerOutput, Tuple]:
# 1. get previous step value (=t+1)
- prev_timestep = (timestep + self.config.num_train_timesteps //
- self.num_inference_steps)
+ prev_timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps
# 2. compute alphas, betas
# change original implementation to exactly match noise levels for analogous forward process
alpha_prod_t = self.alphas_cumprod[timestep]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep]
- if prev_timestep < self.config.num_train_timesteps
- else self.final_alpha_cumprod)
+ alpha_prod_t_prev = (
+ self.alphas_cumprod[prev_timestep]
+ if prev_timestep < self.config.num_train_timesteps
+ else self.final_alpha_cumprod
+ )
beta_prod_t = 1 - alpha_prod_t
# 3. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
if self.config.prediction_type == "epsilon":
- pred_original_sample = (sample - beta_prod_t**
- (0.5) * model_output) / alpha_prod_t**(0.5)
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
pred_epsilon = model_output
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
- pred_epsilon = (sample - alpha_prod_t**
- (0.5) * pred_original_sample) / beta_prod_t**(0.5)
+ pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
elif self.config.prediction_type == "v_prediction":
- pred_original_sample = (alpha_prod_t**0.5) * sample - (
- beta_prod_t**0.5) * model_output
- pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**
- 0.5) * sample
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+ pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction`")
+ " `v_prediction`"
+ )
# 4. Clip or threshold "predicted x_0"
if self.config.clip_sample:
pred_original_sample = pred_original_sample.clip(
- -self.config.clip_sample_range, self.config.clip_sample_range)
+ -self.config.clip_sample_range, self.config.clip_sample_range
+ )
# 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- pred_sample_direction = (1 - alpha_prod_t_prev)**(0.5) * pred_epsilon
+ pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon
# 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- prev_sample = (alpha_prod_t_prev**
- (0.5) * pred_original_sample + pred_sample_direction)
+ prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
if not return_dict:
return (prev_sample, pred_original_sample)
- return DDIMSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
def __len__(self):
return self.config.num_train_timesteps
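Note: in scheduling_ddim_inverse.py the substantive difference from the forward scheduler is that step() walks up the chain (prev_timestep = timestep + num_train_timesteps // num_inference_steps) and drops the noise term, so a clean latent can be deterministically pushed toward x_T. A scalar sketch under the same epsilon-prediction assumption; names are illustrative:

def ddim_inverse_step(model_output, sample, alpha_prod_t, alpha_prod_t_next):
    """Deterministic DDIM map from timestep t to the noisier timestep t + step_ratio."""
    beta_prod_t = 1.0 - alpha_prod_t
    pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
    # same formula (12) as DDIMScheduler.step, but the target alpha belongs to a later timestep
    pred_sample_direction = (1.0 - alpha_prod_t_next) ** 0.5 * model_output
    return alpha_prod_t_next**0.5 * pred_original_sample + pred_sample_direction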
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py
index a3917f57615f8..167ae05b5b169 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py
@@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -123,31 +123,35 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- variance_type: str="fixed_small",
- clip_sample: bool=True,
- prediction_type: str="epsilon",
- thresholding: bool=False,
- dynamic_thresholding_ratio: float=0.995,
- clip_sample_range: float=1.0,
- sample_max_value: float=1.0, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ variance_type: str = "fixed_small",
+ clip_sample: bool = True,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ clip_sample_range: float = 1.0,
+ sample_max_value: float = 1.0,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -156,8 +160,7 @@ def __init__(
betas = paddle.linspace(-6, 6, num_train_timesteps)
self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -169,14 +172,11 @@ def __init__(
# setable values
self.custom_timesteps = False
self.num_inference_steps = None
- self.timesteps = paddle.to_tensor(
- np.arange(0, num_train_timesteps)[::-1].copy())
+ self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy())
self.variance_type = variance_type
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -191,9 +191,10 @@ def scale_model_input(self,
return sample
def set_timesteps(
- self,
- num_inference_steps: Optional[int]=None,
- timesteps: Optional[List[int]]=None, ):
+ self,
+ num_inference_steps: Optional[int] = None,
+ timesteps: Optional[List[int]] = None,
+ ):
"""
Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -207,20 +208,18 @@ def set_timesteps(
must be `None`.
"""
if num_inference_steps is not None and timesteps is not None:
- raise ValueError(
- "Can only pass one of `num_inference_steps` or `custom_timesteps`."
- )
+ raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
if timesteps is not None:
for i in range(1, len(timesteps)):
if timesteps[i] >= timesteps[i - 1]:
- raise ValueError(
- "`custom_timesteps` must be in descending order.")
+ raise ValueError("`custom_timesteps` must be in descending order.")
if timesteps[0] >= self.config.num_train_timesteps:
raise ValueError(
f"`timesteps` must start before `self.config.train_timesteps`:"
- f" {self.config.num_train_timesteps}.")
+ f" {self.config.num_train_timesteps}."
+ )
timesteps = np.array(timesteps, dtype=np.int64)
self.custom_timesteps = True
@@ -229,11 +228,11 @@ def set_timesteps(
raise ValueError(
f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
- f" maximal {self.config.num_train_timesteps} timesteps.")
+ f" maximal {self.config.num_train_timesteps} timesteps."
+ )
self.num_inference_steps = num_inference_steps
step_ratio = self.config.num_train_timesteps // self.num_inference_steps
- timesteps = ((np.arange(0, num_inference_steps) * step_ratio)
- .round()[::-1].copy().astype(np.int64))
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
self.custom_timesteps = False
self.timesteps = paddle.to_tensor(timesteps)
@@ -242,8 +241,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None):
prev_t = self.previous_timestep(t)
alpha_prod_t = self.alphas_cumprod[t]
- alpha_prod_t_prev = self.alphas_cumprod[
- prev_t] if prev_t >= 0 else self.one
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
# For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
@@ -301,8 +299,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
- s = paddle.quantile(
- abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
+ s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
# paddle.clip does not support min > max
if self.config.sample_max_value < 1:
s = paddle.ones_like(s) * self.config.sample_max_value
@@ -310,11 +307,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
s = paddle.clip(
s, min=1, max=self.config.sample_max_value
) # When clip to min=1, equivalent to standard clipping to [-1, 1]
- s = s.unsqueeze(
- 1) # (batch_size, 1) because clip will broadcast along axis=0
- sample = (
- paddle.clip(sample, -s, s) /
- s) # "we threshold xt0 to the range [-s, s] and then divide by s"
+ s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0
+ sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = paddle.reshape(sample, [batch_size, channels, height, width])
sample = paddle.cast(sample, dtype)
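Note: the _threshold_sample hunk above implements Imagen-style dynamic thresholding (arXiv:2205.11487). A standalone NumPy sketch of the same idea, with illustrative argument names:

import numpy as np

def threshold_sample(x0, dynamic_thresholding_ratio=0.995, sample_max_value=1.0):
    """Clamp each sample of x0 to its high quantile s (at least 1) and rescale to [-1, 1]."""
    batch_size = x0.shape[0]
    abs_flat = np.abs(x0.reshape(batch_size, -1))
    s = np.quantile(abs_flat, dynamic_thresholding_ratio, axis=1)
    if sample_max_value < 1:
        s = np.full_like(s, sample_max_value)  # mirrors the paddle.clip(min > max) workaround
    else:
        s = np.clip(s, 1.0, sample_max_value)
    s = s.reshape(batch_size, *([1] * (x0.ndim - 1)))  # broadcast over the remaining axes
    return np.clip(x0, -s, s) / s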
@@ -322,12 +316,13 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
return sample
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- generator=None,
- return_dict: bool=True, ) -> Union[DDPMSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ generator=None,
+ return_dict: bool = True,
+ ) -> Union[DDPMSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -349,19 +344,17 @@ def step(
t = timestep
prev_t = self.previous_timestep(t)
- if model_output.shape[1] == sample.shape[
- 1] * 2 and self.variance_type in [
- "learned",
- "learned_range",
- ]:
+ if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in [
+ "learned",
+ "learned_range",
+ ]:
model_output, predicted_variance = model_output.chunk(2, axis=1)
else:
predicted_variance = None
# 1. compute alphas, betas
alpha_prod_t = self.alphas_cumprod[t]
- alpha_prod_t_prev = self.alphas_cumprod[
- prev_t] if prev_t >= 0 else self.one
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
current_alpha_t = alpha_prod_t / alpha_prod_t_prev
@@ -370,17 +363,16 @@ def step(
# 2. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
if self.config.prediction_type == "epsilon":
- pred_original_sample = (sample - beta_prod_t**
- (0.5) * model_output) / alpha_prod_t**(0.5)
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
elif self.config.prediction_type == "v_prediction":
- pred_original_sample = (alpha_prod_t**0.5) * sample - (
- beta_prod_t**0.5) * model_output
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
- " `v_prediction` for the DDPMScheduler.")
+ " `v_prediction` for the DDPMScheduler."
+ )
# 3. Clip or threshold "predicted x_0"
if self.config.thresholding:
@@ -389,84 +381,69 @@ def step(
pred_original_sample = paddle.clip(
pred_original_sample,
-self.config.clip_sample_range,
- self.config.clip_sample_range, )
+ self.config.clip_sample_range,
+ )
# 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
# See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
- pred_original_sample_coeff = (alpha_prod_t_prev
- **(0.5) * current_beta_t) / beta_prod_t
- current_sample_coeff = current_alpha_t**(
- 0.5) * beta_prod_t_prev / beta_prod_t
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
+ current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
# 5. Compute predicted previous sample µ_t
# See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
- pred_prev_sample = (pred_original_sample_coeff * pred_original_sample +
- current_sample_coeff * sample)
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
# 6. Add noise
variance = 0
if t > 0:
- variance_noise = randn_tensor(
- model_output.shape,
- generator=generator,
- dtype=model_output.dtype)
+ variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype)
if self.variance_type == "fixed_small_log":
- variance = (self._get_variance(
- t, predicted_variance=predicted_variance) * variance_noise)
+ variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
elif self.variance_type == "learned_range":
- variance = self._get_variance(
- t, predicted_variance=predicted_variance)
+ variance = self._get_variance(t, predicted_variance=predicted_variance)
variance = paddle.exp(0.5 * variance) * variance_noise
else:
- variance = (self._get_variance(
- t, predicted_variance=predicted_variance)
- **0.5) * variance_noise
+ variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
pred_prev_sample = pred_prev_sample + variance
if not return_dict:
- return (pred_prev_sample, )
+ return (pred_prev_sample,)
- return DDPMSchedulerOutput(
- prev_sample=pred_prev_sample,
- pred_original_sample=pred_original_sample)
+ return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
- def get_velocity(self,
- sample: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor) -> paddle.Tensor:
+ def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(sample.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(sample.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
@@ -485,9 +462,9 @@ def previous_timestep(self, timestep):
else:
prev_t = self.timesteps[index + 1]
else:
- num_inference_steps = (self.num_inference_steps
- if self.num_inference_steps else
- self.config.num_train_timesteps)
+ num_inference_steps = (
+ self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+ )
prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
return prev_t
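Note: the scheduling_ddpm.py changes above are again formatting-only; the ancestral update they implement is the posterior mean of formula (7) in the DDPM paper (arXiv:2006.11239) plus "fixed_small" noise. A scalar NumPy sketch, assuming prediction_type="epsilon" and t > 0; names are illustrative:

import numpy as np

def ddpm_step(model_output, sample, alpha_prod_t, alpha_prod_t_prev, rng=None):
    rng = rng or np.random.default_rng(0)
    beta_prod_t = 1.0 - alpha_prod_t
    beta_prod_t_prev = 1.0 - alpha_prod_t_prev
    current_alpha_t = alpha_prod_t / alpha_prod_t_prev
    current_beta_t = 1.0 - current_alpha_t
    # predicted x_0, formula (15)
    pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
    # posterior mean coefficients, formula (7)
    coeff_x0 = alpha_prod_t_prev**0.5 * current_beta_t / beta_prod_t
    coeff_xt = current_alpha_t**0.5 * beta_prod_t_prev / beta_prod_t
    mean = coeff_x0 * pred_original_sample + coeff_xt * sample
    # "fixed_small" posterior variance, formula (7)
    variance = beta_prod_t_prev / beta_prod_t * current_beta_t
    return mean + variance**0.5 * rng.standard_normal(np.shape(sample))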
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py
index 845b209a9bc2d..7d4b5802fb447 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py
@@ -23,8 +23,7 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
@@ -47,7 +46,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -113,38 +112,41 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[np.ndarray]=None,
- solver_order: int=2,
- prediction_type: str="epsilon",
- thresholding: bool=False,
- dynamic_thresholding_ratio: float=0.995,
- sample_max_value: float=1.0,
- algorithm_type: str="deis",
- solver_type: str="logrho",
- lower_order_final: bool=True, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ solver_order: int = 2,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ algorithm_type: str = "deis",
+ solver_type: str = "logrho",
+ lower_order_final: bool = True,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -161,23 +163,17 @@ def __init__(
if algorithm_type in ["dpmsolver", "dpmsolver++"]:
self.register_to_config(algorithm_type="deis")
else:
- raise NotImplementedError(
- f"{algorithm_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
if solver_type not in ["logrho"]:
if solver_type in ["midpoint", "heun", "bh1", "bh2"]:
self.register_to_config(solver_type="logrho")
else:
- raise NotImplementedError(
- f"solver type {solver_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"solver type {solver_type} does is not implemented for {self.__class__}")
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=np.float32)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps)
self.model_outputs = [None] * solver_order
self.lower_order_nums = 0
@@ -190,9 +186,12 @@ def set_timesteps(self, num_inference_steps: int):
num_inference_steps (`int`):
the number of diffusion steps used when generating samples with a pre-trained model.
"""
- timesteps = (np.linspace(0, self.config.num_train_timesteps - 1,
- num_inference_steps + 1).round()[::-1][:-1]
- .copy().astype(np.int64))
+ timesteps = (
+ np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ .round()[::-1][:-1]
+ .copy()
+ .astype(np.int64)
+ )
# when num_inference_steps == num_train_timesteps, we can end up with
# duplicates in timesteps.
@@ -203,7 +202,9 @@ def set_timesteps(self, num_inference_steps: int):
self.num_inference_steps = len(timesteps)
- self.model_outputs = [None, ] * self.config.solver_order
+ self.model_outputs = [
+ None,
+ ] * self.config.solver_order
self.lower_order_nums = 0
def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
@@ -228,8 +229,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
- s = paddle.quantile(
- abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
+ s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
# paddle.clip does not support min > max
if self.config.sample_max_value < 1:
s = paddle.ones_like(s) * self.config.sample_max_value
@@ -237,21 +237,15 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
s = paddle.clip(
s, min=1, max=self.config.sample_max_value
) # When clip to min=1, equivalent to standard clipping to [-1, 1]
- s = s.unsqueeze(
- 1) # (batch_size, 1) because clip will broadcast along axis=0
- sample = (
- paddle.clip(sample, -s, s) /
- s) # "we threshold xt0 to the range [-s, s] and then divide by s"
+ s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0
+ sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = paddle.reshape(sample, [batch_size, channels, height, width])
sample = paddle.cast(sample, dtype)
return sample
- def convert_model_output(self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor) -> paddle.Tensor:
+ def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor:
"""
Convert the model output to the corresponding type that the algorithm DEIS needs.
@@ -275,7 +269,8 @@ def convert_model_output(self,
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the DEISMultistepScheduler.")
+ " `v_prediction` for the DEISMultistepScheduler."
+ )
if self.config.thresholding:
x0_pred = self._threshold_sample(x0_pred)
@@ -287,11 +282,12 @@ def convert_model_output(self,
raise NotImplementedError("only support log-rho multistep deis now")
def deis_first_order_update(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the first-order DEIS (equivalent to DDIM).
@@ -305,24 +301,23 @@ def deis_first_order_update(
Returns:
`paddle.Tensor`: the sample tensor at the previous timestep.
"""
- lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[
- timestep]
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
sigma_t, _ = self.sigma_t[prev_timestep], self.sigma_t[timestep]
h = lambda_t - lambda_s
if self.config.algorithm_type == "deis":
- x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0
- )) * model_output
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output
else:
raise NotImplementedError("only support log-rho multistep deis now")
return x_t
def multistep_deis_second_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the second-order multistep DEIS.
@@ -342,28 +337,28 @@ def multistep_deis_second_order_update(
alpha_t, alpha_s0, alpha_s1 = (
self.alpha_t[t],
self.alpha_t[s0],
- self.alpha_t[s1], )
+ self.alpha_t[s1],
+ )
sigma_t, sigma_s0, sigma_s1 = (
self.sigma_t[t],
self.sigma_t[s0],
- self.sigma_t[s1], )
+ self.sigma_t[s1],
+ )
rho_t, rho_s0, rho_s1 = (
sigma_t / alpha_t,
sigma_s0 / alpha_s0,
- sigma_s1 / alpha_s1, )
+ sigma_s1 / alpha_s1,
+ )
if self.config.algorithm_type == "deis":
def ind_fn(t, b, c):
# Integrate[(log(t) - log(c)) / (log(b) - log(c)), {t}]
- return (t * (-paddle.log(c) + paddle.log(t) - 1) /
- (paddle.log(b) - paddle.log(c)))
+ return t * (-paddle.log(c) + paddle.log(t) - 1) / (paddle.log(b) - paddle.log(c))
- coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0,
- rho_s1)
- coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1,
- rho_s0)
+ coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1)
+ coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0)
x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1)
return x_t
@@ -371,11 +366,12 @@ def ind_fn(t, b, c):
raise NotImplementedError("only support log-rho multistep deis now")
def multistep_deis_third_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the third-order multistep DEIS.
@@ -394,57 +390,60 @@ def multistep_deis_third_order_update(
prev_timestep,
timestep_list[-1],
timestep_list[-2],
- timestep_list[-3], )
- m0, m1, m2 = model_output_list[-1], model_output_list[
- -2], model_output_list[-3]
+ timestep_list[-3],
+ )
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
alpha_t, alpha_s0, alpha_s1, alpha_s2 = (
self.alpha_t[t],
self.alpha_t[s0],
self.alpha_t[s1],
- self.alpha_t[s2], )
+ self.alpha_t[s2],
+ )
sigma_t, sigma_s0, sigma_s1, simga_s2 = (
self.sigma_t[t],
self.sigma_t[s0],
self.sigma_t[s1],
- self.sigma_t[s2], )
+ self.sigma_t[s2],
+ )
rho_t, rho_s0, rho_s1, rho_s2 = (
sigma_t / alpha_t,
sigma_s0 / alpha_s0,
sigma_s1 / alpha_s1,
- simga_s2 / alpha_s2, )
+ simga_s2 / alpha_s2,
+ )
if self.config.algorithm_type == "deis":
def ind_fn(t, b, c, d):
# Integrate[(log(t) - log(c))(log(t) - log(d)) / (log(b) - log(c))(log(b) - log(d)), {t}]
numerator = t * (
- paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1
- ) - paddle.log(d) * paddle.log(t) +
- paddle.log(d) + paddle.log(t)**2 - 2 * paddle.log(t) + 2)
- denominator = (paddle.log(b) - paddle.log(c)) * (
- paddle.log(b) - paddle.log(d))
+ paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1)
+ - paddle.log(d) * paddle.log(t)
+ + paddle.log(d)
+ + paddle.log(t) ** 2
+ - 2 * paddle.log(t)
+ + 2
+ )
+ denominator = (paddle.log(b) - paddle.log(c)) * (paddle.log(b) - paddle.log(d))
return numerator / denominator
- coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(
- rho_s0, rho_s0, rho_s1, rho_s2)
- coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(
- rho_s0, rho_s1, rho_s2, rho_s0)
- coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(
- rho_s0, rho_s2, rho_s0, rho_s1)
+ coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(rho_s0, rho_s0, rho_s1, rho_s2)
+ coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s2, rho_s0)
+ coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s2, rho_s0, rho_s1)
- x_t = alpha_t * (
- sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2)
+ x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2)
return x_t
else:
raise NotImplementedError("only support log-rho multistep deis now")
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the multistep DEIS.
@@ -470,29 +469,26 @@ def step(
step_index = len(self.timesteps) - 1
else:
step_index = step_index.item()
- prev_timestep = (0 if step_index == len(self.timesteps) - 1 else
- self.timesteps[step_index + 1])
- lower_order_final = ((step_index == len(self.timesteps) - 1) and
- self.config.lower_order_final and
- len(self.timesteps) < 15)
- lower_order_second = ((step_index == len(self.timesteps) - 2) and
- self.config.lower_order_final and
- len(self.timesteps) < 15)
+ prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+ lower_order_final = (
+ (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
+ lower_order_second = (
+ (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
model_output = self.convert_model_output(model_output, timestep, sample)
for i in range(self.config.solver_order - 1):
self.model_outputs[i] = self.model_outputs[i + 1]
self.model_outputs[-1] = model_output
- if (self.config.solver_order == 1 or self.lower_order_nums < 1 or
- lower_order_final):
- prev_sample = self.deis_first_order_update(model_output, timestep,
- prev_timestep, sample)
- elif (self.config.solver_order == 2 or self.lower_order_nums < 2 or
- lower_order_second):
+ if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+ prev_sample = self.deis_first_order_update(model_output, timestep, prev_timestep, sample)
+ elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
timestep_list = [self.timesteps[step_index - 1], timestep]
prev_sample = self.multistep_deis_second_order_update(
- self.model_outputs, timestep_list, prev_timestep, sample)
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
else:
timestep_list = [
self.timesteps[step_index - 2],
@@ -500,18 +496,18 @@ def step(
timestep,
]
prev_sample = self.multistep_deis_third_order_update(
- self.model_outputs, timestep_list, prev_timestep, sample)
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
if self.lower_order_nums < self.config.solver_order:
self.lower_order_nums += 1
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -525,26 +521,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args,
return sample
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self):
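Note: scheduling_deis_multistep.py converts every model output to an epsilon-like quantity for the "deis" (log-rho) algorithm and then applies exponential-integrator updates; the first-order one is simply DDIM written with lambda = log(alpha) - log(sigma), where alpha_t = sqrt(alphas_cumprod[t]) and sigma_t = sqrt(1 - alphas_cumprod[t]). A scalar sketch with illustrative names and toy inputs:

import math

def deis_first_order_update(eps, sample, alpha_s, sigma_s, alpha_t, sigma_t):
    """x_s (current, noisier step) -> x_t (target step) for log-rho DEIS; equivalent to DDIM."""
    h = (math.log(alpha_t) - math.log(sigma_t)) - (math.log(alpha_s) - math.log(sigma_s))
    # x_t = (alpha_t / alpha_s) * x_s - sigma_t * (exp(h) - 1) * eps
    return (alpha_t / alpha_s) * sample - sigma_t * math.expm1(h) * eps

The higher-order updates above follow the same pattern but weight the last two or three converted outputs with the ind_fn integrals.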
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py
index 9b360646172d5..5ebc674044afa 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -22,8 +22,7 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
@@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -127,39 +126,42 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- solver_order: int=2,
- prediction_type: str="epsilon",
- thresholding: bool=False,
- dynamic_thresholding_ratio: float=0.995,
- sample_max_value: float=1.0,
- algorithm_type: str="dpmsolver++",
- solver_type: str="midpoint",
- lower_order_final: bool=True,
- use_karras_sigmas: Optional[bool]=False, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ solver_order: int = 2,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ algorithm_type: str = "dpmsolver++",
+ solver_type: str = "midpoint",
+ lower_order_final: bool = True,
+ use_karras_sigmas: Optional[bool] = False,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -176,23 +178,17 @@ def __init__(
if algorithm_type == "deis":
self.register_to_config(algorithm_type="dpmsolver++")
else:
- raise NotImplementedError(
- f"{algorithm_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
if solver_type not in ["midpoint", "heun"]:
if solver_type in ["logrho", "bh1", "bh2"]:
self.register_to_config(solver_type="midpoint")
else:
- raise NotImplementedError(
- f"{solver_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=np.float32)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps)
self.model_outputs = [None] * solver_order
self.lower_order_nums = 0
@@ -206,18 +202,17 @@ def set_timesteps(self, num_inference_steps: int):
num_inference_steps (`int`):
the number of diffusion steps used when generating samples with a pre-trained model.
"""
- timesteps = (np.linspace(0, self.config.num_train_timesteps - 1,
- num_inference_steps + 1).round()[::-1][:-1]
- .copy().astype(np.int64))
+ timesteps = (
+ np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ .round()[::-1][:-1]
+ .copy()
+ .astype(np.int64)
+ )
if self.use_karras_sigmas:
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)
- **0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
log_sigmas = np.log(sigmas)
- sigmas = self._convert_to_karras(
- in_sigmas=sigmas, num_inference_steps=num_inference_steps)
- timesteps = np.array(
- [self._sigma_to_t(sigma, log_sigmas)
- for sigma in sigmas]).round()
+ sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
timesteps = np.flip(timesteps).copy().astype(np.int64)
# when num_inference_steps == num_train_timesteps, we can end up with
@@ -229,7 +224,9 @@ def set_timesteps(self, num_inference_steps: int):
self.num_inference_steps = len(timesteps)
- self.model_outputs = [None, ] * self.config.solver_order
+ self.model_outputs = [
+ None,
+ ] * self.config.solver_order
self.lower_order_nums = 0
def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
@@ -254,8 +251,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
- s = paddle.quantile(
- abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
+ s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
# paddle.clip does not support min > max
if self.config.sample_max_value < 1:
s = paddle.ones_like(s) * self.config.sample_max_value
@@ -263,11 +259,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
s = paddle.clip(
s, min=1, max=self.config.sample_max_value
) # When clip to min=1, equivalent to standard clipping to [-1, 1]
- s = s.unsqueeze(
- 1) # (batch_size, 1) because clip will broadcast along axis=0
- sample = (
- paddle.clip(sample, -s, s) /
- s) # "we threshold xt0 to the range [-s, s] and then divide by s"
+ s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0
+ sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = paddle.reshape(sample, [batch_size, channels, height, width])
sample = paddle.cast(sample, dtype)
@@ -282,9 +275,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
dists = log_sigma - log_sigmas[:, np.newaxis]
# get sigmas range
- low_idx = (np.cumsum(
- (dists >= 0), axis=0).argmax(axis=0)
- .clip(max=log_sigmas.shape[0] - 2))
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
high_idx = low_idx + 1
low = log_sigmas[low_idx]
@@ -299,8 +290,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
t = t.reshape(sigma.shape)
return t
- def _convert_to_karras(self, in_sigmas: paddle.Tensor,
- num_inference_steps) -> paddle.Tensor:
+ def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
sigma_min = in_sigmas[-1].item()
@@ -308,15 +298,12 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor,
rho = 7.0 # 7.0 is the value used in the paper
ramp = np.linspace(0, 1, num_inference_steps)
- min_inv_rho = sigma_min**(1 / rho)
- max_inv_rho = sigma_max**(1 / rho)
- sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
return sigmas
- def convert_model_output(self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor) -> paddle.Tensor:
+ def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor:
"""
Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
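Note: the hunk above also reflows _sigma_to_t and _convert_to_karras; the sigma schedule they build is the one from Karras et al. (2022): interpolate between sigma_max and sigma_min in sigma^(1/rho) space with rho = 7, then map each sigma back to a training timestep. A small NumPy sketch of the schedule construction, with toy values only:

import numpy as np

def karras_sigmas(sigma_min, sigma_max, num_inference_steps, rho=7.0):
    ramp = np.linspace(0, 1, num_inference_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

# e.g. karras_sigmas(0.03, 14.6, 10) gives a decreasing schedule that spends most steps at low noise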
@@ -339,19 +326,18 @@ def convert_model_output(self,
# DPM-Solver++ needs to solve an integral of the data prediction model.
if self.config.algorithm_type == "dpmsolver++":
if self.config.prediction_type == "epsilon":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = (sample - sigma_t * model_output) / alpha_t
elif self.config.prediction_type == "sample":
x0_pred = model_output
elif self.config.prediction_type == "v_prediction":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = alpha_t * sample - sigma_t * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the DPMSolverMultistepScheduler.")
+ " `v_prediction` for the DPMSolverMultistepScheduler."
+ )
if self.config.thresholding:
x0_pred = self._threshold_sample(x0_pred)
@@ -362,26 +348,26 @@ def convert_model_output(self,
if self.config.prediction_type == "epsilon":
return model_output
elif self.config.prediction_type == "sample":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = (sample - alpha_t * model_output) / sigma_t
return epsilon
elif self.config.prediction_type == "v_prediction":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = alpha_t * model_output + sigma_t * sample
return epsilon
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the DPMSolverMultistepScheduler.")
+ " `v_prediction` for the DPMSolverMultistepScheduler."
+ )
def dpm_solver_first_order_update(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the first-order DPM-Solver (equivalent to DDIM).
@@ -397,25 +383,23 @@ def dpm_solver_first_order_update(
Returns:
`paddle.Tensor`: the sample tensor at the previous timestep.
"""
- lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[
- timestep]
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
h = lambda_t - lambda_s
if self.config.algorithm_type == "dpmsolver++":
- x_t = (sigma_t / sigma_s) * sample - (alpha_t * (
- paddle.exp(-h) - 1.0)) * model_output
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output
elif self.config.algorithm_type == "dpmsolver":
- x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0
- )) * model_output
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output
return x_t
def multistep_dpm_solver_second_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the second-order multistep DPM-Solver.
@@ -435,7 +419,8 @@ def multistep_dpm_solver_second_order_update(
lambda_t, lambda_s0, lambda_s1 = (
self.lambda_t[t],
self.lambda_t[s0],
- self.lambda_t[s1], )
+ self.lambda_t[s1],
+ )
alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
@@ -444,31 +429,40 @@ def multistep_dpm_solver_second_order_update(
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2211.01095 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((sigma_t / sigma_s0) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 *
- (alpha_t * (paddle.exp(-h) - 1.0)) * D1)
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1
+ )
elif self.config.solver_type == "heun":
- x_t = ((sigma_t / sigma_s0) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * (
- (paddle.exp(-h) - 1.0) / h + 1.0)) * D1)
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1
+ )
elif self.config.algorithm_type == "dpmsolver":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((alpha_t / alpha_s0) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - 0.5 *
- (sigma_t * (paddle.exp(h) - 1.0)) * D1)
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1
+ )
elif self.config.solver_type == "heun":
- x_t = ((alpha_t / alpha_s0) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * (
- (paddle.exp(h) - 1.0) / h - 1.0)) * D1)
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1
+ )
return x_t
def multistep_dpm_solver_third_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the third-order multistep DPM-Solver.
@@ -487,14 +481,15 @@ def multistep_dpm_solver_third_order_update(
prev_timestep,
timestep_list[-1],
timestep_list[-2],
- timestep_list[-3], )
- m0, m1, m2 = model_output_list[-1], model_output_list[
- -2], model_output_list[-3]
+ timestep_list[-3],
+ )
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
self.lambda_t[t],
self.lambda_t[s0],
self.lambda_t[s1],
- self.lambda_t[s2], )
+ self.lambda_t[s2],
+ )
alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
@@ -505,24 +500,29 @@ def multistep_dpm_solver_third_order_update(
D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
- x_t = ((sigma_t / sigma_s0) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * (
- (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * (
- (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2)
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1
+ - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+ )
elif self.config.algorithm_type == "dpmsolver":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
- x_t = ((alpha_t / alpha_s0) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * (
- (paddle.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * (
- (paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2)
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1
+ - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+ )
return x_t
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the multistep DPM-Solver.
@@ -548,29 +548,26 @@ def step(
step_index = len(self.timesteps) - 1
else:
step_index = step_index.item()
- prev_timestep = (0 if step_index == len(self.timesteps) - 1 else
- self.timesteps[step_index + 1])
- lower_order_final = ((step_index == len(self.timesteps) - 1) and
- self.config.lower_order_final and
- len(self.timesteps) < 15)
- lower_order_second = ((step_index == len(self.timesteps) - 2) and
- self.config.lower_order_final and
- len(self.timesteps) < 15)
+ prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+ lower_order_final = (
+ (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
+ lower_order_second = (
+ (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
model_output = self.convert_model_output(model_output, timestep, sample)
for i in range(self.config.solver_order - 1):
self.model_outputs[i] = self.model_outputs[i + 1]
self.model_outputs[-1] = model_output
- if (self.config.solver_order == 1 or self.lower_order_nums < 1 or
- lower_order_final):
- prev_sample = self.dpm_solver_first_order_update(
- model_output, timestep, prev_timestep, sample)
- elif (self.config.solver_order == 2 or self.lower_order_nums < 2 or
- lower_order_second):
+ if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+ prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample)
+ elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
timestep_list = [self.timesteps[step_index - 1], timestep]
prev_sample = self.multistep_dpm_solver_second_order_update(
- self.model_outputs, timestep_list, prev_timestep, sample)
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
else:
timestep_list = [
self.timesteps[step_index - 2],
@@ -578,18 +575,18 @@ def step(
timestep,
]
prev_sample = self.multistep_dpm_solver_third_order_update(
- self.model_outputs, timestep_list, prev_timestep, sample)
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
if self.lower_order_nums < self.config.solver_order:
self.lower_order_nums += 1
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -603,26 +600,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args,
return sample
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self):
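The multistep hunks above only restyle the DPM-Solver update formulas, so here is a minimal NumPy sketch (an illustration, not the PPDiffusers API) of the first-order and midpoint second-order DPM-Solver++ updates they implement. The D0/D1 finite differences are taken from the DPM-Solver++ paper because their definitions fall outside the hunks shown here.

import numpy as np

betas = np.linspace(1e-4, 2e-2, 1000)       # "linear" beta schedule
alphas_cumprod = np.cumprod(1.0 - betas)
alpha = np.sqrt(alphas_cumprod)             # signal coefficient alpha_t
sigma = np.sqrt(1.0 - alphas_cumprod)       # noise coefficient sigma_t
lam = np.log(alpha) - np.log(sigma)         # half log-SNR lambda_t


def first_order(x, x0_pred, s, t):
    # DPM-Solver++ order 1 (equivalent to DDIM): move the sample from timestep s
    # to the less noisy timestep t using the data prediction x0_pred.
    h = lam[t] - lam[s]
    return (sigma[t] / sigma[s]) * x - alpha[t] * (np.exp(-h) - 1.0) * x0_pred


def second_order_midpoint(x, x0_s0, x0_s1, s0, s1, t):
    # Multistep order 2, "midpoint" variant: the previous prediction x0_s1 forms
    # a finite-difference correction D1 on top of the order-1 term D0.
    h, h_0 = lam[t] - lam[s0], lam[s0] - lam[s1]
    r0 = h_0 / h
    D0, D1 = x0_s0, (1.0 / r0) * (x0_s0 - x0_s1)
    return (
        (sigma[t] / sigma[s0]) * x
        - alpha[t] * (np.exp(-h) - 1.0) * D0
        - 0.5 * alpha[t] * (np.exp(-h) - 1.0) * D1
    )


# Toy usage: one step from t=999 to t=949 with a dummy x0 prediction.
x = np.random.randn(4, 4)
x_prev = first_order(x, np.zeros_like(x), s=999, t=949)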
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 499d2e90373b9..0e99f01aa230b 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -22,8 +22,7 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
@@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -123,38 +122,41 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[np.ndarray]=None,
- solver_order: int=2,
- prediction_type: str="epsilon",
- thresholding: bool=False,
- dynamic_thresholding_ratio: float=0.995,
- sample_max_value: float=1.0,
- algorithm_type: str="dpmsolver++",
- solver_type: str="midpoint",
- lower_order_final: bool=True, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ solver_order: int = 2,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ algorithm_type: str = "dpmsolver++",
+ solver_type: str = "midpoint",
+ lower_order_final: bool = True,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -171,22 +173,16 @@ def __init__(
if algorithm_type == "deis":
self.register_to_config(algorithm_type="dpmsolver++")
else:
- raise NotImplementedError(
- f"{algorithm_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
if solver_type not in ["midpoint", "heun"]:
if solver_type in ["logrho", "bh1", "bh2"]:
self.register_to_config(solver_type="midpoint")
else:
- raise NotImplementedError(
- f"{solver_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=np.float32)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps)
self.model_outputs = [None] * solver_order
self.sample = None
@@ -248,8 +244,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
- s = paddle.quantile(
- abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
+ s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
         # paddle.clip does not support min > max
if self.config.sample_max_value < 1:
s = paddle.ones_like(s) * self.config.sample_max_value
@@ -257,11 +252,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
s = paddle.clip(
s, min=1, max=self.config.sample_max_value
) # When clip to min=1, equivalent to standard clipping to [-1, 1]
- s = s.unsqueeze(
- 1) # (batch_size, 1) because clip will broadcast along axis=0
- sample = (
- paddle.clip(sample, -s, s) /
- s) # "we threshold xt0 to the range [-s, s] and then divide by s"
+ s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0
+ sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = paddle.reshape(sample, [batch_size, channels, height, width])
sample = paddle.cast(sample, dtype)
@@ -277,18 +269,18 @@ def set_timesteps(self, num_inference_steps: int):
the number of diffusion steps used when generating samples with a pre-trained model.
"""
self.num_inference_steps = num_inference_steps
- timesteps = (np.linspace(0, self.config.num_train_timesteps - 1,
- num_inference_steps + 1).round()[::-1][:-1]
- .copy().astype(np.int64))
+ timesteps = (
+ np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ .round()[::-1][:-1]
+ .copy()
+ .astype(np.int64)
+ )
self.timesteps = paddle.to_tensor(timesteps)
self.model_outputs = [None] * self.config.solver_order
self.sample = None
self.orders = self.get_order_list(num_inference_steps)
- def convert_model_output(self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor) -> paddle.Tensor:
+ def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor:
"""
Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
@@ -311,19 +303,18 @@ def convert_model_output(self,
# DPM-Solver++ needs to solve an integral of the data prediction model.
if self.config.algorithm_type == "dpmsolver++":
if self.config.prediction_type == "epsilon":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = (sample - sigma_t * model_output) / alpha_t
elif self.config.prediction_type == "sample":
x0_pred = model_output
elif self.config.prediction_type == "v_prediction":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = alpha_t * sample - sigma_t * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the DPMSolverSinglestepScheduler.")
+ " `v_prediction` for the DPMSolverSinglestepScheduler."
+ )
if self.config.thresholding:
x0_pred = self._threshold_sample(x0_pred)
@@ -334,26 +325,26 @@ def convert_model_output(self,
if self.config.prediction_type == "epsilon":
return model_output
elif self.config.prediction_type == "sample":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = (sample - alpha_t * model_output) / sigma_t
return epsilon
elif self.config.prediction_type == "v_prediction":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = alpha_t * model_output + sigma_t * sample
return epsilon
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the DPMSolverSinglestepScheduler.")
+ " `v_prediction` for the DPMSolverSinglestepScheduler."
+ )
def dpm_solver_first_order_update(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the first-order DPM-Solver (equivalent to DDIM).
@@ -369,25 +360,23 @@ def dpm_solver_first_order_update(
Returns:
`paddle.Tensor`: the sample tensor at the previous timestep.
"""
- lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[
- timestep]
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
h = lambda_t - lambda_s
if self.config.algorithm_type == "dpmsolver++":
- x_t = (sigma_t / sigma_s) * sample - (alpha_t * (
- paddle.exp(-h) - 1.0)) * model_output
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output
elif self.config.algorithm_type == "dpmsolver":
- x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0
- )) * model_output
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output
return x_t
def singlestep_dpm_solver_second_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the second-order singlestep DPM-Solver.
@@ -409,7 +398,8 @@ def singlestep_dpm_solver_second_order_update(
lambda_t, lambda_s0, lambda_s1 = (
self.lambda_t[t],
self.lambda_t[s0],
- self.lambda_t[s1], )
+ self.lambda_t[s1],
+ )
alpha_t, alpha_s1 = self.alpha_t[t], self.alpha_t[s1]
sigma_t, sigma_s1 = self.sigma_t[t], self.sigma_t[s1]
h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1
@@ -418,31 +408,40 @@ def singlestep_dpm_solver_second_order_update(
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2211.01095 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((sigma_t / sigma_s1) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 *
- (alpha_t * (paddle.exp(-h) - 1.0)) * D1)
+ x_t = (
+ (sigma_t / sigma_s1) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1
+ )
elif self.config.solver_type == "heun":
- x_t = ((sigma_t / sigma_s1) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * (
- (paddle.exp(-h) - 1.0) / h + 1.0)) * D1)
+ x_t = (
+ (sigma_t / sigma_s1) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1
+ )
elif self.config.algorithm_type == "dpmsolver":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((alpha_t / alpha_s1) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - 0.5 *
- (sigma_t * (paddle.exp(h) - 1.0)) * D1)
+ x_t = (
+ (alpha_t / alpha_s1) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1
+ )
elif self.config.solver_type == "heun":
- x_t = ((alpha_t / alpha_s1) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * (
- (paddle.exp(h) - 1.0) / h - 1.0)) * D1)
+ x_t = (
+ (alpha_t / alpha_s1) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1
+ )
return x_t
def singlestep_dpm_solver_third_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the third-order singlestep DPM-Solver.
@@ -463,14 +462,15 @@ def singlestep_dpm_solver_third_order_update(
prev_timestep,
timestep_list[-1],
timestep_list[-2],
- timestep_list[-3], )
- m0, m1, m2 = model_output_list[-1], model_output_list[
- -2], model_output_list[-3]
+ timestep_list[-3],
+ )
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
self.lambda_t[t],
self.lambda_t[s0],
self.lambda_t[s1],
- self.lambda_t[s2], )
+ self.lambda_t[s2],
+ )
alpha_t, alpha_s2 = self.alpha_t[t], self.alpha_t[s2]
sigma_t, sigma_s2 = self.sigma_t[t], self.sigma_t[s2]
h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2
@@ -482,35 +482,43 @@ def singlestep_dpm_solver_third_order_update(
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((sigma_t / sigma_s2) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * (
- (paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1)
+ x_t = (
+ (sigma_t / sigma_s2) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1
+ )
elif self.config.solver_type == "heun":
x_t = (
- (sigma_t / sigma_s2) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * (
- (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * (
- (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2)
+ (sigma_t / sigma_s2) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1
+ - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+ )
elif self.config.algorithm_type == "dpmsolver":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((alpha_t / alpha_s2) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * (
- (paddle.exp(h) - 1.0) / h - 1.0)) * D1_1)
+ x_t = (
+ (alpha_t / alpha_s2) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1_1
+ )
elif self.config.solver_type == "heun":
- x_t = ((alpha_t / alpha_s2) * sample -
- (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * (
- (paddle.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * (
- (paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2)
+ x_t = (
+ (alpha_t / alpha_s2) * sample
+ - (sigma_t * (paddle.exp(h) - 1.0)) * D0
+ - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1
+ - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+ )
return x_t
def singlestep_dpm_solver_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor,
- order: int, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ order: int,
+ ) -> paddle.Tensor:
"""
One step for the singlestep DPM-Solver.
@@ -528,23 +536,25 @@ def singlestep_dpm_solver_update(
`paddle.Tensor`: the sample tensor at the previous timestep.
"""
if order == 1:
- return self.dpm_solver_first_order_update(
- model_output_list[-1], timestep_list[-1], prev_timestep, sample)
+ return self.dpm_solver_first_order_update(model_output_list[-1], timestep_list[-1], prev_timestep, sample)
elif order == 2:
return self.singlestep_dpm_solver_second_order_update(
- model_output_list, timestep_list, prev_timestep, sample)
+ model_output_list, timestep_list, prev_timestep, sample
+ )
elif order == 3:
return self.singlestep_dpm_solver_third_order_update(
- model_output_list, timestep_list, prev_timestep, sample)
+ model_output_list, timestep_list, prev_timestep, sample
+ )
else:
raise ValueError(f"Order must be 1, 2, 3, got {order}")
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the singlestep DPM-Solver.
@@ -570,8 +580,7 @@ def step(
step_index = len(self.timesteps) - 1
else:
step_index = step_index.item()
- prev_timestep = (0 if step_index == len(self.timesteps) - 1 else
- self.timesteps[step_index + 1])
+ prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
model_output = self.convert_model_output(model_output, timestep, sample)
for i in range(self.config.solver_order - 1):
@@ -583,20 +592,17 @@ def step(
if order == 1:
self.sample = sample
- timestep_list = [
- self.timesteps[step_index - i] for i in range(order - 1, 0, -1)
- ] + [timestep]
+ timestep_list = [self.timesteps[step_index - i] for i in range(order - 1, 0, -1)] + [timestep]
prev_sample = self.singlestep_dpm_solver_update(
- self.model_outputs, timestep_list, prev_timestep, self.sample,
- order)
+ self.model_outputs, timestep_list, prev_timestep, self.sample, order
+ )
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -610,26 +616,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args,
return sample
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self):
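The `_threshold_sample` hunk above is easy to misread after the reflow, so here is a small NumPy sketch of the dynamic thresholding it performs (an illustration only; the argument names are mine, not the scheduler's):

import numpy as np


def threshold_sample(sample, ratio=0.995, max_value=1.0):
    # Dynamic thresholding: per batch element, take the `ratio` quantile of the
    # absolute pixel values, clamp it to at least 1 (and at most `max_value`),
    # clip the sample to [-s, s], then divide by s so values stay in [-1, 1].
    batch_size = sample.shape[0]
    s = np.quantile(np.abs(sample).reshape(batch_size, -1), ratio, axis=1)
    if max_value < 1:
        # mirrors the branch the scheduler uses to avoid clip(min=1, max<1)
        s = np.full_like(s, max_value)
    else:
        s = np.clip(s, 1.0, max_value)
    s = s.reshape(batch_size, *([1] * (sample.ndim - 1)))
    return np.clip(sample, -s, s) / s


x0_pred = np.random.randn(2, 4, 8, 8) * 3.0   # deliberately out-of-range prediction
x0_clamped = threshold_sample(x0_pred)        # now bounded to [-1, 1]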
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py
index 82931a90d6eff..eccdbb7bfdcf4 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py
@@ -18,17 +18,14 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
def logaddexp(x, y):
- return paddle.log(1 + paddle.exp(
- paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y)
+ return paddle.log(1 + paddle.exp(paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y)
-def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor,
- yp: paddle.Tensor) -> paddle.Tensor:
+def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, yp: paddle.Tensor) -> paddle.Tensor:
"""Performs piecewise linear interpolation for x, using xp and yp keypoints (knots).
Performs separate interpolation for each channel.
Args:
@@ -45,8 +42,7 @@ def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor,
>>> calibrate1d(paddle.to_tensor([[-10]]), paddle.to_tensor([[0.0, 1.0]]), paddle.to_tensor([[0.0, 2.0]]))
tensor([[-20.0000]])
"""
- x_breakpoints = paddle.concat(
- [x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2)
+ x_breakpoints = paddle.concat([x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2)
num_x_points = xp.shape[1]
sorted_x_breakpoints = paddle.sort(x_breakpoints, axis=2)
x_indices = paddle.argsort(x_breakpoints, axis=2)
@@ -58,29 +54,26 @@ def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor,
paddle.where(
paddle.equal(x_idx, num_x_points),
paddle.to_tensor([num_x_points - 2]),
- cand_start_idx, ), )
- end_idx = paddle.where(
- paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
- start_x = paddle.take_along_axis(
- arr=sorted_x_breakpoints, axis=2,
- indices=start_idx.unsqueeze(axis=2)).squeeze(axis=2)
- end_x = paddle.take_along_axis(
- arr=sorted_x_breakpoints, axis=2,
- indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2)
+ cand_start_idx,
+ ),
+ )
+ end_idx = paddle.where(paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
+ start_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=start_idx.unsqueeze(axis=2)).squeeze(
+ axis=2
+ )
+ end_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2)
start_idx2 = paddle.where(
paddle.equal(x_idx, 0),
paddle.to_tensor([0]),
paddle.where(
paddle.equal(x_idx, num_x_points),
paddle.to_tensor([num_x_points - 2]),
- cand_start_idx, ), )
+ cand_start_idx,
+ ),
+ )
y_positions_expanded = yp.unsqueeze(0).expand([x.shape[0], -1, -1])
- start_y = paddle.take_along_axis(
- y_positions_expanded, axis=2,
- indices=start_idx2.unsqueeze(2)).squeeze(2)
- end_y = paddle.take_along_axis(
- y_positions_expanded, axis=2,
- indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
+ start_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=start_idx2.unsqueeze(2)).squeeze(2)
+ end_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
return cand
@@ -128,35 +121,38 @@ class DPMSolverUniDiffuserScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.00085,
- beta_end: float=0.0120,
- method="multistep",
- schedule: str="discrete",
- beta_schedule: str="scaled_linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon",
- algorithm_type: str="dpmsolver++",
- solver_type: str="midpoint", ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.00085,
+ beta_end: float = 0.0120,
+ method="multistep",
+ schedule: str = "discrete",
+ beta_schedule: str = "scaled_linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ algorithm_type: str = "dpmsolver++",
+ solver_type: str = "midpoint",
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
if beta_schedule == "scaled_linear":
# this schedule is very specific to the unidiffuser model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
if schedule == "discrete":
log_alphas = 0.5 * paddle.log(1 - self.betas).cumsum(axis=0)
self.total_N = len(log_alphas)
- self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0,
- self.total_N).reshape([1, -1])
+ self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0, self.total_N).reshape([1, -1])
self.log_alpha_discrete = log_alphas.reshape((1, -1))
else:
raise ValueError
@@ -172,16 +168,12 @@ def __init__(
if algorithm_type == "deis":
algorithm_type = "dpmsolver++"
else:
- raise NotImplementedError(
- f"{algorithm_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
if solver_type not in ["midpoint"]:
if solver_type in ["logrho", "bh1", "bh2"]:
solver_type = "midpoint"
else:
- raise NotImplementedError(
- f"{solver_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0
@@ -196,7 +188,8 @@ def marginal_log_mean_coeff(self, t):
return interpolate_fn(
t.reshape((-1, 1)),
self.t_discrete.clone(),
- self.log_alpha_discrete.clone(), ).reshape((-1, ))
+ self.log_alpha_discrete.clone(),
+ ).reshape((-1,))
else:
raise ValueError
@@ -207,8 +200,7 @@ def marginal_std(self, t):
"""
Compute sigma_t of a given continuous-time label t in [0, T].
"""
- return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff(
- t)))
+ return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff(t)))
def marginal_lambda(self, t):
"""
@@ -220,12 +212,13 @@ def marginal_lambda(self, t):
def inverse_lambda(self, lamb):
if self.schedule == "discrete":
- log_alpha = -0.5 * logaddexp(paddle.zeros((1, )), -2.0 * lamb)
+ log_alpha = -0.5 * logaddexp(paddle.zeros((1,)), -2.0 * lamb)
t = interpolate_fn(
log_alpha.reshape((-1, 1)),
paddle.flip(self.log_alpha_discrete.clone(), [1]),
- paddle.flip(self.t_discrete.clone(), [1]), )
- return t.reshape((-1, ))
+ paddle.flip(self.t_discrete.clone(), [1]),
+ )
+ return t.reshape((-1,))
else:
raise ValueError
@@ -243,10 +236,7 @@ def set_timesteps(self, num_inference_steps: int):
self.noise_prev_list = []
self.t_prev_list = []
- def convert_model_output(self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor) -> paddle.Tensor:
+ def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor:
"""
Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
@@ -267,17 +257,17 @@ def convert_model_output(self,
`paddle.Tensor`: the converted model output.
"""
# DPM-Solver++ needs to solve an integral of the data prediction model.
- alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std(
- timestep)
+ alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std(timestep)
x0_pred = (sample - sigma_t * model_output) / alpha_t
return x0_pred
def dpm_solver_first_order_update(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the first-order DPM-Solver (equivalent to DDIM).
@@ -293,27 +283,25 @@ def dpm_solver_first_order_update(
Returns:
`paddle.Tensor`: the sample tensor at the previous timestep.
"""
- lambda_t, lambda_s = self.marginal_lambda(
- timestep), self.marginal_lambda(prev_timestep)
+ lambda_t, lambda_s = self.marginal_lambda(timestep), self.marginal_lambda(prev_timestep)
alpha_t = self.marginal_log_mean_coeff(timestep)
- sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std(
- prev_timestep)
+ sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std(prev_timestep)
alpha_t = paddle.exp(alpha_t)
h = lambda_t - lambda_s
if self.config.algorithm_type == "dpmsolver++":
- x_t = (sigma_t / sigma_s) * sample - (alpha_t * (
- paddle.exp(-h) - 1.0)) * model_output
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output
else:
raise ValueError
return x_t
def multistep_dpm_solver_second_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the second-order multistep DPM-Solver.
@@ -333,7 +321,8 @@ def multistep_dpm_solver_second_order_update(
lambda_t, lambda_s0, lambda_s1 = (
self.marginal_lambda(t),
self.marginal_lambda(s0),
- self.marginal_lambda(s1), )
+ self.marginal_lambda(s1),
+ )
log_alpha_t = self.marginal_log_mean_coeff(t)
sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0)
h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
@@ -343,19 +332,22 @@ def multistep_dpm_solver_second_order_update(
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2211.01095 for detailed derivations
if self.config.solver_type == "midpoint":
- x_t = ((sigma_t / sigma_s0) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 *
- (alpha_t * (paddle.exp(-h) - 1.0)) * D1)
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1
+ )
else:
raise ValueError
return x_t
def multistep_dpm_solver_third_order_update(
- self,
- model_output_list: List[paddle.Tensor],
- timestep_list: List[int],
- prev_timestep: int,
- sample: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ model_output_list: List[paddle.Tensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ ) -> paddle.Tensor:
"""
One step for the third-order multistep DPM-Solver.
@@ -374,14 +366,15 @@ def multistep_dpm_solver_third_order_update(
prev_timestep,
timestep_list[-1],
timestep_list[-2],
- timestep_list[-3], )
- m0, m1, m2 = model_output_list[-1], model_output_list[
- -2], model_output_list[-3]
+ timestep_list[-3],
+ )
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
self.marginal_lambda(t),
self.marginal_lambda(s0),
self.marginal_lambda(s1),
- self.marginal_lambda(s2), )
+ self.marginal_lambda(s2),
+ )
alpha_t = self.marginal_log_mean_coeff(t)
alpha_t = paddle.exp(alpha_t)
sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0)
@@ -393,20 +386,23 @@ def multistep_dpm_solver_third_order_update(
D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
- x_t = ((sigma_t / sigma_s0) * sample -
- (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * (
- (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * (
- (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2)
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (paddle.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1
+ - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+ )
else:
raise ValueError
return x_t
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the multistep DPM-Solver.
@@ -437,59 +433,47 @@ def step(
if self.method == "multistep":
if step_index == 0:
vec_t = timestep.expand([sample.shape[0]])
- model_output = self.convert_model_output(model_output, vec_t,
- sample)
+ model_output = self.convert_model_output(model_output, vec_t, sample)
self.noise_prev_list.append(model_output)
self.t_prev_list.append(vec_t)
if step_index > 0 and step_index < order:
vec_t = timestep.expand([sample.shape[0]])
- sample = self.dpm_multistep_update(sample, self.noise_prev_list,
- self.t_prev_list, vec_t,
- step_index)
- model_output = self.convert_model_output(model_output, vec_t,
- sample)
+ sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, step_index)
+ model_output = self.convert_model_output(model_output, vec_t, sample)
self.noise_prev_list.append(model_output)
self.t_prev_list.append(vec_t)
if step_index >= order and step_index < len(self.timesteps):
vec_t = timestep.expand([sample.shape[0]])
- sample = self.dpm_multistep_update(sample, self.noise_prev_list,
- self.t_prev_list, vec_t,
- order)
+ sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, order)
for i in range(order - 1):
self.t_prev_list[i] = self.t_prev_list[i + 1]
self.noise_prev_list[i] = self.noise_prev_list[i + 1]
self.t_prev_list[-1] = vec_t
if step_index < len(self.timesteps) - 1:
- self.noise_prev_list[-1] = self.convert_model_output(
- model_output, vec_t, sample)
+ self.noise_prev_list[-1] = self.convert_model_output(model_output, vec_t, sample)
else:
raise ValueError
prev_sample = sample
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def dpm_multistep_update(self, x, noise_prev_list, t_prev_list, t, order):
if order == 1:
- return self.dpm_solver_first_order_update(noise_prev_list[-1], t,
- t_prev_list[-1], x)
+ return self.dpm_solver_first_order_update(noise_prev_list[-1], t, t_prev_list[-1], x)
elif order == 2:
- return self.multistep_dpm_solver_second_order_update(
- noise_prev_list, t_prev_list, t, x)
+ return self.multistep_dpm_solver_second_order_update(noise_prev_list, t_prev_list, t, x)
elif order == 3:
- return self.multistep_dpm_solver_third_order_update(
- noise_prev_list, t_prev_list, t, x)
+ return self.multistep_dpm_solver_third_order_update(noise_prev_list, t_prev_list, t, x)
else:
- raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(
- order))
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
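The UniDiffuser scheduler above works in continuous time, so its coefficients come from interpolated log-alpha values rather than indexed tables. Here is a minimal NumPy sketch of the helper math (the stable logaddexp and the marginal quantities), checked against the formulas in the hunks; the function names are mine:

import numpy as np


def logaddexp(x, y):
    # Numerically stable log(exp(x) + exp(y)): factor out the larger argument so
    # the remaining exponential cannot overflow.
    m = np.maximum(x, y)
    return m + np.log(1.0 + np.exp(np.minimum(x, y) - m))


def marginal_std(log_alpha):
    # sigma_t = sqrt(1 - alpha_t**2), with alpha_t = exp(log_alpha)
    return np.sqrt(1.0 - np.exp(2.0 * log_alpha))


def marginal_lambda(log_alpha):
    # lambda_t = log(alpha_t) - log(sigma_t), the half log-SNR the solver steps in
    return log_alpha - 0.5 * np.log(1.0 - np.exp(2.0 * log_alpha))


def inverse_lambda(lam):
    # Invert lambda back to log(alpha_t): log_alpha = -0.5 * logaddexp(0, -2*lambda)
    return -0.5 * logaddexp(np.zeros_like(lam), -2.0 * lam)


log_alpha = np.array(np.log(0.8))                       # toy alpha_t = 0.8
assert np.allclose(inverse_lambda(marginal_lambda(log_alpha)), log_alpha)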
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py
index a2a0a495031de..95332c844b137 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -47,8 +47,7 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -108,37 +107,39 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon", ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -147,15 +148,11 @@ def __init__(
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
self.is_scale_input_called = False
- def scale_model_input(
- self, sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor]) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor:
"""
Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
@@ -168,7 +165,7 @@ def scale_model_input(
"""
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
self.is_scale_input_called = True
return sample
@@ -182,27 +179,21 @@ def set_timesteps(self, num_inference_steps: int):
"""
self.num_inference_steps = num_inference_steps
- timesteps = np.linspace(
- 0,
- self.config.num_train_timesteps - 1,
- num_inference_steps,
- dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
def step(
- self,
- model_output: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- sample: paddle.Tensor,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[
- EulerAncestralDiscreteSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -224,7 +215,8 @@ def step(
if not self.is_scale_input_called:
logger.warning(
"The `scale_model_input` function should be called before `step` to ensure correct denoising. "
- "See `StableDiffusionPipeline` for a usage example.")
+ "See `StableDiffusionPipeline` for a usage example."
+ )
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
@@ -233,11 +225,9 @@ def step(
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
- pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5
- ) + (sample / (sigma**2 + 1))
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
elif self.config.prediction_type == "sample":
- raise NotImplementedError(
- "prediction_type not implemented yet: sample")
+ raise NotImplementedError("prediction_type not implemented yet: sample")
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
@@ -245,9 +235,8 @@ def step(
sigma_from = self.sigmas[step_index]
sigma_to = self.sigmas[step_index + 1]
- sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from
- **2)**0.5
- sigma_down = (sigma_to**2 - sigma_up**2)**0.5
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
# 2. Convert to an ODE derivative
derivative = (sample - pred_original_sample) / sigma
@@ -256,28 +245,28 @@ def step(
prev_sample = sample + derivative * dt
- noise = randn_tensor(
- model_output.shape, dtype=model_output.dtype, generator=generator)
+ noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator)
prev_sample = prev_sample + noise * sigma_up
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return EulerAncestralDiscreteSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
+ )
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [(schedule_timesteps == t).nonzero().item()
- for t in timesteps]
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
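The ancestral step reformatted above splits each sigma transition into a deterministic part and a re-injected stochastic part. A compact NumPy sketch of that step (illustration only; the dt definition is assumed from the k-diffusion formulation, since it falls outside the hunks shown):

import numpy as np

rng = np.random.default_rng(0)


def euler_ancestral_step(sample, pred_x0, sigma_from, sigma_to):
    # Split sigma_to into sigma_down (deterministic) and sigma_up (fresh noise),
    # take an Euler step along the ODE derivative, then add the new noise.
    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
    derivative = (sample - pred_x0) / sigma_from     # dx/dsigma of the probability-flow ODE
    dt = sigma_down - sigma_from                     # assumed: step towards sigma_down
    prev_sample = sample + derivative * dt
    return prev_sample + rng.standard_normal(sample.shape) * sigma_up


x = rng.standard_normal((4, 4)) * 14.6               # latent at sigma_from
x_prev = euler_ancestral_step(x, np.zeros_like(x), sigma_from=14.6, sigma_to=10.0)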
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py
index 8d53b8dd4f3a9..a45e3bf0e5617 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py
@@ -66,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -114,39 +114,41 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon",
- interpolation_type: str="linear",
- use_karras_sigmas: Optional[bool]=False, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ interpolation_type: str = "linear",
+ use_karras_sigmas: Optional[bool] = False,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -155,16 +157,12 @@ def __init__(
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
self.is_scale_input_called = False
self.use_karras_sigmas = use_karras_sigmas
- def scale_model_input(
- self, sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor]) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor:
"""
Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
@@ -178,7 +176,7 @@ def scale_model_input(
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
self.is_scale_input_called = True
return sample
@@ -193,31 +191,23 @@ def set_timesteps(self, num_inference_steps: int):
"""
self.num_inference_steps = num_inference_steps
- timesteps = np.linspace(
- 0,
- self.config.num_train_timesteps - 1,
- num_inference_steps,
- dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
log_sigmas = np.log(sigmas)
if self.config.interpolation_type == "linear":
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
elif self.config.interpolation_type == "log_linear":
- sigmas = paddle.linspace(
- np.log(sigmas[-1]), np.log(sigmas[0]),
- num_inference_steps + 1).exp()
+ sigmas = paddle.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp()
else:
raise ValueError(
f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either"
- " 'linear' or 'log_linear'")
+ " 'linear' or 'log_linear'"
+ )
if self.use_karras_sigmas:
- sigmas = self._convert_to_karras(
- in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
- timesteps = np.array(
- [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+ sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -231,9 +221,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
dists = log_sigma - log_sigmas[:, np.newaxis]
# get sigmas range
- low_idx = (np.cumsum(
- (dists >= 0), axis=0).argmax(axis=0)
- .clip(max=log_sigmas.shape[0] - 2))
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
high_idx = low_idx + 1
low = log_sigmas[low_idx]
@@ -248,8 +236,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
t = t.reshape(sigma.shape)
return t
- def _convert_to_karras(self, in_sigmas: paddle.Tensor,
- num_inference_steps) -> paddle.Tensor:
+ def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
sigma_min = in_sigmas[-1].item()
@@ -257,24 +244,23 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor,
rho = 7.0 # 7.0 is the value used in the paper
ramp = np.linspace(0, 1, num_inference_steps)
- min_inv_rho = sigma_min**(1 / rho)
- max_inv_rho = sigma_max**(1 / rho)
- sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
return sigmas
def step(
- self,
- model_output: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- sample: paddle.Tensor,
- s_churn: float=0.0,
- s_tmin: float=0.0,
- s_tmax: float=float("inf"),
- s_noise: float=1.0,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[EulerDiscreteSchedulerOutput,
- Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ s_churn: float = 0.0,
+ s_tmin: float = 0.0,
+ s_tmax: float = float("inf"),
+ s_noise: float = 1.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[EulerDiscreteSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -301,35 +287,32 @@ def step(
if not self.is_scale_input_called:
logger.warning(
"The `scale_model_input` function should be called before `step` to ensure correct denoising. "
- "See `StableDiffusionPipeline` for a usage example.")
+ "See `StableDiffusionPipeline` for a usage example."
+ )
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
- gamma = (min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1)
- if s_tmin <= sigma <= s_tmax else 0.0)
+ gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
- noise = randn_tensor(
- model_output.shape, dtype=model_output.dtype, generator=generator)
+ noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator)
eps = noise * s_noise
sigma_hat = sigma * (gamma + 1)
if gamma > 0:
- sample = sample + eps * (sigma_hat**2 - sigma**2)**0.5
+ sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
# NOTE: "original_sample" should not be an expected prediction_type but is left in for
# backwards compatibility
- if (self.config.prediction_type == "original_sample" or
- self.config.prediction_type == "sample"):
+ if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample":
pred_original_sample = model_output
elif self.config.prediction_type == "epsilon":
pred_original_sample = sample - sigma_hat * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
- pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5
- ) + (sample / (sigma**2 + 1))
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
@@ -343,22 +326,21 @@ def step(
prev_sample = sample + derivative * dt
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
- return EulerDiscreteSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [(schedule_timesteps == t).nonzero().item()
- for t in timesteps]
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
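The Euler-discrete changes above are formatting-only, but the update rule they touch is compact enough to restate before moving to the next file. A minimal sketch of one Euler step for the default `epsilon` prediction type, with churn disabled (gamma = 0); `euler_step` is a hypothetical helper, not the ppdiffusers API, and `model_output` is assumed to be the UNet's noise prediction for `sample` at noise level `sigmas[i]`:

def euler_step(sample, model_output, sigmas, i):
    # Minimal Euler update, no churn (gamma = 0), epsilon prediction.
    sigma = sigmas[i]
    pred_original_sample = sample - sigma * model_output  # x_0 estimate
    derivative = (sample - pred_original_sample) / sigma  # dx/dsigma
    dt = sigmas[i + 1] - sigma                            # step toward the next, smaller sigma
    return sample + derivative * dt

With gamma = 0 this reduces to `sample + model_output * dt`, which is exactly what the reformatted `step` computes when `s_churn` is left at its default of 0.0.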
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py
index 4cd27a38164ff..05a8673a2a358 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py
@@ -20,13 +20,11 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -45,7 +43,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -90,32 +88,35 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.00085, # sensible defaults
- beta_end: float=0.012,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon",
- use_karras_sigmas: Optional[bool]=False, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.00085, # sensible defaults
+ beta_end: float = 0.012,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ use_karras_sigmas: Optional[bool] = False,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -137,9 +138,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
return indices[pos].item()
def scale_model_input(
- self,
- sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ ) -> paddle.Tensor:
"""
Args:
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
@@ -151,13 +153,14 @@ def scale_model_input(
step_index = self.index_for_timestep(timestep)
sigma = self.sigmas[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
return sample
def set_timesteps(
- self,
- num_inference_steps: int,
- num_train_timesteps: Optional[int]=None, ):
+ self,
+ num_inference_steps: int,
+ num_train_timesteps: Optional[int] = None,
+ ):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -169,32 +172,25 @@ def set_timesteps(
num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_inference_steps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
log_sigmas = np.log(sigmas)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
if self.use_karras_sigmas:
- sigmas = self._convert_to_karras(
- in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
- timesteps = np.array(
- [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+ sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
sigmas = paddle.to_tensor(sigmas)
- self.sigmas = paddle.concat(
- [sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]])
+ self.sigmas = paddle.concat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]])
# standard deviation of the initial noise distribution
self.init_noise_sigma = self.sigmas.max()
timesteps = paddle.to_tensor(timesteps)
- timesteps = paddle.concat(
- [timesteps[:1], timesteps[1:].repeat_interleave(2)])
+ timesteps = paddle.concat([timesteps[:1], timesteps[1:].repeat_interleave(2)])
self.timesteps = timesteps.cast(paddle.float32)
@@ -210,9 +206,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
dists = log_sigma - log_sigmas[:, np.newaxis]
# get sigmas range
- low_idx = (np.cumsum(
- (dists >= 0), axis=0).argmax(axis=0)
- .clip(max=log_sigmas.shape[0] - 2))
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
high_idx = low_idx + 1
low = log_sigmas[low_idx]
@@ -227,8 +221,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
t = t.reshape(sigma.shape)
return t
- def _convert_to_karras(self, in_sigmas: paddle.Tensor,
- num_inference_steps) -> paddle.Tensor:
+ def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
sigma_min = in_sigmas[-1].item()
@@ -236,9 +229,9 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor,
rho = 7.0 # 7.0 is the value used in the paper
ramp = np.linspace(0, 1, num_inference_steps)
- min_inv_rho = sigma_min**(1 / rho)
- max_inv_rho = sigma_max**(1 / rho)
- sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
return sigmas
@property
@@ -246,11 +239,12 @@ def state_in_first_order(self):
return self.dt is None
def step(
- self,
- model_output: Union[paddle.Tensor, np.ndarray],
- timestep: Union[float, paddle.Tensor],
- sample: Union[paddle.Tensor, np.ndarray],
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: Union[paddle.Tensor, np.ndarray],
+ timestep: Union[float, paddle.Tensor],
+ sample: Union[paddle.Tensor, np.ndarray],
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Args:
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -286,11 +280,11 @@ def step(
pred_original_sample = sample - sigma_input * model_output
elif self.config.prediction_type == "v_prediction":
sigma_input = sigma_hat if self.state_in_first_order else sigma_next
- pred_original_sample = model_output * (-sigma_input / (
- sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1))
+ pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+ sample / (sigma_input**2 + 1)
+ )
elif self.config.prediction_type == "sample":
- raise NotImplementedError(
- "prediction_type not implemented yet: sample")
+ raise NotImplementedError("prediction_type not implemented yet: sample")
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
@@ -324,22 +318,21 @@ def step(
prev_sample = sample + derivative * dt
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [
- self.index_for_timestep(t, schedule_timesteps) for t in timesteps
- ]
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
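`_convert_to_karras`, reformatted in both the Euler and Heun schedulers above, is the rho-spaced noise schedule from Karras et al. (2022). A standalone sketch of the same formula, assuming `sigma_min` and `sigma_max` are plain floats rather than tensor elements:

import numpy as np

def karras_sigmas(sigma_min, sigma_max, num_inference_steps, rho=7.0):
    # Space the schedule linearly in sigma**(1/rho), then raise back to the rho power.
    ramp = np.linspace(0, 1, num_inference_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

rho = 7.0 matches the value hard-coded in the diff and in the original paper.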
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py
index 4d2b87c82ae86..8b8595755cb61 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py
@@ -43,9 +43,10 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None, ):
+ self,
+ num_train_timesteps: int = 1000,
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ ):
# set `betas`, `alphas`, `timesteps`
self.set_timesteps(num_train_timesteps)
@@ -73,24 +74,23 @@ def set_timesteps(self, num_inference_steps: int):
steps = paddle.concat([steps, paddle.to_tensor([0.0])])
if self.config.trained_betas is not None:
- self.betas = paddle.to_tensor(
- self.config.trained_betas, dtype=paddle.float32)
+ self.betas = paddle.to_tensor(self.config.trained_betas, dtype=paddle.float32)
else:
- self.betas = paddle.sin(steps * math.pi / 2)**2
+ self.betas = paddle.sin(steps * math.pi / 2) ** 2
- self.alphas = (1.0 - self.betas**2)**0.5
+ self.alphas = (1.0 - self.betas**2) ** 0.5
- self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi *
- 2)[:-1]
+ self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi * 2)[:-1]
self.ets = []
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple
times to approximate the solution.
@@ -119,8 +119,7 @@ def step(
timestep_index = (self.timesteps == timestep).nonzero().item()
prev_timestep_index = timestep_index + 1
- ets = (sample * self.betas[timestep_index] + model_output *
- self.alphas[timestep_index])
+ ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index]
self.ets.append(ets)
if len(self.ets) == 1:
@@ -128,22 +127,18 @@ def step(
elif len(self.ets) == 2:
ets = (3 * self.ets[-1] - self.ets[-2]) / 2
elif len(self.ets) == 3:
- ets = (
- 23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+ ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
else:
- ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 *
- self.ets[-3] - 9 * self.ets[-4])
+ ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
- prev_sample = self._get_prev_sample(sample, timestep_index,
- prev_timestep_index, ets)
+ prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets)
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -156,8 +151,7 @@ def scale_model_input(self, sample: paddle.Tensor, *args,
"""
return sample
- def _get_prev_sample(self, sample, timestep_index, prev_timestep_index,
- ets):
+ def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets):
alpha = self.alphas[timestep_index]
sigma = self.betas[timestep_index]
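The iPNDM `step` above keeps a short history `self.ets` and blends it with Adams–Bashforth-style weights before calling `_get_prev_sample`. A sketch of just that blending rule in plain Python (`combine_ets` is an illustrative helper, not part of the scheduler):

def combine_ets(ets):
    # Linear multistep weights used by the iPNDM step, chosen by history length.
    if len(ets) == 1:
        return ets[-1]
    if len(ets) == 2:
        return (3 * ets[-1] - ets[-2]) / 2
    if len(ets) == 3:
        return (23 * ets[-1] - 16 * ets[-2] + 5 * ets[-3]) / 12
    return (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4]) / 24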
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
index 937c161348c12..9857a57444941 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
@@ -21,13 +21,11 @@
from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import randn_tensor
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -46,7 +44,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -88,31 +86,34 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.00085, # sensible defaults
- beta_end: float=0.012,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon", ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.00085, # sensible defaults
+ beta_end: float = 0.012,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -133,9 +134,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
return indices[pos].item()
def scale_model_input(
- self,
- sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ ) -> paddle.Tensor:
"""
Args:
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
@@ -151,13 +153,14 @@ def scale_model_input(
else:
sigma = self.sigmas_interpol[step_index - 1]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
return sample
def set_timesteps(
- self,
- num_inference_steps: int,
- num_train_timesteps: Optional[int]=None, ):
+ self,
+ num_inference_steps: int,
+ num_train_timesteps: Optional[int] = None,
+ ):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -169,12 +172,9 @@ def set_timesteps(
num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_inference_steps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
@@ -184,9 +184,8 @@ def set_timesteps(
# compute up and down sigmas
sigmas_next = sigmas.roll(-1)
sigmas_next[-1] = 0.0
- sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas
- **2)**0.5
- sigmas_down = (sigmas_next**2 - sigmas_up**2)**0.5
+ sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5
+ sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5
sigmas_down[-1] = 0.0
# compute interpolated sigmas
@@ -194,20 +193,16 @@ def set_timesteps(
sigmas_interpol[-2:] = 0.0
# set sigmas
- self.sigmas = paddle.concat(
- [sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
- self.sigmas_interpol = paddle.concat([
- sigmas_interpol[:1],
- sigmas_interpol[1:].repeat_interleave(2),
- sigmas_interpol[-1:],
- ])
- self.sigmas_up = paddle.concat([
- sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]
- ])
- self.sigmas_down = paddle.concat([
- sigmas_down[:1], sigmas_down[1:].repeat_interleave(2),
- sigmas_down[-1:]
- ])
+ self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
+ self.sigmas_interpol = paddle.concat(
+ [
+ sigmas_interpol[:1],
+ sigmas_interpol[1:].repeat_interleave(2),
+ sigmas_interpol[-1:],
+ ]
+ )
+ self.sigmas_up = paddle.concat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]])
+ self.sigmas_down = paddle.concat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]])
# standard deviation of the initial noise distribution
self.init_noise_sigma = self.sigmas.max()
@@ -215,12 +210,9 @@ def set_timesteps(
timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
timesteps_interpol = self.sigma_to_t(sigmas_interpol)
- timesteps_interpol = paddle.cast(
- timesteps_interpol, dtype=timesteps.dtype)
+ timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype)
- interleaved_timesteps = paddle.stack(
- (timesteps_interpol[:-2, None], timesteps[1:, None]),
- axis=-1).flatten()
+ interleaved_timesteps = paddle.stack((timesteps_interpol[:-2, None], timesteps[1:, None]), axis=-1).flatten()
self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps])
@@ -234,8 +226,7 @@ def sigma_to_t(self, sigma):
dists = log_sigma - self.log_sigmas[:, None]
# get sigmas range
- low_idx = ((dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0)
- .clip(max=self.log_sigmas.shape[0] - 2))
+ low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2)
high_idx = low_idx + 1
low = self.log_sigmas[low_idx]
@@ -255,13 +246,13 @@ def state_in_first_order(self):
return self.sample is None
def step(
- self,
- model_output: Union[paddle.Tensor, np.ndarray],
- timestep: Union[float, paddle.Tensor],
- sample: Union[paddle.Tensor, np.ndarray],
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: Union[paddle.Tensor, np.ndarray],
+ timestep: Union[float, paddle.Tensor],
+ sample: Union[paddle.Tensor, np.ndarray],
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Args:
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -295,8 +286,7 @@ def step(
gamma = 0
sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now
- noise = randn_tensor(
- model_output.shape, dtype=model_output.dtype, generator=generator)
+ noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator)
# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
if self.config.prediction_type == "epsilon":
@@ -304,11 +294,11 @@ def step(
pred_original_sample = sample - sigma_input * model_output
elif self.config.prediction_type == "v_prediction":
sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol
- pred_original_sample = model_output * (-sigma_input / (
- sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1))
+ pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+ sample / (sigma_input**2 + 1)
+ )
elif self.config.prediction_type == "sample":
- raise NotImplementedError(
- "prediction_type not implemented yet: sample")
+ raise NotImplementedError("prediction_type not implemented yet: sample")
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
@@ -338,22 +328,21 @@ def step(
prev_sample = prev_sample + noise * sigma_up
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [
- self.index_for_timestep(t, schedule_timesteps) for t in timesteps
- ]
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
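The ancestral KDPM2 variant splits each sigma transition into a deterministic part (`sigma_down`) and a freshly injected noise part (`sigma_up`), as in the `set_timesteps` hunk above. A NumPy sketch of that split, assuming `sigmas` is a decreasing 1-D array of positive values (the scheduler additionally zeroes the final `sigmas_down` entry):

import numpy as np

def ancestral_split(sigmas):
    # Sigma targeted by the next step (0 at the end of the chain).
    sigmas_next = np.append(sigmas[1:], 0.0)
    # Variance to be re-added as fresh noise after the deterministic move...
    sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5
    # ...and the remaining deterministic step size.
    sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5
    return sigmas_up, sigmas_down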
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py
index b6df7c60c3000..87790b6ece926 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py
@@ -20,13 +20,11 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps,
- max_beta=0.999) -> paddle.Tensor:
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -45,7 +43,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps,
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -87,31 +85,34 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.00085, # sensible defaults
- beta_end: float=0.012,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon", ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.00085, # sensible defaults
+ beta_end: float = 0.012,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -132,9 +133,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
return indices[pos].item()
def scale_model_input(
- self,
- sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor:
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ ) -> paddle.Tensor:
"""
Args:
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
@@ -150,13 +152,14 @@ def scale_model_input(
else:
sigma = self.sigmas_interpol[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
return sample
def set_timesteps(
- self,
- num_inference_steps: int,
- num_train_timesteps: Optional[int]=None, ):
+ self,
+ num_inference_steps: int,
+ num_train_timesteps: Optional[int] = None,
+ ):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -168,12 +171,9 @@ def set_timesteps(
num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_inference_steps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
@@ -185,13 +185,14 @@ def set_timesteps(
# must set to 0.0
sigmas_interpol[-1] = 0.0
- self.sigmas = paddle.concat(
- [sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
- self.sigmas_interpol = paddle.concat([
- sigmas_interpol[:1],
- sigmas_interpol[1:].repeat_interleave(2),
- sigmas_interpol[-1:],
- ])
+ self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
+ self.sigmas_interpol = paddle.concat(
+ [
+ sigmas_interpol[:1],
+ sigmas_interpol[1:].repeat_interleave(2),
+ sigmas_interpol[-1:],
+ ]
+ )
# standard deviation of the initial noise distribution
self.init_noise_sigma = self.sigmas.max()
@@ -199,11 +200,8 @@ def set_timesteps(
timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
# interpolate timesteps
timesteps_interpol = self.sigma_to_t(sigmas_interpol)
- timesteps_interpol = paddle.cast(
- timesteps_interpol, dtype=timesteps.dtype)
- interleaved_timesteps = paddle.stack(
- (timesteps_interpol[1:-1, None], timesteps[1:, None]),
- axis=-1).flatten()
+ timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype)
+ interleaved_timesteps = paddle.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), axis=-1).flatten()
self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps])
@@ -217,8 +215,7 @@ def sigma_to_t(self, sigma):
dists = log_sigma - self.log_sigmas[:, None]
# get sigmas range
- low_idx = ((dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0)
- .clip(max=self.log_sigmas.shape[0] - 2))
+ low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2)
high_idx = low_idx + 1
low = self.log_sigmas[low_idx]
@@ -238,11 +235,12 @@ def state_in_first_order(self):
return self.sample is None
def step(
- self,
- model_output: Union[paddle.Tensor, np.ndarray],
- timestep: Union[float, paddle.Tensor],
- sample: Union[paddle.Tensor, np.ndarray],
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: Union[paddle.Tensor, np.ndarray],
+ timestep: Union[float, paddle.Tensor],
+ sample: Union[paddle.Tensor, np.ndarray],
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Args:
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -280,11 +278,11 @@ def step(
pred_original_sample = sample - sigma_input * model_output
elif self.config.prediction_type == "v_prediction":
sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol
- pred_original_sample = model_output * (-sigma_input / (
- sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1))
+ pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+ sample / (sigma_input**2 + 1)
+ )
elif self.config.prediction_type == "sample":
- raise NotImplementedError(
- "prediction_type not implemented yet: sample")
+ raise NotImplementedError("prediction_type not implemented yet: sample")
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
@@ -312,22 +310,21 @@ def step(
prev_sample = sample + derivative * dt
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [
- self.index_for_timestep(t, schedule_timesteps) for t in timesteps
- ]
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
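`sigma_to_t`, reformatted in both KDPM2 schedulers, maps a sigma back to a (possibly fractional) training timestep by interpolating in log-sigma space. The vectorized tensor code is dense; the following scalar NumPy sketch is equivalent in spirit but not the exact implementation, and assumes `log_sigmas` increases with the timestep index:

import numpy as np

def sigma_to_t(sigma, log_sigmas):
    log_sigma = np.log(sigma)
    # Last training index whose log-sigma does not exceed log(sigma).
    low_idx = int(np.clip(np.sum(log_sigmas <= log_sigma) - 1, 0, len(log_sigmas) - 2))
    high_idx = low_idx + 1
    low, high = log_sigmas[low_idx], log_sigmas[high_idx]
    # Interpolation weight between the two surrounding indices.
    w = np.clip((low - log_sigma) / (low - high), 0, 1)
    return (1 - w) * low_idx + w * high_idx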
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py
index ba4bf176efd6c..f104b1a69a8d9 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py
@@ -81,13 +81,14 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- sigma_min: float=0.02,
- sigma_max: float=100,
- s_noise: float=1.007,
- s_churn: float=80,
- s_min: float=0.05,
- s_max: float=50, ):
+ self,
+ sigma_min: float = 0.02,
+ sigma_max: float = 100,
+ s_noise: float = 1.007,
+ s_churn: float = 80,
+ s_min: float = 0.05,
+ s_max: float = 50,
+ ):
# standard deviation of the initial noise distribution
self.init_noise_sigma = sigma_max
@@ -96,9 +97,7 @@ def __init__(
self.timesteps: paddle.Tensor = None
self.schedule: paddle.Tensor = None # sigma(t_i)
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -124,17 +123,21 @@ def set_timesteps(self, num_inference_steps: int):
self.num_inference_steps = num_inference_steps
timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps)
- schedule = [(self.config.sigma_max
- **2 * (self.config.sigma_min**2 / self.config.sigma_max**2)
- **(i / (num_inference_steps - 1))) for i in self.timesteps]
+ schedule = [
+ (
+ self.config.sigma_max**2
+ * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1))
+ )
+ for i in self.timesteps
+ ]
self.schedule = paddle.to_tensor(schedule, dtype=paddle.float32)
def add_noise_to_input(
- self,
- sample: paddle.Tensor,
- sigma: float,
- generator: Optional[paddle.Generator]=None, ) -> Tuple[
- paddle.Tensor, float]:
+ self,
+ sample: paddle.Tensor,
+ sigma: float,
+ generator: Optional[paddle.Generator] = None,
+ ) -> Tuple[paddle.Tensor, float]:
"""
Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a
higher noise level sigma_hat = sigma_i + gamma_i*sigma_i.
@@ -142,26 +145,25 @@ def add_noise_to_input(
TODO Args:
"""
if self.config.s_min <= sigma <= self.config.s_max:
- gamma = min(self.config.s_churn / self.num_inference_steps,
- 2**0.5 - 1)
+ gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1)
else:
gamma = 0
# sample eps ~ N(0, S_noise^2 * I)
- eps = self.config.s_noise * randn_tensor(
- sample.shape, generator=generator)
+ eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator)
sigma_hat = sigma + gamma * sigma
- sample_hat = sample + ((sigma_hat**2 - sigma**2)**0.5 * eps)
+ sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
return sample_hat, sigma_hat
def step(
- self,
- model_output: paddle.Tensor,
- sigma_hat: float,
- sigma_prev: float,
- sample_hat: paddle.Tensor,
- return_dict: bool=True, ) -> Union[KarrasVeOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[KarrasVeOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -191,17 +193,19 @@ def step(
return KarrasVeOutput(
prev_sample=sample_prev,
derivative=derivative,
- pred_original_sample=pred_original_sample, )
+ pred_original_sample=pred_original_sample,
+ )
def step_correct(
- self,
- model_output: paddle.Tensor,
- sigma_hat: float,
- sigma_prev: float,
- sample_hat: paddle.Tensor,
- sample_prev: paddle.Tensor,
- derivative: paddle.Tensor,
- return_dict: bool=True, ) -> Union[KarrasVeOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: paddle.Tensor,
+ sample_prev: paddle.Tensor,
+ derivative: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[KarrasVeOutput, Tuple]:
"""
Correct the predicted sample based on the output model_output of the network. TODO complete description
@@ -220,8 +224,7 @@ def step_correct(
"""
pred_original_sample = sample_prev + sigma_prev * model_output
derivative_corr = (sample_prev - pred_original_sample) / sigma_prev
- sample_prev = sample_hat + (sigma_prev - sigma_hat) * (
- 0.5 * derivative + 0.5 * derivative_corr)
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr)
if not return_dict:
return (sample_prev, derivative)
@@ -229,7 +232,8 @@ def step_correct(
return KarrasVeOutput(
prev_sample=sample_prev,
derivative=derivative,
- pred_original_sample=pred_original_sample, )
+ pred_original_sample=pred_original_sample,
+ )
def add_noise(self, original_samples, noise, timesteps):
raise NotImplementedError()
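`add_noise_to_input` above is the explicit "churn" step of the Karras VE sampler: raise the noise level by a factor gamma and add exactly enough fresh noise to reach it. A NumPy sketch under those formulas (`churn` is a hypothetical helper; the scheduler only applies it when `s_min <= sigma <= s_max`, otherwise gamma is 0):

import numpy as np

def churn(sample, sigma, s_churn, s_noise, num_inference_steps, rng):
    gamma = min(s_churn / num_inference_steps, 2**0.5 - 1)
    sigma_hat = sigma + gamma * sigma
    eps = s_noise * rng.standard_normal(sample.shape)
    # Add exactly the variance needed to move from sigma to sigma_hat.
    sample_hat = sample + (sigma_hat**2 - sigma**2) ** 0.5 * eps
    return sample_hat, sigma_hat

For example, `churn(x, 10.0, 80.0, 1.007, 50, np.random.default_rng(0))` reproduces the scheduler defaults of s_churn = 80 and s_noise = 1.007.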
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py
index 872f3891e0cf4..122b5e8dffa7d 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py
@@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -106,37 +106,39 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- prediction_type: str="epsilon", ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -145,16 +147,12 @@ def __init__(
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=float)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
self.derivatives = []
self.is_scale_input_called = False
- def scale_model_input(
- self, sample: paddle.Tensor,
- timestep: Union[float, paddle.Tensor]) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor:
"""
Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm.
@@ -167,7 +165,7 @@ def scale_model_input(
"""
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
- sample = sample / ((sigma**2 + 1)**0.5)
+ sample = sample / ((sigma**2 + 1) ** 0.5)
self.is_scale_input_called = True
return sample
@@ -186,12 +184,10 @@ def lms_derivative(tau):
for k in range(order):
if current_order == k:
continue
- prod *= (tau - self.sigmas[t - k]) / (
- self.sigmas[t - current_order] - self.sigmas[t - k])
+ prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
return prod
- integrated_coeff = integrate.quad(
- lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+ integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
return integrated_coeff
@@ -205,13 +201,8 @@ def set_timesteps(self, num_inference_steps: int):
"""
self.num_inference_steps = num_inference_steps
- timesteps = np.linspace(
- 0,
- self.config.num_train_timesteps - 1,
- num_inference_steps,
- dtype=float)[::-1].copy()
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
- 0.5)
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
self.sigmas = paddle.to_tensor(sigmas)
@@ -220,13 +211,13 @@ def set_timesteps(self, num_inference_steps: int):
self.derivatives = []
def step(
- self,
- model_output: paddle.Tensor,
- timestep: Union[float, paddle.Tensor],
- sample: paddle.Tensor,
- order: int=4,
- return_dict: bool=True, ) -> Union[LMSDiscreteSchedulerOutput,
- Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ order: int = 4,
+ return_dict: bool = True,
+ ) -> Union[LMSDiscreteSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -248,7 +239,8 @@ def step(
if not self.is_scale_input_called:
warnings.warn(
"The `scale_model_input` function should be called before `step` to ensure correct denoising. "
- "See `StableDiffusionPipeline` for a usage example.")
+ "See `StableDiffusionPipeline` for a usage example."
+ )
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
@@ -258,8 +250,7 @@ def step(
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
- pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5
- ) + (sample / (sigma**2 + 1))
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
else:
@@ -275,33 +266,29 @@ def step(
# 3. Compute linear multistep coefficients
order = min(step_index + 1, order)
- lms_coeffs = [
- self.get_lms_coefficient(order, step_index, curr_order)
- for curr_order in range(order)
- ]
+ lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)]
# 4. Compute previous sample based on the derivatives path
- prev_sample = sample + sum(coeff * derivative
- for coeff, derivative in zip(
- lms_coeffs, reversed(self.derivatives)))
+ prev_sample = sample + sum(
+ coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives))
+ )
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
- return LMSDiscreteSchedulerOutput(
- prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+ return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.sigmas.cast(original_samples.dtype)
schedule_timesteps = self.timesteps
- step_indices = [(schedule_timesteps == t).nonzero().item()
- for t in timesteps]
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
sigma = sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
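`get_lms_coefficient`, whose body is reformatted above, integrates a Lagrange basis polynomial over one sigma interval to obtain the k-LMS weights. The same computation spelled out as a standalone sketch with SciPy, assuming `sigmas` is an indexable 1-D array:

from scipy import integrate

def lms_coefficient(sigmas, order, t, current_order):
    # Lagrange basis polynomial for the `current_order` history entry,
    # integrated over the interval [sigmas[t], sigmas[t + 1]].
    def lms_derivative(tau):
        prod = 1.0
        for k in range(order):
            if current_order == k:
                continue
            prod *= (tau - sigmas[t - k]) / (sigmas[t - current_order] - sigmas[t - k])
        return prod

    return integrate.quad(lms_derivative, sigmas[t], sigmas[t + 1], epsrel=1e-4)[0]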
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py
index 437f108e73af3..c821dae87d35d 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py
@@ -22,8 +22,7 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
@@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -99,40 +98,42 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- skip_prk_steps: bool=False,
- set_alpha_to_one: bool=False,
- prediction_type: str="epsilon",
- steps_offset: int=0, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ skip_prk_steps: bool = False,
+ set_alpha_to_one: bool = False,
+ prediction_type: str = "epsilon",
+ steps_offset: int = 0,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
- self.final_alpha_cumprod = (paddle.to_tensor(1.0) if set_alpha_to_one
- else self.alphas_cumprod[0])
+ self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0
@@ -168,8 +169,7 @@ def set_timesteps(self, num_inference_steps: int):
step_ratio = self.config.num_train_timesteps // self.num_inference_steps
# creates integer timesteps by multiplying by ratio
# casting to int to avoid issues when num_inference_step is power of 3
- self._timesteps = (np.arange(0, num_inference_steps) *
- step_ratio).round()
+ self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()
self._timesteps += self.config.steps_offset
if self.config.skip_prk_steps:
@@ -177,25 +177,20 @@ def set_timesteps(self, num_inference_steps: int):
# produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
# is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
self.prk_timesteps = np.array([])
- self.plms_timesteps = np.concatenate([
- self._timesteps[:-1], self._timesteps[-2:-1],
- self._timesteps[-1:]
- ])[::-1].copy()
+ self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[
+ ::-1
+ ].copy()
else:
- prk_timesteps = np.array(self._timesteps[-self.pndm_order:]).repeat(
- 2) + np.tile(
- np.array([
- 0, self.config.num_train_timesteps //
- num_inference_steps // 2
- ]),
- self.pndm_order, )
- self.prk_timesteps = (
- prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy()
- self.plms_timesteps = self._timesteps[:-3][::-1].copy(
- ) # we copy to avoid having negative strides which are not supported by paddle
-
- timesteps = np.concatenate(
- [self.prk_timesteps, self.plms_timesteps]).astype(np.int64)
+ prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile(
+ np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]),
+ self.pndm_order,
+ )
+ self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy()
+ self.plms_timesteps = self._timesteps[:-3][
+ ::-1
+ ].copy() # we copy to avoid having negative strides which are not supported by paddle
+
+ timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64)
self.timesteps = paddle.to_tensor(timesteps)
self.ets = []
@@ -203,11 +198,12 @@ def set_timesteps(self, num_inference_steps: int):
self.cur_model_output = 0
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -227,26 +223,28 @@ def step(
returning a tuple, the first element is the sample tensor.
"""
- if self.counter < len(
- self.prk_timesteps) and not self.config.skip_prk_steps:
+ if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps:
return self.step_prk(
model_output=model_output,
timestep=timestep,
sample=sample,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
else:
return self.step_plms(
model_output=model_output,
timestep=timestep,
sample=sample,
- return_dict=return_dict, )
+ return_dict=return_dict,
+ )
def step_prk(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the
solution to the differential equation.
@@ -268,9 +266,7 @@ def step_prk(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
- diff_to_prev = (0
- if self.counter % 2 else self.config.num_train_timesteps
- // self.num_inference_steps // 2)
+ diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2
prev_timestep = timestep - diff_to_prev
timestep = self.prk_timesteps[self.counter // 4 * 4]
@@ -289,21 +285,21 @@ def step_prk(
# cur_sample should not be `None`
cur_sample = self.cur_sample if self.cur_sample is not None else sample
- prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep,
- model_output)
+ prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output)
self.counter += 1
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def step_plms(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple
times to approximate the solution.
@@ -330,18 +326,17 @@ def step_plms(
f"{self.__class__} can only be run AFTER scheduler has been run "
"in 'prk' mode for at least 12 iterations "
"See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
- "for more information.")
+ "for more information."
+ )
- prev_timestep = (timestep - self.config.num_train_timesteps //
- self.num_inference_steps)
+ prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
if self.counter != 1:
self.ets = self.ets[-3:]
self.ets.append(model_output)
else:
prev_timestep = timestep
- timestep = (timestep + self.config.num_train_timesteps //
- self.num_inference_steps)
+ timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps
if len(self.ets) == 1 and self.counter == 0:
model_output = model_output
@@ -353,23 +348,19 @@ def step_plms(
elif len(self.ets) == 2:
model_output = (3 * self.ets[-1] - self.ets[-2]) / 2
elif len(self.ets) == 3:
- model_output = (
- 23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+ model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
else:
- model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] +
- 37 * self.ets[-3] - 9 * self.ets[-4])
+ model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
- prev_sample = self._get_prev_sample(sample, timestep, prev_timestep,
- model_output)
+ prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
self.counter += 1
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -396,14 +387,12 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
# model_output -> e_θ(x_t, t)
# prev_sample -> x_(t−δ)
alpha_prod_t = self.alphas_cumprod[timestep]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else self.final_alpha_cumprod)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
if self.config.prediction_type == "v_prediction":
- model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**
- 0.5) * sample
+ model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
elif self.config.prediction_type != "epsilon":
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
@@ -413,41 +402,41 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
# denominator of x_t in formula (9) and plus 1
# Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) =
# sqrt(α_(t−δ)) / sqrt(α_t))
- sample_coeff = (alpha_prod_t_prev / alpha_prod_t)**(0.5)
+ sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
# corresponds to denominator of e_θ(x_t, t) in formula (9)
- model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev**(0.5) + (
- alpha_prod_t * beta_prod_t * alpha_prod_t_prev)**(0.5)
+ model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
+ alpha_prod_t * beta_prod_t * alpha_prod_t_prev
+ ) ** (0.5)
# full formula (9)
- prev_sample = (sample_coeff * sample -
- (alpha_prod_t_prev - alpha_prod_t
- ) * model_output / model_output_denom_coeff)
+ prev_sample = (
+ sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
+ )
return prev_sample
# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self):
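Editor's note: the PLMS branch reformatted above blends the four most recent cached noise predictions with Adams–Bashforth-style coefficients and then applies formula (9) of the PNDM paper via `_get_prev_sample`. A minimal NumPy-free sketch of that update, with a helper name and scalar inputs of my own choosing (not the library's API), is:

```python
def plms_prev_sample(sample, ets, alpha_prod_t, alpha_prod_t_prev):
    # Blend the four most recent epsilon predictions (Adams-Bashforth-style coefficients),
    # then apply formula (9) of the PNDM paper, mirroring step_plms/_get_prev_sample above.
    eps = (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4]) / 24
    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev
    sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
    denom = alpha_prod_t * beta_prod_t_prev ** 0.5 + (alpha_prod_t * beta_prod_t * alpha_prod_t_prev) ** 0.5
    return sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * eps / denom

# Toy call with scalars standing in for tensors.
history = [0.10, 0.20, 0.15, 0.12]  # oldest -> newest epsilon predictions
x_prev = plms_prev_sample(1.0, history, alpha_prod_t=0.5, alpha_prod_t_prev=0.6)
```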
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py
index 71460c026a92b..d040c40ba5124 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py
@@ -64,7 +64,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -109,26 +109,30 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- eta: float=0.0,
- trained_betas: Optional[np.ndarray]=None,
- clip_sample: bool=True, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ eta: float = 0.0,
+ trained_betas: Optional[np.ndarray] = None,
+ clip_sample: bool = True,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -137,8 +141,7 @@ def __init__(
betas = paddle.linspace(-6, 6, num_train_timesteps)
self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -151,14 +154,11 @@ def __init__(
# setable values
self.num_inference_steps = None
- self.timesteps = paddle.to_tensor(
- np.arange(0, num_train_timesteps)[::-1].copy())
+ self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy())
self.eta = eta
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -173,12 +173,12 @@ def scale_model_input(self,
return sample
def set_timesteps(
- self,
- num_inference_steps: int,
- jump_length: int=10,
- jump_n_sample: int=10, ):
- num_inference_steps = min(self.config.num_train_timesteps,
- num_inference_steps)
+ self,
+ num_inference_steps: int,
+ jump_length: int = 10,
+ jump_n_sample: int = 10,
+ ):
+ num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
self.num_inference_steps = num_inference_steps
timesteps = []
@@ -198,16 +198,14 @@ def set_timesteps(
t = t + 1
timesteps.append(t)
- timesteps = np.array(timesteps) * (self.config.num_train_timesteps //
- self.num_inference_steps)
+ timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps)
self.timesteps = paddle.to_tensor(timesteps)
def _get_variance(self, t):
prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps
alpha_prod_t = self.alphas_cumprod[t]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else self.final_alpha_cumprod)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
@@ -218,21 +216,20 @@ def _get_variance(self, t):
# Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf
# without eta.
# variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
- variance = (beta_prod_t_prev / beta_prod_t) * (
- 1 - alpha_prod_t / alpha_prod_t_prev)
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
return variance
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- original_image: paddle.Tensor,
- mask: paddle.Tensor,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[RePaintSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ original_image: paddle.Tensor,
+ mask: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[RePaintSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -258,19 +255,16 @@ def step(
"""
t = timestep
- prev_timestep = (timestep - self.config.num_train_timesteps //
- self.num_inference_steps)
+ prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
# 1. compute alphas, betas
alpha_prod_t = self.alphas_cumprod[t]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if
- prev_timestep >= 0 else self.final_alpha_cumprod)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
# 2. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
- pred_original_sample = (
- sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+ pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
# 3. Clip "predicted x_0"
if self.config.clip_sample:
@@ -284,9 +278,8 @@ def step(
# been observed.
# 5. Add noise
- noise = randn_tensor(
- model_output.shape, generator=generator, dtype=model_output.dtype)
- std_dev_t = self.eta * self._get_variance(timestep)**0.5
+ noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype)
+ std_dev_t = self.eta * self._get_variance(timestep) ** 0.5
variance = 0
if t > 0 and self.eta > 0:
@@ -294,51 +287,44 @@ def step(
# 6. compute "direction pointing to x_t" of formula (12)
# from https://arxiv.org/pdf/2010.02502.pdf
- pred_sample_direction = (
- 1 - alpha_prod_t_prev - std_dev_t**2)**0.5 * model_output
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output
# 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
- prev_unknown_part = (alpha_prod_t_prev**0.5 * pred_original_sample +
- pred_sample_direction + variance)
+ prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance
# 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf
- prev_known_part = (alpha_prod_t_prev**0.5) * original_image + (
- (1 - alpha_prod_t_prev)**0.5) * noise
+ prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ((1 - alpha_prod_t_prev) ** 0.5) * noise
# 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf
- pred_prev_sample = mask * prev_known_part + (1.0 - mask
- ) * prev_unknown_part
+ pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part
if not return_dict:
return (
pred_prev_sample,
- pred_original_sample, )
+ pred_original_sample,
+ )
- return RePaintSchedulerOutput(
- prev_sample=pred_prev_sample,
- pred_original_sample=pred_original_sample)
+ return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
def undo_step(self, sample, timestep, generator=None):
n = self.config.num_train_timesteps // self.num_inference_steps
for i in range(n):
beta = self.betas[timestep + i]
- noise = randn_tensor(
- sample.shape, generator=generator, dtype=sample.dtype)
+ noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype)
# 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf
- sample = (1 - beta)**0.5 * sample + beta**0.5 * noise
+ sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise
return sample
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
- raise NotImplementedError(
- "Use `DDPMScheduler.add_noise()` to train for sampling with RePaint."
- )
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
+ raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.")
def __len__(self):
return self.config.num_train_timesteps
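Editor's note: the RePaint step reformatted above is a DDIM-style update on the masked (unknown) region plus a forward-diffused copy of the known region, merged through the mask (Algorithm 1 of the RePaint paper). A hedged NumPy sketch, with made-up inputs and without the clipping and eta/variance handling, assuming a binary mask where 1 marks known pixels:

```python
import numpy as np

def repaint_step(eps, x_t, original_image, mask, alpha_prod_t, alpha_prod_t_prev, noise):
    beta_prod_t = 1 - alpha_prod_t
    # Predicted x_0 from the noise prediction (DDPM eq. (15)), no clipping here.
    pred_x0 = (x_t - beta_prod_t ** 0.5 * eps) / alpha_prod_t ** 0.5
    # Deterministic direction toward x_{t-1} (eta = 0, so the extra variance term is dropped).
    direction = (1 - alpha_prod_t_prev) ** 0.5 * eps
    prev_unknown = alpha_prod_t_prev ** 0.5 * pred_x0 + direction
    # Forward-diffuse the known pixels to the same noise level (RePaint Alg. 1, line 5).
    prev_known = alpha_prod_t_prev ** 0.5 * original_image + (1 - alpha_prod_t_prev) ** 0.5 * noise
    # Keep known pixels where mask == 1, generated pixels elsewhere (Alg. 1, line 8).
    return mask * prev_known + (1 - mask) * prev_unknown

rng = np.random.default_rng(0)
out = repaint_step(eps=rng.standard_normal((4, 4)), x_t=rng.standard_normal((4, 4)),
                   original_image=np.ones((4, 4)), mask=np.eye(4),
                   alpha_prod_t=0.5, alpha_prod_t_prev=0.6, noise=rng.standard_normal((4, 4)))
```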
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py
index 3513d6691d0e5..83644fdecc48a 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py
@@ -71,13 +71,14 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=2000,
- snr: float=0.15,
- sigma_min: float=0.01,
- sigma_max: float=1348.0,
- sampling_eps: float=1e-5,
- correct_steps: int=1, ):
+ self,
+ num_train_timesteps: int = 2000,
+ snr: float = 0.15,
+ sigma_min: float = 0.01,
+ sigma_max: float = 1348.0,
+ sampling_eps: float = 1e-5,
+ correct_steps: int = 1,
+ ):
# standard deviation of the initial noise distribution
self.init_noise_sigma = sigma_max
@@ -86,9 +87,7 @@ def __init__(
self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps)
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -102,7 +101,7 @@ def scale_model_input(self,
"""
return sample
- def set_timesteps(self, num_inference_steps: int, sampling_eps: float=None):
+ def set_timesteps(self, num_inference_steps: int, sampling_eps: float = None):
"""
Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -113,17 +112,17 @@ def set_timesteps(self, num_inference_steps: int, sampling_eps: float=None):
final timestep value (overrides value given at Scheduler instantiation).
"""
- sampling_eps = (sampling_eps if sampling_eps is not None else
- self.config.sampling_eps)
+ sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
self.timesteps = paddle.linspace(1, sampling_eps, num_inference_steps)
def set_sigmas(
- self,
- num_inference_steps: int,
- sigma_min: float=None,
- sigma_max: float=None,
- sampling_eps: float=None, ):
+ self,
+ num_inference_steps: int,
+ sigma_min: float = None,
+ sigma_max: float = None,
+ sampling_eps: float = None,
+ ):
"""
Sets the noise scales used for the diffusion chain. Supporting function to be run before inference.
@@ -142,33 +141,31 @@ def set_sigmas(
"""
sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min
sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max
- sampling_eps = (sampling_eps if sampling_eps is not None else
- self.config.sampling_eps)
+ sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
if self.timesteps is None:
self.set_timesteps(num_inference_steps, sampling_eps)
- self.sigmas = sigma_min * (sigma_max / sigma_min)**(self.timesteps /
- sampling_eps)
+ self.sigmas = sigma_min * (sigma_max / sigma_min) ** (self.timesteps / sampling_eps)
self.discrete_sigmas = paddle.exp(
- paddle.linspace(
- math.log(sigma_min), math.log(sigma_max), num_inference_steps))
- self.sigmas = paddle.to_tensor(
- [sigma_min * (sigma_max / sigma_min)**t for t in self.timesteps])
+ paddle.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps)
+ )
+ self.sigmas = paddle.to_tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps])
def get_adjacent_sigma(self, timesteps, t):
return paddle.where(
timesteps == 0,
paddle.zeros_like(t),
- self.discrete_sigmas[timesteps - 1], )
+ self.discrete_sigmas[timesteps - 1],
+ )
def step_pred(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[SdeVeOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[SdeVeOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -191,15 +188,13 @@ def step_pred(
"`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
)
- timestep = timestep * paddle.ones(
- (sample.shape[0],
- )) # paddle.repeat_interleave(timestep, sample.shape[0])
+ timestep = timestep * paddle.ones((sample.shape[0],)) # paddle.repeat_interleave(timestep, sample.shape[0])
timesteps = (timestep * (len(self.timesteps) - 1)).cast("int64")
sigma = self.discrete_sigmas[timesteps]
adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep)
drift = paddle.zeros_like(sample)
- diffusion = (sigma**2 - adjacent_sigma**2)**0.5
+ diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5
# equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x)
# also equation 47 shows the analog from SDE models to ancestral sampling methods
@@ -209,28 +204,23 @@ def step_pred(
drift = drift - diffusion**2 * model_output
# equation 6: sample noise for the diffusion term of
- noise = randn_tensor(
- sample.shape, generator=generator, dtype=sample.dtype)
- prev_sample_mean = (
- sample - drift
- ) # subtract because `dt` is a small negative timestep
+ noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype)
+ prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep
# TODO is the variable diffusion the correct scaling term for the noise?
- prev_sample = (prev_sample_mean + diffusion * noise
- ) # add impact of diffusion field g
+ prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g
if not return_dict:
return (prev_sample, prev_sample_mean)
- return SdeVeOutput(
- prev_sample=prev_sample, prev_sample_mean=prev_sample_mean)
+ return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean)
def step_correct(
- self,
- model_output: paddle.Tensor,
- sample: paddle.Tensor,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ sample: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Correct the predicted sample based on the output model_output of the network. This is often run repeatedly
after making the prediction for the previous timestep.
@@ -257,12 +247,10 @@ def step_correct(
noise = randn_tensor(sample.shape, generator=generator)
# compute step size from the model_output, the noise, and the snr
- grad_norm = paddle.norm(
- model_output.reshape([model_output.shape[0], -1]), axis=-1).mean()
- noise_norm = paddle.norm(
- noise.reshape([noise.shape[0], -1]), axis=-1).mean()
- step_size = (self.config.snr * noise_norm / grad_norm)**2 * 2
- step_size = step_size * paddle.ones((sample.shape[0], ))
+ grad_norm = paddle.norm(model_output.reshape([model_output.shape[0], -1]), axis=-1).mean()
+ noise_norm = paddle.norm(noise.reshape([noise.shape[0], -1]), axis=-1).mean()
+ step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2
+ step_size = step_size * paddle.ones((sample.shape[0],))
# self.repeat_scalar(step_size, sample.shape[0])
# compute corrected sample: model_output term and noise term
@@ -270,23 +258,22 @@ def step_correct(
while len(step_size.shape) < len(sample.shape):
step_size = step_size.unsqueeze(-1)
prev_sample_mean = sample + step_size * model_output
- prev_sample = prev_sample_mean + ((step_size * 2)**0.5) * noise
+ prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure sigmas and timesteps have the same dtype as original_samples
sigmas = self.discrete_sigmas[timesteps]
- noise = (paddle.randn(
- original_samples.shape,
- dtype=original_samples.dtype) * sigmas[:, None, None, None])
+ noise = paddle.randn(original_samples.shape, dtype=original_samples.dtype) * sigmas[:, None, None, None]
noisy_samples = noise + original_samples
return noisy_samples
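Editor's note: the score-SDE (VE) changes above touch both the predictor (`step_pred`) and the Langevin corrector (`step_correct`). As a sanity check on the corrector math, here is a sketch of one corrector update on a batch of flat vectors; the helper name, shapes, and default `snr` are illustrative only:

```python
import numpy as np

def sde_ve_corrector(sample, score, snr=0.15, rng=np.random.default_rng(0)):
    # One Langevin corrector update, mirroring step_correct above on (batch, dim) arrays.
    noise = rng.standard_normal(sample.shape)
    grad_norm = np.linalg.norm(score.reshape(score.shape[0], -1), axis=-1).mean()
    noise_norm = np.linalg.norm(noise.reshape(noise.shape[0], -1), axis=-1).mean()
    # Step size chosen so the update's signal-to-noise ratio matches `snr`.
    step_size = (snr * noise_norm / grad_norm) ** 2 * 2
    prev_mean = sample + step_size * score
    return prev_mean + (step_size * 2) ** 0.5 * noise

x = sde_ve_corrector(np.zeros((2, 8)), np.ones((2, 8)))
```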
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py
index 23b4303cbf257..c0e1eebc3eb96 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py
@@ -42,18 +42,13 @@ class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
order = 1
@register_to_config
- def __init__(self,
- num_train_timesteps=2000,
- beta_min=0.1,
- beta_max=20,
- sampling_eps=1e-3):
+ def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3):
self.sigmas = None
self.discrete_sigmas = None
self.timesteps = None
def set_timesteps(self, num_inference_steps):
- self.timesteps = paddle.linspace(1, self.config.sampling_eps,
- num_inference_steps)
+ self.timesteps = paddle.linspace(1, self.config.sampling_eps, num_inference_steps)
def step_pred(self, score, x, t, generator=None):
if self.timesteps is None:
@@ -63,9 +58,9 @@ def step_pred(self, score, x, t, generator=None):
# TODO(Patrick) better comments + non-Paddle
# postprocess model score
- log_mean_coeff = (-0.25 * t**2 *
- (self.config.beta_max - self.config.beta_min
- ) - 0.5 * t * self.config.beta_min)
+ log_mean_coeff = (
+ -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min
+ )
std = paddle.sqrt(1.0 - paddle.exp(2.0 * log_mean_coeff))
std = std.flatten()
while len(std.shape) < len(score.shape):
@@ -75,8 +70,7 @@ def step_pred(self, score, x, t, generator=None):
# compute
dt = -1.0 / len(self.timesteps)
- beta_t = self.config.beta_min + t * (self.config.beta_max -
- self.config.beta_min)
+ beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min)
beta_t = beta_t.flatten()
while len(beta_t.shape) < len(x.shape):
beta_t = beta_t.unsqueeze(-1)
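Editor's note: in the VP-SDE hunk above, the reformatted `log_mean_coeff` line is the closed-form log of the VP marginal's mean coefficient, and the standard deviation follows as sqrt(1 - exp(2 * log_mean_coeff)). A tiny sketch of just that marginal computation (function name and inputs are my own):

```python
import numpy as np

def vp_marginal_std(t, beta_min=0.1, beta_max=20.0):
    # log of the VP marginal's mean coefficient, then std = sqrt(1 - exp(2 * log_mean_coeff)).
    log_mean_coeff = -0.25 * t ** 2 * (beta_max - beta_min) - 0.5 * t * beta_min
    return np.sqrt(1.0 - np.exp(2.0 * log_mean_coeff))

print(vp_marginal_std(np.array([0.001, 0.5, 1.0])))
```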
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py
index 8b809e90c7159..491409f76a5e6 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py
@@ -64,7 +64,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -102,17 +102,16 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- variance_type: str="fixed_small_log",
- clip_sample: bool=True,
- clip_sample_range: Optional[float]=1.0,
- prediction_type: str="epsilon",
- beta_schedule: str="squaredcos_cap_v2", ):
+ self,
+ num_train_timesteps: int = 1000,
+ variance_type: str = "fixed_small_log",
+ clip_sample: bool = True,
+ clip_sample_range: Optional[float] = 1.0,
+ prediction_type: str = "epsilon",
+ beta_schedule: str = "squaredcos_cap_v2",
+ ):
if beta_schedule != "squaredcos_cap_v2":
- raise ValueError(
- "UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'"
- )
+ raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'")
self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -125,14 +124,11 @@ def __init__(
# setable values
self.num_inference_steps = None
- self.timesteps = paddle.to_tensor(
- np.arange(0, num_train_timesteps)[::-1].copy())
+ self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy())
self.variance_type = variance_type
- def scale_model_input(self,
- sample: paddle.Tensor,
- timestep: Optional[int]=None) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -159,23 +155,16 @@ def set_timesteps(self, num_inference_steps: int):
the number of diffusion steps used when generating samples with a pre-trained model.
"""
self.num_inference_steps = num_inference_steps
- step_ratio = (self.config.num_train_timesteps - 1) / (
- self.num_inference_steps - 1)
- timesteps = ((np.arange(0, num_inference_steps) * step_ratio)
- .round()[::-1].copy().astype(np.int64))
+ step_ratio = (self.config.num_train_timesteps - 1) / (self.num_inference_steps - 1)
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
self.timesteps = paddle.to_tensor(timesteps)
- def _get_variance(self,
- t,
- prev_timestep=None,
- predicted_variance=None,
- variance_type=None):
+ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None):
if prev_timestep is None:
prev_timestep = t - 1
alpha_prod_t = self.alphas_cumprod[t]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep]
- if prev_timestep >= 0 else self.one)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
@@ -207,13 +196,14 @@ def _get_variance(self,
return variance
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- prev_timestep: Optional[int]=None,
- generator=None,
- return_dict: bool=True, ) -> Union[UnCLIPSchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ prev_timestep: Optional[int] = None,
+ generator=None,
+ return_dict: bool = True,
+ ) -> Union[UnCLIPSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
@@ -236,12 +226,11 @@ def step(
"""
t = timestep
- if (model_output.shape[1] == sample.shape[1] * 2 and
- self.variance_type == "learned_range"):
+ if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type == "learned_range":
# must split like this, 3 -> split 2 -> [2, 1]
model_output, predicted_variance = model_output.split(
- [sample.shape[1], model_output.shape[1] - sample.shape[1]],
- axis=1)
+ [sample.shape[1], model_output.shape[1] - sample.shape[1]], axis=1
+ )
else:
predicted_variance = None
@@ -250,8 +239,7 @@ def step(
prev_timestep = t - 1
alpha_prod_t = self.alphas_cumprod[t]
- alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep]
- if prev_timestep >= 0 else self.one)
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
@@ -265,32 +253,31 @@ def step(
# 2. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
if self.config.prediction_type == "epsilon":
- pred_original_sample = (sample - beta_prod_t**
- (0.5) * model_output) / alpha_prod_t**(0.5)
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `sample`"
- " for the UnCLIPScheduler.")
+ " for the UnCLIPScheduler."
+ )
# 3. Clip "predicted x_0"
if self.config.clip_sample:
pred_original_sample = paddle.clip(
pred_original_sample,
-self.config.clip_sample_range,
- self.config.clip_sample_range, )
+ self.config.clip_sample_range,
+ )
# 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
# See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
- pred_original_sample_coeff = (alpha_prod_t_prev
- **(0.5) * beta) / beta_prod_t
- current_sample_coeff = alpha**(0.5) * beta_prod_t_prev / beta_prod_t
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t
+ current_sample_coeff = alpha ** (0.5) * beta_prod_t_prev / beta_prod_t
# 5. Compute predicted previous sample µ_t
# See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
- pred_prev_sample = (pred_original_sample_coeff * pred_original_sample +
- current_sample_coeff * sample)
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
# 6. Add noise
variance = 0
@@ -298,12 +285,14 @@ def step(
variance_noise = randn_tensor(
model_output.shape,
dtype=model_output.dtype,
- generator=generator, )
+ generator=generator,
+ )
variance = self._get_variance(
t,
predicted_variance=predicted_variance,
- prev_timestep=prev_timestep, )
+ prev_timestep=prev_timestep,
+ )
if self.variance_type == "fixed_small_log":
variance = variance
@@ -312,15 +301,14 @@ def step(
else:
raise ValueError(
f"variance_type given as {self.variance_type} must be one of `fixed_small_log` or `learned_range`"
- " for the UnCLIPScheduler.")
+ " for the UnCLIPScheduler."
+ )
variance = variance * variance_noise
pred_prev_sample = pred_prev_sample + variance
if not return_dict:
- return (pred_prev_sample, )
+ return (pred_prev_sample,)
- return UnCLIPSchedulerOutput(
- prev_sample=pred_prev_sample,
- pred_original_sample=pred_original_sample)
+ return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
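Editor's note: the UnCLIP step above is a standard DDPM posterior: recover x_0 from the epsilon prediction, then take the mean of q(x_{t-1} | x_t, x_0) with the two coefficients from formula (7) of the DDPM paper. A hedged scalar sketch that ignores the learned_range variance split and clipping; the per-step `alpha`/`beta` here are my reconstruction, since they are defined outside the hunk shown:

```python
def unclip_posterior_mean(x_t, eps, alpha_prod_t, alpha_prod_t_prev):
    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev
    alpha = alpha_prod_t / alpha_prod_t_prev  # assumed per-step alpha_t (not shown in the hunk)
    beta = 1 - alpha
    pred_x0 = (x_t - beta_prod_t ** 0.5 * eps) / alpha_prod_t ** 0.5
    x0_coeff = (alpha_prod_t_prev ** 0.5 * beta) / beta_prod_t
    xt_coeff = alpha ** 0.5 * beta_prod_t_prev / beta_prod_t
    return x0_coeff * pred_x0 + xt_coeff * x_t

mu = unclip_posterior_mean(x_t=0.8, eps=0.1, alpha_prod_t=0.5, alpha_prod_t_prev=0.6)
```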
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py
index 4fb50fb0e19c2..fa85c31efc8c1 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py
@@ -23,8 +23,7 @@
import paddle
from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin,
- SchedulerOutput)
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
@@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
def alpha_bar(time_step):
- return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
@@ -126,40 +125,43 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_train_timesteps: int=1000,
- beta_start: float=0.0001,
- beta_end: float=0.02,
- beta_schedule: str="linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]]=None,
- solver_order: int=2,
- prediction_type: str="epsilon",
- thresholding: bool=False,
- dynamic_thresholding_ratio: float=0.995,
- sample_max_value: float=1.0,
- predict_x0: bool=True,
- solver_type: str="bh2",
- lower_order_final: bool=True,
- disable_corrector: List[int]=[],
- solver_p: SchedulerMixin=None, ):
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ solver_order: int = 2,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ predict_x0: bool = True,
+ solver_type: str = "bh2",
+ lower_order_final: bool = True,
+ disable_corrector: List[int] = [],
+ solver_p: SchedulerMixin = None,
+ ):
if trained_betas is not None:
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
elif beta_schedule == "linear":
- self.betas = paddle.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
- self.betas = (paddle.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=paddle.float32, )**2)
+ self.betas = (
+ paddle.linspace(
+ beta_start**0.5,
+ beta_end**0.5,
+ num_train_timesteps,
+ dtype=paddle.float32,
+ )
+ ** 2
+ )
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}")
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
@@ -175,16 +177,12 @@ def __init__(
if solver_type in ["midpoint", "heun", "logrho"]:
self.register_to_config(solver_type="bh1")
else:
- raise NotImplementedError(
- f"{solver_type} does is not implemented for {self.__class__}"
- )
+ raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
self.predict_x0 = predict_x0
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(
- 0, num_train_timesteps - 1, num_train_timesteps,
- dtype=np.float32)[::-1].copy()
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
self.timesteps = paddle.to_tensor(timesteps)
self.model_outputs = [None] * solver_order
self.timestep_list = [None] * solver_order
@@ -201,9 +199,12 @@ def set_timesteps(self, num_inference_steps: int):
num_inference_steps (`int`):
the number of diffusion steps used when generating samples with a pre-trained model.
"""
- timesteps = (np.linspace(0, self.config.num_train_timesteps - 1,
- num_inference_steps + 1).round()[::-1][:-1]
- .copy().astype(np.int64))
+ timesteps = (
+ np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ .round()[::-1][:-1]
+ .copy()
+ .astype(np.int64)
+ )
# when num_inference_steps == num_train_timesteps, we can end up with
# duplicates in timesteps.
@@ -214,7 +215,9 @@ def set_timesteps(self, num_inference_steps: int):
self.num_inference_steps = len(timesteps)
- self.model_outputs = [None, ] * self.config.solver_order
+ self.model_outputs = [
+ None,
+ ] * self.config.solver_order
self.lower_order_nums = 0
self.last_sample = None
if self.solver_p:
@@ -242,8 +245,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
- s = paddle.quantile(
- abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
+ s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1)
# paddle.clip donot support min > max
if self.config.sample_max_value < 1:
s = paddle.ones_like(s) * self.config.sample_max_value
@@ -251,21 +253,15 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor:
s = paddle.clip(
s, min=1, max=self.config.sample_max_value
) # When clip to min=1, equivalent to standard clipping to [-1, 1]
- s = s.unsqueeze(
- 1) # (batch_size, 1) because clip will broadcast along axis=0
- sample = (
- paddle.clip(sample, -s, s) /
- s) # "we threshold xt0 to the range [-s, s] and then divide by s"
+ s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0
+ sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = paddle.reshape(sample, [batch_size, channels, height, width])
sample = paddle.cast(sample, dtype)
return sample
- def convert_model_output(self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor) -> paddle.Tensor:
+ def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor:
r"""
Convert the model output to the corresponding type that the algorithm PC needs.
@@ -280,19 +276,18 @@ def convert_model_output(self,
"""
if self.predict_x0:
if self.config.prediction_type == "epsilon":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = (sample - sigma_t * model_output) / alpha_t
elif self.config.prediction_type == "sample":
x0_pred = model_output
elif self.config.prediction_type == "v_prediction":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = alpha_t * sample - sigma_t * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the UniPCMultistepScheduler.")
+ " `v_prediction` for the UniPCMultistepScheduler."
+ )
if self.config.thresholding:
x0_pred = self._threshold_sample(x0_pred)
@@ -302,26 +297,26 @@ def convert_model_output(self,
if self.config.prediction_type == "epsilon":
return model_output
elif self.config.prediction_type == "sample":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = (sample - alpha_t * model_output) / sigma_t
return epsilon
elif self.config.prediction_type == "v_prediction":
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[
- timestep]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = alpha_t * model_output + sigma_t * sample
return epsilon
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
- " `v_prediction` for the UniPCMultistepScheduler.")
+ " `v_prediction` for the UniPCMultistepScheduler."
+ )
def multistep_uni_p_bh_update(
- self,
- model_output: paddle.Tensor,
- prev_timestep: int,
- sample: paddle.Tensor,
- order: int, ) -> paddle.Tensor:
+ self,
+ model_output: paddle.Tensor,
+ prev_timestep: int,
+ sample: paddle.Tensor,
+ order: int,
+ ) -> paddle.Tensor:
"""
One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified.
@@ -424,12 +419,13 @@ def multistep_uni_p_bh_update(
return x_t
def multistep_uni_c_bh_update(
- self,
- this_model_output: paddle.Tensor,
- this_timestep: int,
- last_sample: paddle.Tensor,
- this_sample: paddle.Tensor,
- order: int, ) -> paddle.Tensor:
+ self,
+ this_model_output: paddle.Tensor,
+ this_timestep: int,
+ last_sample: paddle.Tensor,
+ this_sample: paddle.Tensor,
+ order: int,
+ ) -> paddle.Tensor:
"""
One step for the UniC (B(h) version).
@@ -512,8 +508,7 @@ def multistep_uni_c_bh_update(
if self.predict_x0:
x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
if D1s is not None:
- corr_res = paddle.einsum("k,bkchw->bchw",
- rhos_c[:-1].squeeze(1), D1s)
+ corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s)
else:
corr_res = 0
D1_t = model_t - m0
@@ -521,8 +516,7 @@ def multistep_uni_c_bh_update(
else:
x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
if D1s is not None:
- corr_res = paddle.einsum("k,bkchw->bchw",
- rhos_c[:-1].squeeze(1), D1s)
+ corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s)
else:
corr_res = 0
D1_t = model_t - m0
@@ -531,11 +525,12 @@ def multistep_uni_c_bh_update(
return x_t
def step(
- self,
- model_output: paddle.Tensor,
- timestep: int,
- sample: paddle.Tensor,
- return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: int,
+ sample: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
"""
Step function propagating the sample with the multistep UniPC.
@@ -563,23 +558,22 @@ def step(
else:
step_index = step_index.item()
- use_corrector = (step_index > 0 and
- step_index - 1 not in self.disable_corrector and
- self.last_sample is not None)
+ use_corrector = (
+ step_index > 0 and step_index - 1 not in self.disable_corrector and self.last_sample is not None
+ )
- model_output_convert = self.convert_model_output(model_output, timestep,
- sample)
+ model_output_convert = self.convert_model_output(model_output, timestep, sample)
if use_corrector:
sample = self.multistep_uni_c_bh_update(
this_model_output=model_output_convert,
this_timestep=timestep,
last_sample=self.last_sample,
this_sample=sample,
- order=self.this_order, )
+ order=self.this_order,
+ )
# now prepare to run the predictor
- prev_timestep = (0 if step_index == len(self.timesteps) - 1 else
- self.timesteps[step_index + 1])
+ prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
for i in range(self.config.solver_order - 1):
self.model_outputs[i] = self.model_outputs[i + 1]
@@ -589,13 +583,11 @@ def step(
self.timestep_list[-1] = timestep
if self.config.lower_order_final:
- this_order = min(self.config.solver_order,
- len(self.timesteps) - step_index)
+ this_order = min(self.config.solver_order, len(self.timesteps) - step_index)
else:
this_order = self.config.solver_order
- self.this_order = min(this_order,
- self.lower_order_nums + 1) # warmup for multistep
+ self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep
assert self.this_order > 0
self.last_sample = sample
@@ -603,18 +595,18 @@ def step(
model_output=model_output, # pass the original non-converted model output, in case solver-p is used
prev_timestep=prev_timestep,
sample=sample,
- order=self.this_order, )
+ order=self.this_order,
+ )
if self.lower_order_nums < self.config.solver_order:
self.lower_order_nums += 1
if not return_dict:
- return (prev_sample, )
+ return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
- def scale_model_input(self, sample: paddle.Tensor, *args,
- **kwargs) -> paddle.Tensor:
+ def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.
@@ -629,26 +621,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args,
# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
def add_noise(
- self,
- original_samples: paddle.Tensor,
- noise: paddle.Tensor,
- timesteps: paddle.Tensor, ) -> paddle.Tensor:
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
# Make sure alphas_cumprod and timestep have same dtype as original_samples
alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
- sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(
- original_samples.shape):
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
- noisy_samples = (sqrt_alpha_prod * original_samples +
- sqrt_one_minus_alpha_prod * noise)
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self):
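Editor's note: the heart of the UniPC hunks above is `convert_model_output`, which maps whatever the network predicts (epsilon, the sample itself, or a v-prediction) onto a data prediction before the predictor/corrector updates run. A short sketch of that conversion in predict-x0 mode, with a hypothetical helper name and thresholding omitted:

```python
def to_x0(model_output, sample, alpha_t, sigma_t, prediction_type="epsilon"):
    # Data-prediction conversion used by UniPC in predict_x0 mode (see convert_model_output above).
    if prediction_type == "epsilon":
        return (sample - sigma_t * model_output) / alpha_t
    if prediction_type == "sample":
        return model_output
    if prediction_type == "v_prediction":
        return alpha_t * sample - sigma_t * model_output
    raise ValueError(f"unsupported prediction_type: {prediction_type}")

x0 = to_x0(model_output=0.2, sample=0.9, alpha_t=0.7, sigma_t=0.714, prediction_type="v_prediction")
```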
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py
index 96707f403e49a..d5bcdca0d1a9f 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py
@@ -76,11 +76,12 @@ class SchedulerMixin:
@classmethod
def from_pretrained(
- cls,
- pretrained_model_name_or_path: Dict[str, Any]=None,
- subfolder: Optional[str]=None,
- return_unused_kwargs: bool=False,
- **kwargs, ):
+ cls,
+ pretrained_model_name_or_path: Dict[str, Any] = None,
+ subfolder: Optional[str] = None,
+ return_unused_kwargs: bool = False,
+ **kwargs,
+ ):
r"""
Instantiate a Scheduler class from a pre-defined JSON configuration file inside a directory or Hub repo.
@@ -142,15 +143,16 @@ def from_pretrained(
subfolder=subfolder,
return_unused_kwargs=True,
return_commit_hash=True,
- **kwargs, )
- return cls.from_config(
- config, return_unused_kwargs=return_unused_kwargs, **kwargs)
+ **kwargs,
+ )
+ return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)
def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- push_to_hub: bool=False,
- **kwargs, ):
+ self,
+ save_directory: Union[str, os.PathLike],
+ push_to_hub: bool = False,
+ **kwargs,
+ ):
"""
Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the
[`~SchedulerMixin.from_pretrained`] class method.
@@ -159,8 +161,7 @@ def save_pretrained(
save_directory (`str` or `os.PathLike`):
Directory where the configuration JSON file will be saved (will be created if it does not exist).
"""
- self.save_config(
- save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+ self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
@property
def compatibles(self):
@@ -177,7 +178,6 @@ def _get_compatibles(cls):
compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
diffusers_library = importlib.import_module(__name__.split(".")[0])
compatible_classes = [
- getattr(diffusers_library, c) for c in compatible_classes_str
- if hasattr(diffusers_library, c)
+ getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
]
return compatible_classes
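Editor's note: the `SchedulerMixin.from_pretrained`/`save_pretrained` pair reformatted here is the config round-trip most pipelines rely on. A hedged usage sketch — the repo id and local path are placeholders, and any `SchedulerMixin` subclass exported by ppdiffusers should behave the same way:

```python
from ppdiffusers import PNDMScheduler

# Load the scheduler config shipped with a pretrained pipeline (placeholder repo id),
# save it locally, then reload it from disk.
scheduler = PNDMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
scheduler.save_pretrained("./my_scheduler")
scheduler = PNDMScheduler.from_pretrained("./my_scheduler")
```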
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py
index 71ee1bc4ad4e8..f9f3c34bba785 100644
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py
+++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py
@@ -69,8 +69,7 @@ def index_to_log_onehot(x: paddle.Tensor, num_classes: int) -> paddle.Tensor:
return log_x
-def gumbel_noised(logits: paddle.Tensor,
- generator: Optional[paddle.Generator]) -> paddle.Tensor:
+def gumbel_noised(logits: paddle.Tensor, generator: Optional[paddle.Generator]) -> paddle.Tensor:
"""
Apply gumbel noise to `logits`
"""
@@ -80,34 +79,32 @@ def gumbel_noised(logits: paddle.Tensor,
return noised
-def alpha_schedules(num_diffusion_timesteps: int,
- alpha_cum_start=0.99999,
- alpha_cum_end=0.000009):
+def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009):
"""
Cumulative and non-cumulative alpha schedules.
See section 4.1.
"""
- att = (np.arange(0, num_diffusion_timesteps) /
- (num_diffusion_timesteps - 1) *
- (alpha_cum_end - alpha_cum_start) + alpha_cum_start)
+ att = (
+ np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start)
+ + alpha_cum_start
+ )
att = np.concatenate(([1], att))
at = att[1:] / att[:-1]
att = np.concatenate((att[1:], [1]))
return at, att
-def gamma_schedules(num_diffusion_timesteps: int,
- gamma_cum_start=0.000009,
- gamma_cum_end=0.99999):
+def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999):
"""
Cumulative and non-cumulative gamma schedules.
See section 4.1.
"""
- ctt = (np.arange(0, num_diffusion_timesteps) /
- (num_diffusion_timesteps - 1) *
- (gamma_cum_end - gamma_cum_start) + gamma_cum_start)
+ ctt = (
+ np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start)
+ + gamma_cum_start
+ )
ctt = np.concatenate(([0], ctt))
one_minus_ctt = 1 - ctt
one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1]
@@ -155,13 +152,14 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- num_vec_classes: int,
- num_train_timesteps: int=100,
- alpha_cum_start: float=0.99999,
- alpha_cum_end: float=0.000009,
- gamma_cum_start: float=0.000009,
- gamma_cum_end: float=0.99999, ):
+ self,
+ num_vec_classes: int,
+ num_train_timesteps: int = 100,
+ alpha_cum_start: float = 0.99999,
+ alpha_cum_end: float = 0.000009,
+ gamma_cum_start: float = 0.000009,
+ gamma_cum_end: float = 0.99999,
+ ):
self.num_embed = num_vec_classes
# By convention, the index for the mask class is the last class index
@@ -170,11 +168,13 @@ def __init__(
at, att = alpha_schedules(
num_train_timesteps,
alpha_cum_start=alpha_cum_start,
- alpha_cum_end=alpha_cum_end, )
+ alpha_cum_end=alpha_cum_end,
+ )
ct, ctt = gamma_schedules(
num_train_timesteps,
gamma_cum_start=gamma_cum_start,
- gamma_cum_end=gamma_cum_end, )
+ gamma_cum_end=gamma_cum_end,
+ )
num_non_mask_classes = self.num_embed - 1
bt = (1 - at - ct) / num_non_mask_classes
@@ -203,8 +203,7 @@ def __init__(
# setable values
self.num_inference_steps = None
- self.timesteps = paddle.to_tensor(
- np.arange(0, num_train_timesteps)[::-1].copy())
+ self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy())
def set_timesteps(self, num_inference_steps: int):
"""
@@ -219,14 +218,13 @@ def set_timesteps(self, num_inference_steps: int):
self.timesteps = paddle.to_tensor(timesteps)
def step(
- self,
- model_output: paddle.Tensor,
- timestep: paddle.Tensor,
- sample: paddle.Tensor,
- generator: Optional[Union[paddle.Generator, List[
- paddle.Generator]]]=None,
- return_dict: bool=True, ) -> Union[VQDiffusionSchedulerOutput,
- Tuple]:
+ self,
+ model_output: paddle.Tensor,
+ timestep: paddle.Tensor,
+ sample: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ ) -> Union[VQDiffusionSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep via the reverse transition distribution i.e. Equation (11). See the
docstring for `self.q_posterior` for more in depth docs on how Equation (11) is computed.
@@ -263,7 +261,7 @@ def step(
x_t_min_1 = log_p_x_t_min_1.argmax(axis=1)
if not return_dict:
- return (x_t_min_1, )
+ return (x_t_min_1,)
return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1)
@@ -299,10 +297,12 @@ def q_posterior(self, log_p_x_0, x_t, t):
log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed)
log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class(
- t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True)
+ t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True
+ )
log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class(
- t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False)
+ t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False
+ )
# p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0)
# . . .
@@ -384,12 +384,9 @@ def q_posterior(self, log_p_x_0, x_t, t):
# The last row is trivially verified. The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities.
return log_p_x_t_min_1
- def log_Q_t_transitioning_to_known_class(self,
- *,
- t: paddle.Tensor,
- x_t: paddle.Tensor,
- log_onehot_x_t: paddle.Tensor,
- cumulative: bool):
+ def log_Q_t_transitioning_to_known_class(
+ self, *, t: paddle.Tensor, x_t: paddle.Tensor, log_onehot_x_t: paddle.Tensor, cumulative: bool
+ ):
"""
Returns the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each
latent pixel in `x_t`.
@@ -462,9 +459,7 @@ def log_Q_t_transitioning_to_known_class(self,
#
# `P(x_t=mask|x_{t-1=mask}) = 1` and 1 will be the value of the last row of the onehot vector
# if x_t is masked
- log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:,
- -1, :].unsqueeze(
- 1)
+ log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1)
# `index_to_log_onehot` will add onehot vectors for masked pixels,
# so the default one hot matrix has one too many rows. See the doc string
@@ -486,14 +481,12 @@ def log_Q_t_transitioning_to_known_class(self,
# The whole column of each masked pixel is `c`
mask_class_mask = x_t == self.mask_class
- mask_class_mask = mask_class_mask.unsqueeze(1).expand(
- [-1, self.num_embed - 1, -1])
+ mask_class_mask = mask_class_mask.unsqueeze(1).expand([-1, self.num_embed - 1, -1])
# log_Q_t[mask_class_mask] = c
log_Q_t = paddle.where(mask_class_mask, c, log_Q_t)
if not cumulative:
- log_Q_t = paddle.concat(
- (log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1)
+ log_Q_t = paddle.concat((log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1)
return log_Q_t
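Editor's note: the `alpha_schedules`/`gamma_schedules` helpers reformatted above linearly interpolate the cumulative keep/mask probabilities and then recover the per-step values as ratios of consecutive cumulative terms (VQ-Diffusion, section 4.1). A NumPy sketch of the alpha half, matching the expression in the diff:

```python
import numpy as np

def alpha_schedules(num_timesteps, alpha_cum_start=0.99999, alpha_cum_end=0.000009):
    # Linearly interpolate the cumulative keep-probability, then recover per-step alphas
    # from ratios of the cumulative product, as in the diff above.
    att = np.arange(0, num_timesteps) / (num_timesteps - 1) * (alpha_cum_end - alpha_cum_start) + alpha_cum_start
    att = np.concatenate(([1], att))
    at = att[1:] / att[:-1]
    att = np.concatenate((att[1:], [1]))
    return at, att

at, att = alpha_schedules(100)
```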
diff --git a/ppdiffusers/ppdiffusers/training_utils.py b/ppdiffusers/ppdiffusers/training_utils.py
index dba0703882e22..32a8251578ea7 100644
--- a/ppdiffusers/ppdiffusers/training_utils.py
+++ b/ppdiffusers/ppdiffusers/training_utils.py
@@ -67,17 +67,18 @@ class EMAModel:
"""
def __init__(
- self,
- parameters,
- decay: float=0.9999,
- min_decay: float=0.0,
- update_after_step: int=0,
- use_ema_warmup: bool=False,
- inv_gamma: Union[float, int]=1.0,
- power: Union[float, int]=2 / 3,
- model_cls: Optional[Any]=None,
- model_config: Dict[str, Any]=None,
- **kwargs, ):
+ self,
+ parameters,
+ decay: float = 0.9999,
+ min_decay: float = 0.0,
+ update_after_step: int = 0,
+ use_ema_warmup: bool = False,
+ inv_gamma: Union[float, int] = 1.0,
+ power: Union[float, int] = 2 / 3,
+ model_cls: Optional[Any] = None,
+ model_config: Dict[str, Any] = None,
+ **kwargs,
+ ):
"""
Args:
parameters (Iterable[nn.Parameter]): The parameters to track.
@@ -99,39 +100,35 @@ def __init__(
if isinstance(parameters, nn.Layer):
deprecation_message = (
"Passing a `nn.Layer` to `ExponentialMovingAverage` is deprecated. "
- "Please pass the parameters of the module instead.")
+ "Please pass the parameters of the module instead."
+ )
deprecate(
"passing a `nn.Layer` to `ExponentialMovingAverage`",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
parameters = parameters.parameters()
# set use_ema_warmup to True if a nn.Layer is passed for backwards compatibility
use_ema_warmup = True
if kwargs.get("max_value", None) is not None:
- deprecation_message = (
- "The `max_value` argument is deprecated. Please use `decay` instead."
- )
- deprecate(
- "max_value", "1.0.0", deprecation_message, standard_warn=False)
+ deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead."
+ deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False)
decay = kwargs["max_value"]
if kwargs.get("min_value", None) is not None:
deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead."
- deprecate(
- "min_value", "1.0.0", deprecation_message, standard_warn=False)
+ deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False)
min_decay = kwargs["min_value"]
parameters = list(parameters)
self.shadow_params = [p.clone().detach() for p in parameters]
if kwargs.get("device", None) is not None:
- deprecation_message = (
- "The `device` argument is deprecated. Please use `to` instead.")
- deprecate(
- "device", "1.0.0", deprecation_message, standard_warn=False)
+ deprecation_message = "The `device` argument is deprecated. Please use `to` instead."
+ deprecate("device", "1.0.0", deprecation_message, standard_warn=False)
self.to(device=kwargs["device"])
self.temp_stored_params = None
@@ -153,23 +150,17 @@ def from_pretrained(cls, path, model_cls) -> "EMAModel":
_, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True)
model = model_cls.from_pretrained(path)
- ema_model = cls(model.parameters(),
- model_cls=model_cls,
- model_config=model.config)
+ ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config)
ema_model.load_state_dict(ema_kwargs)
return ema_model
def save_pretrained(self, path):
if self.model_cls is None:
- raise ValueError(
- "`save_pretrained` can only be used if `model_cls` was defined at __init__."
- )
+ raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.")
if self.model_config is None:
- raise ValueError(
- "`save_pretrained` can only be used if `model_config` was defined at __init__."
- )
+ raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.")
model = self.model_cls.from_config(self.model_config)
state_dict = self.state_dict()
@@ -190,7 +181,7 @@ def get_decay(self, optimization_step: int) -> float:
return 0.0
if self.use_ema_warmup:
- cur_decay_value = 1 - (1 + step / self.inv_gamma)**-self.power
+ cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power
else:
cur_decay_value = (1 + step) / (10 + step)
@@ -204,12 +195,14 @@ def step(self, parameters):
if isinstance(parameters, nn.Layer):
deprecation_message = (
"Passing a `nn.Layer` to `ExponentialMovingAverage.step` is deprecated. "
- "Please pass the parameters of the module instead.")
+ "Please pass the parameters of the module instead."
+ )
deprecate(
"passing a `nn.Layer` to `ExponentialMovingAverage.step`",
"1.0.0",
deprecation_message,
- standard_warn=False, )
+ standard_warn=False,
+ )
parameters = parameters.parameters()
parameters = list(parameters)
@@ -223,8 +216,7 @@ def step(self, parameters):
for s_param, param in zip(self.shadow_params, parameters):
if not param.stop_gradient:
- s_param.copy_(s_param - one_minus_decay * (s_param - param),
- True)
+ s_param.copy_(s_param - one_minus_decay * (s_param - param), True)
else:
s_param.copy_(param, True)
@@ -267,9 +259,7 @@ def store(self, parameters) -> None:
parameters: Iterable of `nn.Parameter`; the parameters to be
temporarily stored.
"""
- self.temp_stored_params = [
- param.detach().cpu().clone() for param in parameters
- ]
+ self.temp_stored_params = [param.detach().cpu().clone() for param in parameters]
def restore(self, parameters) -> None:
r"""
@@ -282,9 +272,7 @@ def restore(self, parameters) -> None:
`ExponentialMovingAverage` was initialized will be used.
"""
if self.temp_stored_params is None:
- raise RuntimeError(
- "This ExponentialMovingAverage has no `store()`ed weights "
- "to `restore()`")
+ raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`")
for c_param, param in zip(self.temp_stored_params, parameters):
param.copy_(c_param, True)
@@ -310,18 +298,15 @@ def load_state_dict(self, state_dict: dict) -> None:
if not isinstance(self.min_decay, float):
raise ValueError("Invalid min_decay")
- self.optimization_step = state_dict.get("optimization_step",
- self.optimization_step)
+ self.optimization_step = state_dict.get("optimization_step", self.optimization_step)
if not isinstance(self.optimization_step, int):
raise ValueError("Invalid optimization_step")
- self.update_after_step = state_dict.get("update_after_step",
- self.update_after_step)
+ self.update_after_step = state_dict.get("update_after_step", self.update_after_step)
if not isinstance(self.update_after_step, int):
raise ValueError("Invalid update_after_step")
- self.use_ema_warmup = state_dict.get("use_ema_warmup",
- self.use_ema_warmup)
+ self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup)
if not isinstance(self.use_ema_warmup, bool):
raise ValueError("Invalid use_ema_warmup")
@@ -338,8 +323,7 @@ def load_state_dict(self, state_dict: dict) -> None:
self.shadow_params = shadow_params
if not isinstance(self.shadow_params, list):
raise ValueError("shadow_params must be a list")
- if not all(
- isinstance(p, paddle.Tensor) for p in self.shadow_params):
+ if not all(isinstance(p, paddle.Tensor) for p in self.shadow_params):
raise ValueError("shadow_params must all be Tensors")
@@ -353,17 +337,13 @@ def main_process_first(desc="work"):
try:
if not is_main_process:
# tell all replicas to wait
- logger.debug(
- f"{rank}: waiting for the {main_process_desc} to perform {desc}"
- )
+ logger.debug(f"{rank}: waiting for the {main_process_desc} to perform {desc}")
paddle.distributed.barrier()
yield
finally:
if is_main_process:
# the wait is over
- logger.debug(
- f"{rank}: {main_process_desc} completed {desc}, releasing all replicas"
- )
+ logger.debug(f"{rank}: {main_process_desc} completed {desc}, releasing all replicas")
paddle.distributed.barrier()
else:
yield
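For reference, the decay schedule that `EMAModel.get_decay` computes in the hunk above can be reproduced standalone. A minimal sketch, assuming the same attribute names as the class and reproducing only the two branches shown:

# Standalone sketch of the EMA decay schedule from EMAModel.get_decay above.
def ema_decay(step: int, use_ema_warmup: bool = False,
              inv_gamma: float = 1.0, power: float = 2 / 3) -> float:
    if step <= 0:
        return 0.0
    if use_ema_warmup:
        # warmup schedule: decay grows smoothly toward 1 as step increases
        return 1 - (1 + step / inv_gamma) ** -power
    # simple ramp used when warmup is disabled
    return (1 + step) / (10 + step)

print(ema_decay(1000, use_ema_warmup=True))  # ~0.990

The shadow weights are then pulled toward the live parameters with `s_param.copy_(s_param - one_minus_decay * (s_param - param), True)`, exactly as in the `step` hunk above.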
diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py
index 93a62dd290d7b..4b5b8ba7e4234 100644
--- a/ppdiffusers/ppdiffusers/utils/__init__.py
+++ b/ppdiffusers/ppdiffusers/utils/__init__.py
@@ -20,33 +20,78 @@
from ..version import VERSION as __version__
from . import initializer_utils
from .constants import (
- CONFIG_NAME, DEPRECATED_REVISION_ARGS, DIFFUSERS_CACHE, DOWNLOAD_SERVER,
- FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME, FLAX_WEIGHTS_NAME,
- FROM_DIFFUSERS, FROM_HF_HUB, HF_MODULES_CACHE,
- HUGGINGFACE_CO_RESOLVE_ENDPOINT, LOW_CPU_MEM_USAGE_DEFAULT, NEG_INF,
- ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PADDLE_WEIGHTS_NAME,
- PPDIFFUSERS_CACHE, PPDIFFUSERS_DYNAMIC_MODULE_NAME,
- PPDIFFUSERS_MODULES_CACHE, PPNLP_BOS_RESOLVE_ENDPOINT, TEST_DOWNLOAD_SERVER,
- TEXT_ENCODER_ATTN_MODULE, TO_DIFFUSERS, TORCH_SAFETENSORS_WEIGHTS_NAME,
- TORCH_WEIGHTS_NAME, WEIGHTS_NAME, get_map_location_default, str2bool)
+ CONFIG_NAME,
+ DEPRECATED_REVISION_ARGS,
+ DIFFUSERS_CACHE,
+ DOWNLOAD_SERVER,
+ FASTDEPLOY_MODEL_NAME,
+ FASTDEPLOY_WEIGHTS_NAME,
+ FLAX_WEIGHTS_NAME,
+ FROM_DIFFUSERS,
+ FROM_HF_HUB,
+ HF_MODULES_CACHE,
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+ LOW_CPU_MEM_USAGE_DEFAULT,
+ NEG_INF,
+ ONNX_EXTERNAL_WEIGHTS_NAME,
+ ONNX_WEIGHTS_NAME,
+ PADDLE_WEIGHTS_NAME,
+ PPDIFFUSERS_CACHE,
+ PPDIFFUSERS_DYNAMIC_MODULE_NAME,
+ PPDIFFUSERS_MODULES_CACHE,
+ PPNLP_BOS_RESOLVE_ENDPOINT,
+ TEST_DOWNLOAD_SERVER,
+ TEXT_ENCODER_ATTN_MODULE,
+ TO_DIFFUSERS,
+ TORCH_SAFETENSORS_WEIGHTS_NAME,
+ TORCH_WEIGHTS_NAME,
+ WEIGHTS_NAME,
+ get_map_location_default,
+ str2bool,
+)
from .deprecation_utils import deprecate
from .doc_utils import replace_example_docstring
-from .download_utils import (_add_variant, _get_model_file, bos_hf_download,
- ppdiffusers_bos_dir_download,
- ppdiffusers_url_download)
+from .download_utils import (
+ _add_variant,
+ _get_model_file,
+ bos_hf_download,
+ ppdiffusers_bos_dir_download,
+ ppdiffusers_url_download,
+)
from .dynamic_modules_utils import get_class_from_dynamic_module
from .hub_utils import HF_HUB_OFFLINE, extract_commit_hash, http_user_agent
from .import_utils import (
- BACKENDS_MAPPING, ENV_VARS_TRUE_AND_AUTO_VALUES, ENV_VARS_TRUE_VALUES,
- DummyObject, OptionalDependencyNotAvailable, is_bs4_available,
- is_einops_available, is_fastdeploy_available, is_ftfy_available,
- is_inflect_available, is_k_diffusion_available, is_k_diffusion_version,
- is_librosa_available, is_note_seq_available, is_omegaconf_available,
- is_paddle_available, is_paddle_version, is_paddlenlp_available,
- is_paddlenlp_version, is_ppxformers_available, is_safetensors_available,
- is_scipy_available, is_tensorboard_available, is_torch_available,
- is_torch_version, is_unidecode_available, is_visualdl_available,
- is_wandb_available, requires_backends)
+ BACKENDS_MAPPING,
+ ENV_VARS_TRUE_AND_AUTO_VALUES,
+ ENV_VARS_TRUE_VALUES,
+ DummyObject,
+ OptionalDependencyNotAvailable,
+ is_bs4_available,
+ is_einops_available,
+ is_fastdeploy_available,
+ is_ftfy_available,
+ is_inflect_available,
+ is_k_diffusion_available,
+ is_k_diffusion_version,
+ is_librosa_available,
+ is_note_seq_available,
+ is_omegaconf_available,
+ is_paddle_available,
+ is_paddle_version,
+ is_paddlenlp_available,
+ is_paddlenlp_version,
+ is_ppxformers_available,
+ is_safetensors_available,
+ is_scipy_available,
+ is_tensorboard_available,
+ is_torch_available,
+ is_torch_version,
+ is_unidecode_available,
+ is_visualdl_available,
+ is_wandb_available,
+ requires_backends,
+)
+
# custom load_utils
from .load_utils import is_torch_file, safetensors_load, smart_load, torch_load
from .logging import get_logger
@@ -56,9 +101,21 @@
if is_paddle_available():
from .testing_utils import (
- floats_tensor, image_grid, load_hf_numpy, load_image, load_numpy,
- load_pd, load_ppnlp_numpy, nightly, paddle_all_close, paddle_device,
- parse_flag_from_env, print_tensor_test, require_paddle_gpu, slow)
+ floats_tensor,
+ image_grid,
+ load_hf_numpy,
+ load_image,
+ load_numpy,
+ load_pd,
+ load_ppnlp_numpy,
+ nightly,
+ paddle_all_close,
+ paddle_device,
+ parse_flag_from_env,
+ print_tensor_test,
+ require_paddle_gpu,
+ slow,
+ )
if is_torch_available():
from .testing_utils import require_torch
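The trailing block only pulls in the paddle-specific testing helpers when the backend is present. The guard itself reduces to a `find_spec` check, as seen in import_utils.py further down; a minimal sketch of the pattern (the helper name `_has` is illustrative):

# Sketch of the availability-guard pattern behind these conditional imports.
import importlib.util

def _has(package: str) -> bool:
    # True when the package can be located without importing it
    return importlib.util.find_spec(package) is not None

if _has("paddle"):
    import paddle  # noqa: F401  # only imported when the backend exists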
diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py
index 2a112f725dc0c..2e51e9e559395 100644
--- a/ppdiffusers/ppdiffusers/utils/constants.py
+++ b/ppdiffusers/ppdiffusers/utils/constants.py
@@ -31,9 +31,8 @@ def str2bool(v):
ppnlp_cache_home = os.path.expanduser(
- os.getenv("PPNLP_HOME",
- os.path.join(
- os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp")))
+ os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp"))
+)
ppdiffusers_default_cache_path = os.path.join(ppnlp_cache_home, "ppdiffusers")
# diffusers_default_cache_path = os.path.join(HUGGINGFACE_HUB_CACHE, "diffusers")
@@ -51,25 +50,20 @@ def str2bool(v):
DIFFUSERS_CACHE = diffusers_default_cache_path
DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
PPDIFFUSERS_DYNAMIC_MODULE_NAME = "ppdiffusers_modules"
-HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE",
- os.path.join(hf_cache_home, "modules"))
-PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE",
- os.path.join(ppnlp_cache_home, "modules"))
+HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
+PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE", os.path.join(ppnlp_cache_home, "modules"))
PADDLE_WEIGHTS_NAME = "model_state.pdparams"
FASTDEPLOY_WEIGHTS_NAME = "inference.pdiparams"
FASTDEPLOY_MODEL_NAME = "inference.pdmodel"
WEIGHTS_NAME = PADDLE_WEIGHTS_NAME
-TEST_DOWNLOAD_SERVER = (
- "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests")
+TEST_DOWNLOAD_SERVER = "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests"
DOWNLOAD_SERVER = "https://bj.bcebos.com/paddlenlp/models/community"
-PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT",
- "https://bj.bcebos.com/paddlenlp")
+PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp")
DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
TEXT_ENCODER_ATTN_MODULE = ".self_attn"
-LOW_CPU_MEM_USAGE_DEFAULT = str2bool(
- os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False))
+LOW_CPU_MEM_USAGE_DEFAULT = str2bool(os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False))
NEG_INF = -1e4
@@ -87,5 +81,4 @@ def str2bool(v):
def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
print(x.tolist())
print(y.tolist())
- return raw_all_close(
- x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name)
+ return raw_all_close(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name)
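With the wrapping removed, the cache resolution above is easier to read: `PPNLP_HOME` wins when set, otherwise the path is built under `XDG_CACHE_HOME` (defaulting to `~/.cache`). A small standalone sketch of the same resolution:

# Sketch of how the cache roots above resolve (actual values depend on the environment).
import os

ppnlp_cache_home = os.path.expanduser(
    os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp"))
)
ppdiffusers_cache = os.path.join(ppnlp_cache_home, "ppdiffusers")
print(ppdiffusers_cache)  # typically ~/.cache/paddlenlp/ppdiffusers with no overrides set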
diff --git a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py
index 8207e2c77d07f..010f89e11386e 100644
--- a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py
@@ -21,39 +21,38 @@
def deprecate(
- *args,
- take_from: Optional[Union[Dict, Any]]=None,
- standard_warn=True,
- stacklevel=2, ):
+ *args,
+ take_from: Optional[Union[Dict, Any]] = None,
+ standard_warn=True,
+ stacklevel=2,
+):
from ..version import VERSION as __version__
deprecated_kwargs = take_from
values = ()
if not isinstance(args[0], tuple):
- args = (args, )
+ args = (args,)
for attribute, version_name, message in args:
- if version.parse(version.parse(__version__)
- .base_version) >= version.parse(version_name):
+ if version.parse(version.parse(__version__).base_version) >= version.parse(version_name):
raise ValueError(
f"The deprecation tuple {(attribute, version_name, message)} should be removed since ppdiffusers'"
- f" version {__version__} is >= {version_name}")
+ f" version {__version__} is >= {version_name}"
+ )
warning = None
- if isinstance(deprecated_kwargs,
- dict) and attribute in deprecated_kwargs:
- values += (deprecated_kwargs.pop(attribute), )
+ if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs:
+ values += (deprecated_kwargs.pop(attribute),)
warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}."
elif hasattr(deprecated_kwargs, attribute):
- values += (getattr(deprecated_kwargs, attribute), )
+ values += (getattr(deprecated_kwargs, attribute),)
warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}."
elif deprecated_kwargs is None:
warning = f"`{attribute}` is deprecated and will be removed in version {version_name}."
if warning is not None:
warning = warning + " " if standard_warn else ""
- warnings.warn(
- warning + message, FutureWarning, stacklevel=stacklevel)
+ warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel)
if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0:
call_frame = inspect.getouterframes(inspect.currentframe())[1]
@@ -61,9 +60,7 @@ def deprecate(
line_number = call_frame.lineno
function = call_frame.function
key, value = next(iter(deprecated_kwargs.items()))
- raise TypeError(
- f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`"
- )
+ raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`")
if len(values) == 0:
return
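The `deprecate` helper above accepts one or more `(attribute, version, message)` tuples, warns with `FutureWarning`, and raises once the installed version reaches the removal version. A usage sketch mirroring the `EMAModel` call sites earlier in this patch; `my_function` and `old_kwarg`/`new_kwarg` are made-up names, and the import assumes ppdiffusers is installed:

# Illustrative call pattern for `deprecate`.
from ppdiffusers.utils import deprecate

def my_function(**kwargs):
    if kwargs.get("old_kwarg") is not None:
        message = "The `old_kwarg` argument is deprecated. Please use `new_kwarg` instead."
        # emits a FutureWarning now; raises once the package version reaches 1.0.0
        deprecate("old_kwarg", "1.0.0", message, standard_warn=False)

my_function(old_kwarg=1)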
diff --git a/ppdiffusers/ppdiffusers/utils/doc_utils.py b/ppdiffusers/ppdiffusers/utils/doc_utils.py
index c8b3fe1ab24bc..01188c98e9152 100644
--- a/ppdiffusers/ppdiffusers/utils/doc_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/doc_utils.py
@@ -23,8 +23,7 @@ def docstring_decorator(fn):
func_doc = fn.__doc__
lines = func_doc.split("\n")
i = 0
- while i < len(lines) and re.search(r"^\s*Examples?:\s*$",
- lines[i]) is None:
+ while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None:
i += 1
if i < len(lines):
lines[i] = example_docstring
@@ -32,7 +31,8 @@ def docstring_decorator(fn):
else:
raise ValueError(
f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, "
- f"current docstring is:\n{func_doc}")
+ f"current docstring is:\n{func_doc}"
+ )
fn.__doc__ = func_doc
return fn
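`replace_example_docstring` scans the wrapped function's docstring for a bare `Examples:` line and substitutes the supplied example block, raising if the placeholder is missing. A usage sketch, assuming ppdiffusers is importable; `EXAMPLE_DOC_STRING` and `run_pipeline` are illustrative names:

# Sketch of the decorator contract: the wrapped docstring must carry an empty
# "Examples:" line, which is replaced by the supplied example block.
from ppdiffusers.utils import replace_example_docstring

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> print("hello")
        ```
"""

@replace_example_docstring(EXAMPLE_DOC_STRING)
def run_pipeline():
    """Runs the pipeline.

    Examples:
    """

print(run_pipeline.__doc__)  # the placeholder line is now the example block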
diff --git a/ppdiffusers/ppdiffusers/utils/download_utils.py b/ppdiffusers/ppdiffusers/utils/download_utils.py
index a65ba335b0282..2ef31e8ba396b 100644
--- a/ppdiffusers/ppdiffusers/utils/download_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/download_utils.py
@@ -28,8 +28,11 @@
from filelock import FileLock
from huggingface_hub import hf_hub_download
from huggingface_hub.file_download import _chmod_and_replace, http_get
-from huggingface_hub.utils import (EntryNotFoundError, RepositoryNotFoundError,
- RevisionNotFoundError)
+from huggingface_hub.utils import (
+ EntryNotFoundError,
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+)
from huggingface_hub.utils import tqdm as hf_tqdm
from packaging import version
from requests import HTTPError
@@ -37,14 +40,18 @@
from tqdm.contrib.concurrent import thread_map
from ..version import VERSION as __version__
-from .constants import (DEPRECATED_REVISION_ARGS,
- HUGGINGFACE_CO_RESOLVE_ENDPOINT, PPDIFFUSERS_CACHE,
- PPNLP_BOS_RESOLVE_ENDPOINT,
- TORCH_SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME)
+from .constants import (
+ DEPRECATED_REVISION_ARGS,
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+ PPDIFFUSERS_CACHE,
+ PPNLP_BOS_RESOLVE_ENDPOINT,
+ TORCH_SAFETENSORS_WEIGHTS_NAME,
+ WEIGHTS_NAME,
+)
from .logging import get_logger
-def _add_variant(weights_name: str, variant: Optional[str]=None) -> str:
+def _add_variant(weights_name: str, variant: Optional[str] = None) -> str:
if variant is not None:
splits = weights_name.split(".")
splits = splits[:-1] + [variant] + splits[-1:]
@@ -55,36 +62,34 @@ def _add_variant(weights_name: str, variant: Optional[str]=None) -> str:
# https://github.com/huggingface/diffusers/blob/da2ce1a6b92f48cabe9e9d3944c4ee8b007b2871/src/diffusers/utils/hub_utils.py#L246
def _get_model_file(
- pretrained_model_name_or_path,
- *,
- weights_name,
- subfolder,
- cache_dir,
- force_download=False,
- revision=None,
- proxies=None,
- resume_download=False,
- local_files_only=None,
- use_auth_token=None,
- user_agent=None,
- commit_hash=None,
- file_lock_timeout=-1,
- from_hf_hub=False, ):
+ pretrained_model_name_or_path,
+ *,
+ weights_name,
+ subfolder,
+ cache_dir,
+ force_download=False,
+ revision=None,
+ proxies=None,
+ resume_download=False,
+ local_files_only=None,
+ use_auth_token=None,
+ user_agent=None,
+ commit_hash=None,
+ file_lock_timeout=-1,
+ from_hf_hub=False,
+):
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if os.path.isfile(pretrained_model_name_or_path):
return pretrained_model_name_or_path
elif os.path.isdir(pretrained_model_name_or_path):
- if os.path.isfile(
- os.path.join(pretrained_model_name_or_path, weights_name)):
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
# Load from a PyTorch checkpoint
- model_file = os.path.join(pretrained_model_name_or_path,
- weights_name)
+ model_file = os.path.join(pretrained_model_name_or_path, weights_name)
return model_file
elif subfolder is not None and os.path.isfile(
- os.path.join(pretrained_model_name_or_path, subfolder,
- weights_name)):
- model_file = os.path.join(pretrained_model_name_or_path, subfolder,
- weights_name)
+ os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+ ):
+ model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
return model_file
else:
raise EnvironmentError(
@@ -105,19 +110,20 @@ def _get_model_file(
use_auth_token=use_auth_token,
user_agent=user_agent,
file_lock_timeout=file_lock_timeout,
- commit_hash=commit_hash, )
+ commit_hash=commit_hash,
+ )
REPO_TYPES = ["model"]
DEFAULT_REVISION = "main"
# REPO_ID_SEPARATOR = "--"
REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
-PPDIFFUSERS_BOS_URL_TEMPLATE = (
- PPNLP_BOS_RESOLVE_ENDPOINT +
- "/{repo_type}/community/{repo_id}/{revision}/{filename}")
+PPDIFFUSERS_BOS_URL_TEMPLATE = PPNLP_BOS_RESOLVE_ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}"
ALLOW_PATTERNS_MAPPING = {
- "scheduler": ["scheduler_config.json", ],
+ "scheduler": [
+ "scheduler_config.json",
+ ],
"text_encoder": [
"model_state.pdparams",
"config.json",
@@ -190,12 +196,13 @@ def _get_model_file(
def ppdiffusers_bos_url(
- repo_id: str,
- filename: str,
- *,
- subfolder: Optional[str]=None,
- repo_type: Optional[str]=None,
- revision: Optional[str]=None, ) -> str:
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+) -> str:
if subfolder == "":
subfolder = None
if subfolder is not None:
@@ -212,9 +219,9 @@ def ppdiffusers_bos_url(
return PPDIFFUSERS_BOS_URL_TEMPLATE.format(
repo_type=repo_type,
repo_id=repo_id,
- revision=quote(
- revision, safe=""),
- filename=quote(filename), ).replace(f"/{DEFAULT_REVISION}/", "/")
+ revision=quote(revision, safe=""),
+ filename=quote(filename),
+ ).replace(f"/{DEFAULT_REVISION}/", "/")
def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
@@ -229,16 +236,17 @@ def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
def ppdiffusers_bos_download(
- repo_id: str,
- filename: str,
- *,
- subfolder: Optional[str]=None,
- repo_type: Optional[str]=None,
- revision: Optional[str]=None,
- cache_dir: Union[str, Path, None]=None,
- force_download: bool=False,
- resume_download: bool=False,
- file_lock_timeout: int=-1, ):
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ file_lock_timeout: int = -1,
+):
if cache_dir is None:
cache_dir = PPDIFFUSERS_CACHE
if revision is None:
@@ -256,12 +264,8 @@ def ppdiffusers_bos_download(
repo_type = REPO_TYPES[0]
if repo_type not in REPO_TYPES:
- raise ValueError(
- f"Invalid repo type: {repo_type}. Accepted repo types are:"
- f" {str(REPO_TYPES)}")
- storage_folder = os.path.join(
- cache_dir, repo_folder_name(
- repo_id=repo_id, repo_type=repo_type))
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are:" f" {str(REPO_TYPES)}")
+ storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
os.makedirs(storage_folder, exist_ok=True)
# cross platform transcription of filename, to be used as a local file path.
@@ -275,8 +279,7 @@ def ppdiffusers_bos_download(
if os.path.exists(pointer_path) and not force_download:
return pointer_path
- url_to_download = ppdiffusers_bos_url(
- repo_id, filename, repo_type=repo_type, revision=revision)
+ url_to_download = ppdiffusers_bos_url(repo_id, filename, repo_type=repo_type, revision=revision)
blob_path = os.path.join(storage_folder, filename)
# Prevent parallel downloads of the same file with a lock.
@@ -312,10 +315,8 @@ def _resumable_file_manager():
resume_size = 0
else:
temp_file_manager = partial( # type: ignore
- tempfile.NamedTemporaryFile,
- mode="wb",
- dir=cache_dir,
- delete=False)
+ tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
+ )
resume_size = 0
# Download to temporary file, then copy to cache dir once finished.
@@ -328,7 +329,8 @@ def _resumable_file_manager():
temp_file,
proxies=None,
resume_size=resume_size,
- headers=None, )
+ headers=None,
+ )
logger.info("storing %s in cache at %s", url_to_download, blob_path)
_chmod_and_replace(temp_file.name, blob_path)
@@ -341,12 +343,13 @@ def _resumable_file_manager():
def ppdiffusers_url_download(
- url_to_download: str,
- cache_dir: Union[str, Path, None]=None,
- filename: Optional[str]=None,
- force_download: bool=False,
- resume_download: bool=False,
- file_lock_timeout: int=-1, ):
+ url_to_download: str,
+ cache_dir: Union[str, Path, None] = None,
+ filename: Optional[str] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ file_lock_timeout: int = -1,
+):
if cache_dir is None:
cache_dir = PPDIFFUSERS_CACHE
if isinstance(cache_dir, Path):
@@ -386,10 +389,8 @@ def _resumable_file_manager():
resume_size = 0
else:
temp_file_manager = partial( # type: ignore
- tempfile.NamedTemporaryFile,
- mode="wb",
- dir=cache_dir,
- delete=False)
+ tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
+ )
resume_size = 0
# Download to temporary file, then copy to cache dir once finished.
@@ -402,7 +403,8 @@ def _resumable_file_manager():
temp_file,
proxies=None,
resume_size=resume_size,
- headers=None, )
+ headers=None,
+ )
logger.info("storing %s in cache at %s", url_to_download, file_path)
_chmod_and_replace(temp_file.name, file_path)
@@ -414,28 +416,29 @@ def _resumable_file_manager():
def bos_hf_download(
- pretrained_model_name_or_path,
- *,
- filename,
- subfolder,
- cache_dir,
- force_download=False,
- revision=None,
- from_hf_hub=False,
- proxies=None,
- resume_download=False,
- local_files_only=None,
- use_auth_token=None,
- user_agent=None,
- file_lock_timeout=-1,
- commit_hash=None, ):
+ pretrained_model_name_or_path,
+ *,
+ filename,
+ subfolder,
+ cache_dir,
+ force_download=False,
+ revision=None,
+ from_hf_hub=False,
+ proxies=None,
+ resume_download=False,
+ local_files_only=None,
+ use_auth_token=None,
+ user_agent=None,
+ file_lock_timeout=-1,
+ commit_hash=None,
+):
if from_hf_hub:
# 1. First check if deprecated way of loading from branches is used
- if (revision in DEPRECATED_REVISION_ARGS and
- (filename == WEIGHTS_NAME or
- filename == TORCH_SAFETENSORS_WEIGHTS_NAME) and
- version.parse(version.parse(__version__).base_version) >=
- version.parse("0.17.0")):
+ if (
+ revision in DEPRECATED_REVISION_ARGS
+ and (filename == WEIGHTS_NAME or filename == TORCH_SAFETENSORS_WEIGHTS_NAME)
+ and version.parse(version.parse(__version__).base_version) >= version.parse("0.17.0")
+ ):
try:
model_file = hf_hub_download(
pretrained_model_name_or_path,
@@ -448,15 +451,18 @@ def bos_hf_download(
use_auth_token=use_auth_token,
user_agent=user_agent,
subfolder=subfolder,
- revision=revision or commit_hash, )
+ revision=revision or commit_hash,
+ )
warnings.warn(
f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.",
- FutureWarning, )
+ FutureWarning,
+ )
return model_file
except: # noqa: E722
warnings.warn(
f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(filename, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(filename, revision)}' so that the correct variant file can be added.",
- FutureWarning, )
+ FutureWarning,
+ )
# 2. Load model file as usual
try:
model_file = hf_hub_download(
@@ -470,7 +476,8 @@ def bos_hf_download(
use_auth_token=use_auth_token,
user_agent=user_agent,
subfolder=subfolder,
- revision=revision, )
+ revision=revision,
+ )
return model_file
except RepositoryNotFoundError:
@@ -478,7 +485,8 @@ def bos_hf_download(
f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
"listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
"token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
- "login`.")
+ "login`."
+ )
except RevisionNotFoundError:
raise EnvironmentError(
f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
@@ -486,9 +494,7 @@ def bos_hf_download(
f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
)
except EntryNotFoundError:
- raise EnvironmentError(
- f"{pretrained_model_name_or_path} does not appear to have a file named {filename}."
- )
+ raise EnvironmentError(f"{pretrained_model_name_or_path} does not appear to have a file named {filename}.")
except HTTPError as err:
raise EnvironmentError(
f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}"
@@ -506,7 +512,8 @@ def bos_hf_download(
f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
"'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
- f"containing a file named {filename}")
+ f"containing a file named {filename}"
+ )
except KeyboardInterrupt:
raise EnvironmentError(
"You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!"
@@ -521,7 +528,8 @@ def bos_hf_download(
resume_download=resume_download,
subfolder=subfolder,
revision=revision,
- file_lock_timeout=file_lock_timeout, )
+ file_lock_timeout=file_lock_timeout,
+ )
return model_file
except HTTPError as err:
raise EnvironmentError(
@@ -529,13 +537,15 @@ def bos_hf_download(
f"There was a specific connection error when trying to load '{pretrained_model_name_or_path}'! "
f"We couldn't connect to '{PPNLP_BOS_RESOLVE_ENDPOINT}' to load this model, couldn't find it "
f"in the cached files and it looks like '{pretrained_model_name_or_path}' is not the path to a "
- f"directory containing a file named '{filename}'.")
+ f"directory containing a file named '{filename}'."
+ )
except EnvironmentError:
raise EnvironmentError(
f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
f"'{PPNLP_BOS_RESOLVE_ENDPOINT}', make sure you don't have a local directory with the same name. "
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
- f"containing a file named '{filename}'")
+ f"containing a file named '{filename}'"
+ )
except KeyboardInterrupt:
raise EnvironmentError(
"You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!"
@@ -562,20 +572,21 @@ def url_file_exists(url: str) -> bool:
def ppdiffusers_bos_dir_download(
- repo_id: str,
- *,
- revision: Optional[str]=None,
- repo_type: Optional[str]=None,
- cache_dir: Union[str, Path, None]=None,
- force_download: bool=False,
- resume_download: bool=False,
- folder_names: Optional[Union[List[str], str]]=None,
- max_workers: int=1,
- tqdm_class: Optional[base_tqdm]=None,
- variant: Optional[str]=None,
- is_fastdeploy_model: Optional[str]=False,
- file_lock_timeout: int=-1,
- local_files_only: bool=False, ) -> str:
+ repo_id: str,
+ *,
+ revision: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ folder_names: Optional[Union[List[str], str]] = None,
+ max_workers: int = 1,
+ tqdm_class: Optional[base_tqdm] = None,
+ variant: Optional[str] = None,
+ is_fastdeploy_model: Optional[str] = False,
+ file_lock_timeout: int = -1,
+ local_files_only: bool = False,
+) -> str:
# update repo id must end with @fastdeploy
if is_fastdeploy_model and not repo_id.endswith("@fastdeploy"):
repo_id = f"{repo_id}@fastdeploy"
@@ -585,12 +596,9 @@ def ppdiffusers_bos_dir_download(
filtered_repo_files = [["model_index.json", None]]
for subfolder in folder_names:
- allow_patterns = ALLOW_PATTERNS_MAPPING.get(
- subfolder, ALLOW_PATTERNS_MAPPING["others"])
+ allow_patterns = ALLOW_PATTERNS_MAPPING.get(subfolder, ALLOW_PATTERNS_MAPPING["others"])
if is_fastdeploy_model:
- allow_patterns = [
- ap for ap in allow_patterns if "pdparams" not in ap
- ]
+ allow_patterns = [ap for ap in allow_patterns if "pdparams" not in ap]
allow_patterns.extend(["inference.pdiparams", "inference.pdmodel"])
for filename in allow_patterns:
need_to_check_no_variant_file = False
@@ -602,25 +610,31 @@ def ppdiffusers_bos_dir_download(
url = ppdiffusers_bos_url(
repo_id,
filename=filename,
- subfolder=subfolder, )
+ subfolder=subfolder,
+ )
if url_file_exists(url):
# exist file
- filtered_repo_files.append([
- filename,
- subfolder,
- ])
+ filtered_repo_files.append(
+ [
+ filename,
+ subfolder,
+ ]
+ )
else:
if need_to_check_no_variant_file:
url = ppdiffusers_bos_url(
repo_id,
filename=raw_filename,
- subfolder=subfolder, )
+ subfolder=subfolder,
+ )
if url_file_exists(url):
# exist file
- filtered_repo_files.append([
- raw_filename,
- subfolder,
- ])
+ filtered_repo_files.append(
+ [
+ raw_filename,
+ subfolder,
+ ]
+ )
def _inner_ppdiffusers_bos_download(repo_file_list):
filename, _subfolder = repo_file_list
@@ -633,7 +647,8 @@ def _inner_ppdiffusers_bos_download(repo_file_list):
revision=revision,
resume_download=resume_download,
force_download=force_download,
- file_lock_timeout=file_lock_timeout, )
+ file_lock_timeout=file_lock_timeout,
+ )
thread_map(
_inner_ppdiffusers_bos_download,
@@ -641,5 +656,6 @@ def _inner_ppdiffusers_bos_download(repo_file_list):
desc=f"Fetching {len(filtered_repo_files)} files",
max_workers=max_workers,
# User can use its own tqdm class or the default one from `huggingface_hub.utils`
- tqdm_class=tqdm_class or hf_tqdm, )
+ tqdm_class=tqdm_class or hf_tqdm,
+ )
return os.path.join(cache_dir, repo_id)
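Among the reformatted helpers, `_add_variant` simply splices the variant in front of the file extension. A standalone sketch of that behavior, renamed here for illustration:

# Behavior of `_add_variant` above: the variant lands just before the extension.
from typing import Optional

def add_variant(weights_name: str, variant: Optional[str] = None) -> str:
    if variant is not None:
        splits = weights_name.split(".")
        splits = splits[:-1] + [variant] + splits[-1:]
        weights_name = ".".join(splits)
    return weights_name

assert add_variant("model_state.pdparams", "fp16") == "model_state.fp16.pdparams"
assert add_variant("model_state.pdparams") == "model_state.pdparams"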
diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py
index cca1dbd1d7d0d..fcbc659ea253c 100644
--- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py
+++ b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py
@@ -225,8 +225,7 @@ def get_cosine_schedule_with_warmup(*args, **kwargs):
def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs):
- requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup,
- ["paddle"])
+ requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["paddle"])
def get_linear_schedule_with_warmup(*args, **kwargs):
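Every symbol in dummy_paddle_objects.py follows the stub pattern visible in this hunk: the body is a single `requires_backends` call, so importing the name succeeds but using it without paddle fails with a descriptive error. A sketch of that pattern, assuming ppdiffusers is importable:

# Sketch of the dummy-stub pattern: calling the stub raises an ImportError that
# names the missing backend instead of failing later with an obscure error.
from ppdiffusers.utils import requires_backends

def get_linear_schedule_with_warmup(*args, **kwargs):
    # no-op when paddle is installed; raises a descriptive ImportError otherwise
    requires_backends(get_linear_schedule_with_warmup, ["paddle"])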
diff --git a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py
index c1da547b98964..574a504c2d775 100644
--- a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py
@@ -26,14 +26,16 @@
from typing import Dict, Optional, Union
from urllib import request
-from huggingface_hub import (HfFolder, cached_download, hf_hub_download,
- model_info)
+from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info
-from . import (PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE,
- logging)
+from . import PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE, logging
-COMMUNITY_PIPELINES_URL = "https://raw.githubusercontent.com/PaddlePaddle/PaddleMIX/{revision}/ppdiffusers/examples/community/{pipeline}.py"
-GITEE_COMMUNITY_PIPELINES_URL = "https://gitee.com/paddlepaddle/PaddleMIX/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py"
+COMMUNITY_PIPELINES_URL = (
+ "https://raw.githubusercontent.com/PaddlePaddle/PaddleMIX/{revision}/ppdiffusers/examples/community/{pipeline}.py"
+)
+GITEE_COMMUNITY_PIPELINES_URL = (
+ "https://gitee.com/paddlepaddle/PaddleMIX/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py"
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -87,11 +89,9 @@ def get_relative_imports(module_file):
content = f.read()
# Imports of the form `import .xxx`
- relative_imports = re.findall(
- "^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
+ relative_imports = re.findall("^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
# Imports of the form `from .xxx import yyy`
- relative_imports += re.findall(
- "^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
+ relative_imports += re.findall("^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
# Unique-ify
return list(set(relative_imports))
@@ -116,9 +116,7 @@ def get_relative_import_files(module_file):
module_path = Path(module_file).parent
new_import_files = [str(module_path / m) for m in new_imports]
- new_import_files = [
- f for f in new_import_files if f not in all_relative_imports
- ]
+ new_import_files = [f for f in new_import_files if f not in all_relative_imports]
files_to_check = [f"{f}.py" for f in new_import_files]
no_change = len(new_import_files) == 0
@@ -137,8 +135,7 @@ def check_imports(filename):
# Imports of the form `import xxx`
imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
# Imports of the form `from xxx import yyy`
- imports += re.findall(
- "^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
+ imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
# Only keep the top-level module
imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")]
@@ -187,29 +184,33 @@ def find_pipeline_class(loaded_module):
pipeline_class = None
for cls_name, cls in cls_members.items():
- if (cls_name != DiffusionPipeline.__name__ and
- issubclass(cls, DiffusionPipeline) and
- cls.__module__.split(".")[0] != "ppdiffusers"):
+ if (
+ cls_name != DiffusionPipeline.__name__
+ and issubclass(cls, DiffusionPipeline)
+ and cls.__module__.split(".")[0] != "ppdiffusers"
+ ):
if pipeline_class is not None:
raise ValueError(
f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:"
f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in"
- f" {loaded_module}.")
+ f" {loaded_module}."
+ )
pipeline_class = cls
return pipeline_class
def get_cached_module_file(
- pretrained_model_name_or_path: Union[str, os.PathLike],
- module_file: str,
- cache_dir: Optional[Union[str, os.PathLike]]=None,
- force_download: bool=False,
- resume_download: bool=False,
- proxies: Optional[Dict[str, str]]=None,
- use_auth_token: Optional[Union[bool, str]]=None,
- revision: Optional[str]=None,
- local_files_only: bool=False, ):
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ module_file: str,
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ proxies: Optional[Dict[str, str]] = None,
+ use_auth_token: Optional[Union[bool, str]] = None,
+ revision: Optional[str] = None,
+ local_files_only: bool = False,
+):
"""
Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
    Downloads a module from a local folder or a distant repo and returns its path inside the cached
Transformers module.
@@ -260,8 +261,7 @@ def get_cached_module_file(
# Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file.
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
- module_file_or_url = os.path.join(pretrained_model_name_or_path,
- module_file)
+ module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
if os.path.isfile(module_file_or_url):
resolved_module_file = module_file_or_url
@@ -273,8 +273,7 @@ def get_cached_module_file(
logger.info(f"Defaulting to main: {revision}.")
# community pipeline on GitHub
- github_url = COMMUNITY_PIPELINES_URL.format(
- revision=revision, pipeline=pretrained_model_name_or_path)
+ github_url = COMMUNITY_PIPELINES_URL.format(revision=revision, pipeline=pretrained_model_name_or_path)
try:
resolved_module_file = cached_download(
github_url,
@@ -283,13 +282,12 @@ def get_cached_module_file(
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
- use_auth_token=False, )
+ use_auth_token=False,
+ )
submodule = "git"
module_file = pretrained_model_name_or_path + ".py"
except EnvironmentError:
- logger.error(
- f"Could not locate the {module_file} inside {pretrained_model_name_or_path}."
- )
+ logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
raise
else:
try:
@@ -302,13 +300,11 @@ def get_cached_module_file(
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
- use_auth_token=use_auth_token, )
- submodule = os.path.join(
- "local", "--".join(pretrained_model_name_or_path.split("/")))
- except EnvironmentError:
- logger.error(
- f"Could not locate the {module_file} inside {pretrained_model_name_or_path}."
+ use_auth_token=use_auth_token,
)
+ submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/")))
+ except EnvironmentError:
+ logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
raise
# Check we have all the requirements in our environment
@@ -327,7 +323,8 @@ def get_cached_module_file(
module_needed = f"{module_needed}.py"
shutil.copy(
os.path.join(pretrained_model_name_or_path, module_needed),
- submodule_path / module_needed, )
+ submodule_path / module_needed,
+ )
else:
# Get the commit hash
# TODO: we will get this info in the etag soon, so retrieve it from there and not here.
@@ -338,8 +335,7 @@ def get_cached_module_file(
else:
token = None
- commit_hash = model_info(
- pretrained_model_name_or_path, revision=revision, token=token).sha
+ commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha
# The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the
# benefit of versioning.
@@ -361,22 +357,24 @@ def get_cached_module_file(
proxies=proxies,
use_auth_token=use_auth_token,
revision=revision,
- local_files_only=local_files_only, )
+ local_files_only=local_files_only,
+ )
return os.path.join(full_submodule, module_file)
def get_class_from_dynamic_module(
- pretrained_model_name_or_path: Union[str, os.PathLike],
- module_file: str,
- class_name: Optional[str]=None,
- cache_dir: Optional[Union[str, os.PathLike]]=None,
- force_download: bool=False,
- resume_download: bool=False,
- proxies: Optional[Dict[str, str]]=None,
- use_auth_token: Optional[Union[bool, str]]=None,
- revision: Optional[str]=None,
- local_files_only: bool=False,
- **kwargs, ):
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ module_file: str,
+ class_name: Optional[str] = None,
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ proxies: Optional[Dict[str, str]] = None,
+ use_auth_token: Optional[Union[bool, str]] = None,
+ revision: Optional[str] = None,
+ local_files_only: bool = False,
+ **kwargs,
+):
"""
Extracts a class from a module file, present in the local folder or repository of a model.
@@ -449,5 +447,6 @@ def get_class_from_dynamic_module(
proxies=proxies,
use_auth_token=use_auth_token,
revision=revision,
- local_files_only=local_files_only, )
+ local_files_only=local_files_only,
+ )
return get_class_in_module(class_name, final_module.replace(".py", ""))
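The import-scanning regexes reformatted above collect relative imports of the form `import .xxx` and `from .xxx import yyy`. A quick sketch applying them to an in-memory string instead of a module file on disk:

# Sketch of the relative-import scan from get_relative_imports above.
import re

content = "import os\nfrom .pipeline_utils import DiffusionPipeline\nimport .scheduling_utils\n"

relative = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
relative += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
print(sorted(set(relative)))  # ['pipeline_utils', 'scheduling_utils']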
diff --git a/ppdiffusers/ppdiffusers/utils/hub_utils.py b/ppdiffusers/ppdiffusers/utils/hub_utils.py
index 391c8099f0b30..8de82f5ab9800 100644
--- a/ppdiffusers/ppdiffusers/utils/hub_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/hub_utils.py
@@ -28,8 +28,14 @@
from ..version import VERSION as __version__
from .constants import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT
from .import_utils import (
- ENV_VARS_TRUE_VALUES, _fastdeploy_version, _paddle_version, _torch_version,
- is_fastdeploy_available, is_paddle_available, is_torch_available)
+ ENV_VARS_TRUE_VALUES,
+ _fastdeploy_version,
+ _paddle_version,
+ _torch_version,
+ is_fastdeploy_available,
+ is_paddle_available,
+ is_torch_available,
+)
from .logging import get_logger
logger = get_logger(__name__)
@@ -37,12 +43,11 @@
MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md"
SESSION_ID = uuid4().hex
HF_HUB_OFFLINE = os.getenv("HF_HUB_OFFLINE", "").upper() in ENV_VARS_TRUE_VALUES
-DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY",
- "").upper() in ENV_VARS_TRUE_VALUES
+DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES
HUGGINGFACE_CO_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/"
-def http_user_agent(user_agent: Union[Dict, str, None]=None) -> str:
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
"""
Formats a user-agent string with basic info about a request.
"""
@@ -65,9 +70,7 @@ def http_user_agent(user_agent: Union[Dict, str, None]=None) -> str:
return ua
-def get_full_repo_name(model_id: str,
- organization: Optional[str]=None,
- token: Optional[str]=None):
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
@@ -82,7 +85,8 @@ def create_model_card(args, model_name):
raise ValueError(
"Modelcard rendering is based on Jinja templates."
" Please make sure to have `jinja` installed before using `create_model_card`."
- " To install it, please run `pip install Jinja2`.")
+ " To install it, please run `pip install Jinja2`."
+ )
if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
return
@@ -97,41 +101,35 @@ def create_model_card(args, model_name):
library_name="ppdiffusers",
tags=[],
datasets=args.dataset_name,
- metrics=[], ),
+ metrics=[],
+ ),
template_path=MODEL_CARD_TEMPLATE_PATH,
model_name=model_name,
repo_name=repo_name,
- dataset_name=args.dataset_name
- if hasattr(args, "dataset_name") else None,
+ dataset_name=args.dataset_name if hasattr(args, "dataset_name") else None,
learning_rate=args.learning_rate,
train_batch_size=args.train_batch_size,
eval_batch_size=args.eval_batch_size,
gradient_accumulation_steps=(
- args.gradient_accumulation_steps
- if hasattr(args, "gradient_accumulation_steps") else None),
+ args.gradient_accumulation_steps if hasattr(args, "gradient_accumulation_steps") else None
+ ),
adam_beta1=args.adam_beta1 if hasattr(args, "adam_beta1") else None,
adam_beta2=args.adam_beta2 if hasattr(args, "adam_beta2") else None,
- adam_weight_decay=args.adam_weight_decay
- if hasattr(args, "adam_weight_decay") else None,
- adam_epsilon=args.adam_epsilon
- if hasattr(args, "adam_epsilon") else None,
- lr_scheduler=args.lr_scheduler
- if hasattr(args, "lr_scheduler") else None,
- lr_warmup_steps=args.lr_warmup_steps
- if hasattr(args, "lr_warmup_steps") else None,
- ema_inv_gamma=args.ema_inv_gamma
- if hasattr(args, "ema_inv_gamma") else None,
+ adam_weight_decay=args.adam_weight_decay if hasattr(args, "adam_weight_decay") else None,
+ adam_epsilon=args.adam_epsilon if hasattr(args, "adam_epsilon") else None,
+ lr_scheduler=args.lr_scheduler if hasattr(args, "lr_scheduler") else None,
+ lr_warmup_steps=args.lr_warmup_steps if hasattr(args, "lr_warmup_steps") else None,
+ ema_inv_gamma=args.ema_inv_gamma if hasattr(args, "ema_inv_gamma") else None,
ema_power=args.ema_power if hasattr(args, "ema_power") else None,
- ema_max_decay=args.ema_max_decay
- if hasattr(args, "ema_max_decay") else None,
- mixed_precision=args.mixed_precision, )
+ ema_max_decay=args.ema_max_decay if hasattr(args, "ema_max_decay") else None,
+ mixed_precision=args.mixed_precision,
+ )
card_path = os.path.join(args.output_dir, "README.md")
model_card.save(card_path)
-def extract_commit_hash(resolved_file: Optional[str],
- commit_hash: Optional[str]=None):
+def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None):
"""
Extracts the commit hash from a resolved filename toward a cache file.
"""
@@ -150,14 +148,12 @@ def extract_commit_hash(resolved_file: Optional[str],
# - Diffusers doesn't use custom environment variables to specify the cache path.
# - There is no need to migrate the cache format, just move the files to the new location.
hf_cache_home = os.path.expanduser(
- os.getenv("HF_HOME",
- os.path.join(
- os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")))
+ os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
old_diffusers_cache = os.path.join(hf_cache_home, "diffusers")
-def move_cache(old_cache_dir: Optional[str]=None,
- new_cache_dir: Optional[str]=None) -> None:
+def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None:
if new_cache_dir is None:
new_cache_dir = DIFFUSERS_CACHE
if old_cache_dir is None:
@@ -168,8 +164,7 @@ def move_cache(old_cache_dir: Optional[str]=None,
# move file blob by blob
for old_blob_path in old_cache_dir.glob("**/blobs/*"):
if old_blob_path.is_file() and not old_blob_path.is_symlink():
- new_blob_path = new_cache_dir / old_blob_path.relative_to(
- old_cache_dir)
+ new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir)
new_blob_path.parent.mkdir(parents=True, exist_ok=True)
os.replace(old_blob_path, new_blob_path)
try:
@@ -182,8 +177,7 @@ def move_cache(old_cache_dir: Optional[str]=None,
# At this point, old_cache_dir contains symlinks to the new cache (it can still be used).
-cache_version_file = os.path.join(DIFFUSERS_CACHE,
- "version_diffusers_cache.txt")
+cache_version_file = os.path.join(DIFFUSERS_CACHE, "version_diffusers_cache.txt")
if not os.path.isfile(cache_version_file):
cache_version = 0
else:
@@ -194,13 +188,13 @@ def move_cache(old_cache_dir: Optional[str]=None,
cache_version = 0
if cache_version < 1:
- old_cache_is_not_empty = (os.path.isdir(old_diffusers_cache) and
- len(os.listdir(old_diffusers_cache)) > 0)
+ old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0
if old_cache_is_not_empty:
logger.warning(
"The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your "
"existing cached models. This is a one-time operation, you can interrupt it or run it "
- "later by calling `diffusers.utils.hub_utils.move_cache()`.")
+ "later by calling `diffusers.utils.hub_utils.move_cache()`."
+ )
try:
move_cache()
except Exception as e:
@@ -208,7 +202,8 @@ def move_cache(old_cache_dir: Optional[str]=None,
logger.error(
f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease "
"file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole "
- "message and we will do our best to help.")
+ "message and we will do our best to help."
+ )
if cache_version < 1:
try:
@@ -218,4 +213,5 @@ def move_cache(old_cache_dir: Optional[str]=None,
except Exception:
logger.warning(
f"There was a problem when trying to write in your cache folder ({DIFFUSERS_CACHE}). Please, ensure "
- "the directory exists and can be written to.")
+ "the directory exists and can be written to."
+ )
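`HF_HUB_OFFLINE` and `DISABLE_TELEMETRY` above are classic truthy-env-var flags. A sketch of the check, with an assumed membership for `ENV_VARS_TRUE_VALUES` (the real constant lives in import_utils.py and may accept additional spellings):

# Sketch of the env-flag parsing above: the flag is "on" when the variable is
# set to one of the accepted truthy spellings.
import os

ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}  # assumed contents for this sketch

DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES
print(DISABLE_TELEMETRY)  # False unless e.g. DISABLE_TELEMETRY=1 is exported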
diff --git a/ppdiffusers/ppdiffusers/utils/import_utils.py b/ppdiffusers/ppdiffusers/utils/import_utils.py
index 577f4d0dc3498..cc43592be9962 100644
--- a/ppdiffusers/ppdiffusers/utils/import_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/import_utils.py
@@ -64,8 +64,9 @@
if _paddle_available:
try:
- from paddle.incubate.nn.memory_efficient_attention import \
- memory_efficient_attention # noqa
+ from paddle.incubate.nn.memory_efficient_attention import ( # noqa
+ memory_efficient_attention,
+ )
_ppxformers_available = True
except ImportError:
@@ -90,8 +91,7 @@
if _safetensors_available:
try:
_safetensors_version = importlib_metadata.version("safetensors")
- logger.info(
- f"Safetensors version {_safetensors_version} available.")
+ logger.info(f"Safetensors version {_safetensors_version} available.")
except importlib_metadata.PackageNotFoundError:
_safetensors_available = False
else:
@@ -101,8 +101,7 @@
_transformers_available = importlib.util.find_spec("transformers") is not None
try:
_transformers_version = importlib_metadata.version("transformers")
- logger.debug(
- f"Successfully imported transformers version {_transformers_version}")
+ logger.debug(f"Successfully imported transformers version {_transformers_version}")
except importlib_metadata.PackageNotFoundError:
_transformers_available = False
@@ -116,8 +115,7 @@
_unidecode_available = importlib.util.find_spec("unidecode") is not None
try:
_unidecode_version = importlib_metadata.version("unidecode")
- logger.debug(
- f"Successfully imported unidecode version {_unidecode_version}")
+ logger.debug(f"Successfully imported unidecode version {_unidecode_version}")
except importlib_metadata.PackageNotFoundError:
_unidecode_available = False
@@ -134,14 +132,12 @@
pass
_fastdeploy_available = _fastdeploy_version != "N/A"
if _fastdeploy_available:
- logger.debug(
- f"Successfully imported fastdeploy version {_fastdeploy_version}")
+ logger.debug(f"Successfully imported fastdeploy version {_fastdeploy_version}")
_paddlenlp_available = importlib.util.find_spec("paddlenlp") is not None
try:
_paddlenlp_version = importlib_metadata.version("paddlenlp")
- logger.debug(
- f"Successfully imported paddlenlp version {_paddlenlp_version}")
+ logger.debug(f"Successfully imported paddlenlp version {_paddlenlp_version}")
except importlib_metadata.PackageNotFoundError:
_paddlenlp_available = False
@@ -152,7 +148,8 @@
"opencv-python",
"opencv-contrib-python",
"opencv-python-headless",
- "opencv-contrib-python-headless", )
+ "opencv-contrib-python-headless",
+ )
_opencv_version = None
for pkg in candidates:
try:
@@ -183,8 +180,7 @@
_k_diffusion_available = importlib.util.find_spec("k_diffusion") is not None
try:
_k_diffusion_version = importlib_metadata.version("k_diffusion")
- logger.debug(
- f"Successfully imported k-diffusion version {_k_diffusion_version}")
+ logger.debug(f"Successfully imported k-diffusion version {_k_diffusion_version}")
except importlib_metadata.PackageNotFoundError:
_k_diffusion_available = False
@@ -205,16 +201,14 @@
_omegaconf_available = importlib.util.find_spec("omegaconf") is not None
try:
_omegaconf_version = importlib_metadata.version("omegaconf")
- logger.debug(
- f"Successfully imported omegaconf version {_omegaconf_version}")
+ logger.debug(f"Successfully imported omegaconf version {_omegaconf_version}")
except importlib_metadata.PackageNotFoundError:
_omegaconf_available = False
_tensorboard_available = importlib.util.find_spec("tensorboard")
try:
_tensorboard_version = importlib_metadata.version("tensorboard")
- logger.debug(
- f"Successfully imported tensorboard version {_tensorboard_version}")
+ logger.debug(f"Successfully imported tensorboard version {_tensorboard_version}")
except importlib_metadata.PackageNotFoundError:
_tensorboard_available = False
@@ -232,8 +226,7 @@
import einops.layers.paddle
einops.layers.paddle
- logger.debug(
- f"Successfully imported einops version {einops.__version__}")
+ logger.debug(f"Successfully imported einops version {einops.__version__}")
except ImportError:
_einops_available = False
except importlib_metadata.PackageNotFoundError:
@@ -482,27 +475,29 @@ def is_bs4_available():
that match your environment. Please note that you may need to restart your runtime after installation.
"""
-BACKENDS_MAPPING = OrderedDict([
- ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
- ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)),
- ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)),
- ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)),
- ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)),
- ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)),
- ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)),
- ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
- ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
- ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)),
- ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
- ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)),
- ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)),
- ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)),
- ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)),
- ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)),
- ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)),
- ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)),
- ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
-])
+BACKENDS_MAPPING = OrderedDict(
+ [
+ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
+ ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)),
+ ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)),
+ ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)),
+ ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)),
+ ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)),
+ ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)),
+ ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
+ ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
+ ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)),
+ ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
+ ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)),
+ ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)),
+ ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)),
+ ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)),
+ ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)),
+ ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)),
+ ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)),
+ ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
+ ]
+)
def requires_backends(obj, backends):
@@ -516,26 +511,24 @@ def requires_backends(obj, backends):
raise ImportError("".join(failed))
if name in [
- "VersatileDiffusionTextToImagePipeline",
- "VersatileDiffusionPipeline",
- "VersatileDiffusionDualGuidedPipeline",
- "StableDiffusionImageVariationPipeline",
- "UnCLIPPipeline",
+ "VersatileDiffusionTextToImagePipeline",
+ "VersatileDiffusionPipeline",
+ "VersatileDiffusionDualGuidedPipeline",
+ "StableDiffusionImageVariationPipeline",
+ "UnCLIPPipeline",
] and is_paddlenlp_version("<", "2.5.0"):
raise ImportError(
f"You need to install `paddlenlp>=2.5.0` in order to use {name}: \n```\n pip install"
- " --upgrade paddlenlp \n```")
+ " --upgrade paddlenlp \n```"
+ )
- if name in [
- "StableDiffusionDepth2ImgPipeline",
- "StableDiffusionPix2PixZeroPipeline",
- ] and is_paddlenlp_version(
- "<",
- "2.5.1" # TODO version
+ if name in ["StableDiffusionDepth2ImgPipeline", "StableDiffusionPix2PixZeroPipeline"] and is_paddlenlp_version(
+ "<", "2.5.1" # TODO version
):
raise ImportError(
f"You need to install `paddlenlp>=2.5.1` in order to use {name}: \n```\n pip install"
- " --upgrade paddlenlp \n```")
+ " --upgrade paddlenlp \n```"
+ )
class DummyObject(type):
@@ -551,9 +544,7 @@ def __getattr__(cls, key):
# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
-def compare_versions(library_or_version: Union[str, Version],
- operation: str,
- requirement_version: str):
+def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
"""
Args:
Compares a library version to some requirement using a given operation.
@@ -565,13 +556,10 @@ def compare_versions(library_or_version: Union[str, Version],
The version to compare the library version against
"""
if operation not in STR_OPERATION_TO_FUNC.keys():
- raise ValueError(
- f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}"
- )
+ raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
operation = STR_OPERATION_TO_FUNC[operation]
if isinstance(library_or_version, str):
- library_or_version = parse(
- importlib_metadata.version(library_or_version))
+ library_or_version = parse(importlib_metadata.version(library_or_version))
return operation(library_or_version, parse(requirement_version))
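
The hunk above only reflows `compare_versions`; its behavior is unchanged. For readers skimming the diff, here is a hedged, standalone sketch of the same logic built on `operator`, `importlib.metadata`, and `packaging` (the exact contents of `STR_OPERATION_TO_FUNC` in ppdiffusers are inferred from the docstring and may differ):

import operator
from importlib import metadata as importlib_metadata

from packaging.version import parse

# Assumed operator table; ppdiffusers defines an equivalent STR_OPERATION_TO_FUNC.
STR_OPERATION_TO_FUNC = {
    "<": operator.lt, "<=": operator.le, "==": operator.eq,
    "!=": operator.ne, ">=": operator.ge, ">": operator.gt,
}

def compare_versions(library_or_version, operation, requirement_version):
    """Compare an installed library (by name) or a parsed Version against a requirement."""
    if operation not in STR_OPERATION_TO_FUNC:
        raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC)}, received {operation}")
    op = STR_OPERATION_TO_FUNC[operation]
    if isinstance(library_or_version, str):
        library_or_version = parse(importlib_metadata.version(library_or_version))
    return op(library_or_version, parse(requirement_version))

print(compare_versions("packaging", ">=", "20.0"))  # True on any recent environment
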
diff --git a/ppdiffusers/ppdiffusers/utils/initializer_utils.py b/ppdiffusers/ppdiffusers/utils/initializer_utils.py
index 9c71cc89861c9..263c7c41a030f 100644
--- a/ppdiffusers/ppdiffusers/utils/initializer_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/initializer_utils.py
@@ -46,9 +46,7 @@ def _no_grad_uniform_(tensor, a, b):
def _no_grad_normal_(tensor, mean=0.0, std=1.0):
with paddle.no_grad():
- tensor.copy_(
- paddle.normal(
- mean=mean, std=std, shape=tensor.shape), True)
+ tensor.copy_(paddle.normal(mean=mean, std=std, shape=tensor.shape), True)
return tensor
@@ -134,9 +132,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False):
Tuple[fan_in, fan_out]
"""
if tensor.ndim < 2:
- raise ValueError(
- "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
- )
+ raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")
if reverse:
num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
@@ -189,8 +185,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False):
mode = mode.lower()
valid_modes = ["fan_in", "fan_out"]
if mode not in valid_modes:
- raise ValueError("Mode {} not supported, please use one of {}".format(
- mode, valid_modes))
+ raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
@@ -216,13 +211,11 @@ def _calculate_gain(nonlinearity, param=None):
elif nonlinearity == "leaky_relu":
if param is None:
negative_slope = 0.01
- elif (not isinstance(param, bool) and isinstance(param, int) or
- isinstance(param, float)):
+ elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
# True/False are instances of int, hence check above
negative_slope = param
else:
- raise ValueError("negative_slope {} not a valid number".format(
- param))
+ raise ValueError("negative_slope {} not a valid number".format(param))
return math.sqrt(2.0 / (1 + negative_slope**2))
elif nonlinearity == "selu":
return 3.0 / 4
@@ -230,11 +223,7 @@ def _calculate_gain(nonlinearity, param=None):
raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
-def kaiming_uniform_(tensor,
- a=0,
- mode="fan_in",
- nonlinearity="leaky_relu",
- reverse=False):
+def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False):
"""
Modified tensor inspace using kaiming_uniform method
Args:
@@ -252,11 +241,7 @@ def kaiming_uniform_(tensor,
return _no_grad_uniform_(tensor, -k, k)
-def kaiming_normal_(tensor,
- a=0,
- mode="fan_in",
- nonlinearity="leaky_relu",
- reverse=False):
+def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False):
"""
Modified tensor inspace using kaiming_normal_
Args:
@@ -304,8 +289,7 @@ def reset_initialized_parameter(model, include_self=True):
"""
for _, m in model.named_sublayers(include_self=include_self):
if isinstance(m, nn.Conv2D):
- k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
- m._kernel_size[1])
+ k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1])
k = math.sqrt(k)
_no_grad_uniform_(m.weight, -k, k)
if hasattr(m, "bias") and getattr(m, "bias") is not None:
@@ -330,17 +314,17 @@ def reset_initialized_parameter(model, include_self=True):
class Init:
def __init__(self):
for init_func in [
- uniform_,
- normal_,
- constant_,
- ones_,
- zeros_,
- xavier_uniform_,
- xavier_normal_,
- kaiming_uniform_,
- kaiming_normal_,
- linear_init_,
- conv_init_,
+ uniform_,
+ normal_,
+ constant_,
+ ones_,
+ zeros_,
+ xavier_uniform_,
+ xavier_normal_,
+ kaiming_uniform_,
+ kaiming_normal_,
+ linear_init_,
+ conv_init_,
]:
setattr(self, init_func.__name__, init_func)
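
The initializer hunks above are formatting-only as well. A short usage sketch, assuming ppdiffusers and paddle are installed; the bound check mirrors the Conv2D branch of `reset_initialized_parameter` shown in the hunk:

import math

import paddle.nn as nn

from ppdiffusers.utils.initializer_utils import kaiming_uniform_, reset_initialized_parameter

conv = nn.Conv2D(in_channels=4, out_channels=8, kernel_size=3)
kaiming_uniform_(conv.weight, a=0, mode="fan_in", nonlinearity="leaky_relu")  # direct single-tensor init
reset_initialized_parameter(conv)  # re-inits recognized sublayers; overrides the line above

# The Conv2D branch draws weights uniformly from [-sqrt(k), sqrt(k)] with
# k = groups / (in_channels * kH * kW), so this should print True.
k = float(conv._groups) / (conv._in_channels * conv._kernel_size[0] * conv._kernel_size[1])
print(conv.weight.abs().max().item() <= math.sqrt(k) + 1e-6)
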
diff --git a/ppdiffusers/ppdiffusers/utils/load_utils.py b/ppdiffusers/ppdiffusers/utils/load_utils.py
index 023551a27ce6d..a1602c8862d80 100644
--- a/ppdiffusers/ppdiffusers/utils/load_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/load_utils.py
@@ -24,8 +24,11 @@
import numpy as np
from .constants import get_map_location_default
-from .import_utils import (is_paddle_available, is_safetensors_available,
- is_torch_available)
+from .import_utils import (
+ is_paddle_available,
+ is_safetensors_available,
+ is_torch_available,
+)
from .logging import get_logger
logger = get_logger(__name__)
@@ -68,8 +71,7 @@ def read_prefix_key(path):
with open(path, "rb") as file_handler:
end_index = seek_by_string(file_handler, "data.pkl", file_size)
file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
- prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE
- - len("/data.pkl"))
+ prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - len("/data.pkl"))
return prefix_key.decode("latin")
@@ -89,8 +91,7 @@ def seek_by_string(file_handler, string: str, file_size: int) -> int:
word_index = 0
if file_handler.tell() >= file_size - 1:
- raise Exception(
- f"can't find the find the target string<{string}> in the file")
+ raise Exception(f"can't find the target string<{string}> in the file")
return file_handler.tell()
@@ -163,21 +164,18 @@ def find_class(self, mod_name, name):
return super().find_class(mod_name, name)
-def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad,
- backward_hooks):
+def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks):
# if a tensor has shape [M, N] and stride is [1, N], it's column-wise / fortran-style
# if a tensor has shape [M, N] and stride is [M, 1], it's row-wise / C-style
# defautls to C-style
- if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[
- 1] > 1:
+ if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[1] > 1:
order = "F"
else:
order = "C"
# fix bug when load https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
numel = int(np.prod(size))
- return storage[storage_offset:storage_offset + numel].reshape(
- size, order=order)
+ return storage[storage_offset : storage_offset + numel].reshape(size, order=order)
def _rebuild_parameter(data, requires_grad, backward_hooks):
@@ -207,8 +205,7 @@ def torch_load(path: str, **pickle_load_args):
def load_tensor(dtype, numel, key, location):
name = f"{prefix_key}/data/{key}"
- typed_storage = np.frombuffer(
- torch_zip.open(name).read()[:numel], dtype=dtype)
+ typed_storage = np.frombuffer(torch_zip.open(name).read()[:numel], dtype=dtype)
return typed_storage
def persistent_load(saved_id):
@@ -226,15 +223,13 @@ def persistent_load(saved_id):
typed_storage = loaded_storages[key]
else:
nbytes = numel * _element_size(dtype)
- typed_storage = load_tensor(dtype, nbytes, key,
- _maybe_decode_ascii(location))
+ typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location))
loaded_storages[key] = typed_storage
return typed_storage
data_iostream = torch_zip.open(f"{prefix_key}/data.pkl").read()
- unpickler_stage = UnpicklerWrapperStage(
- io.BytesIO(data_iostream), **pickle_load_args)
+ unpickler_stage = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args)
unpickler_stage.persistent_load = persistent_load
state_dict = unpickler_stage.load()
torch_zip.close()
@@ -263,19 +258,18 @@ def convert_to_paddle(state_dict, return_numpy=False, return_global_step=False):
# if "position_id" in k and "int" not in str(v.dtype):
# v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64")
if v.ndim == 0:
- v = v.reshape((1, ))
+ v = v.reshape((1,))
if not return_numpy:
# support bfloat16
if "torch.bfloat16" in str(v.dtype):
v = v.float()
pd_state_dict[k] = (
paddle.to_tensor(v.numpy()).cast(paddle.bfloat16)
- if hasattr(v, "numpy") else
- paddle.to_tensor(v).cast(paddle.bfloat16))
+ if hasattr(v, "numpy")
+ else paddle.to_tensor(v).cast(paddle.bfloat16)
+ )
else:
- pd_state_dict[k] = (paddle.to_tensor(v.numpy())
- if hasattr(v, "numpy") else
- paddle.to_tensor(v))
+ pd_state_dict[k] = paddle.to_tensor(v.numpy()) if hasattr(v, "numpy") else paddle.to_tensor(v)
else:
pd_state_dict[k] = v.numpy() if hasattr(v, "numpy") else v
@@ -290,7 +284,7 @@ def convert_to_numpy(state_dict):
# if "position_id" in k and "int" not in str(v.dtype):
# v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64")
if v.ndim == 0:
- v = v.reshape((1, ))
+ v = v.reshape((1,))
return pd_state_dict
@@ -310,19 +304,18 @@ def safetensors_load(path: str):
data = load_file(path)
else:
- raise ImportError(
- "`safetensors_load` requires the `safetensors library: `pip install safetensors`."
- )
+ raise ImportError("`safetensors_load` requires the `safetensors` library: `pip install safetensors`.")
return data
def smart_load(
- path: str,
- map_location: str=None,
- return_numpy: bool=False,
- return_global_step: bool=False,
- return_is_torch_weight: bool=False, ):
+ path: str,
+ map_location: str = None,
+ return_numpy: bool = False,
+ return_global_step: bool = False,
+ return_is_torch_weight: bool = False,
+):
if map_location is None:
map_location = get_map_location_default()
@@ -335,46 +328,36 @@ def smart_load(
return state_dict
if suffix in torch_suffix:
- state_dict = convert_to_paddle(
- torch_load(path), return_numpy, return_global_step)
+ state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step)
if return_is_torch_weight:
state_dict["is_torch_weight"] = True
return state_dict
if suffix in safetensors_suffix:
- state_dict = convert_to_paddle(
- safetensors_load(path), return_numpy, return_global_step)
+ state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step)
if return_is_torch_weight:
state_dict["is_torch_weight"] = True
return state_dict
# must use safetensors_load first
try:
- state_dict = convert_to_paddle(
- safetensors_load(path), return_numpy, return_global_step)
+ state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step)
if return_is_torch_weight:
state_dict["is_torch_weight"] = True
return state_dict
except Exception:
logger.info(f"Cant load file {name} with safetensors!")
try:
- state_dict = convert_to_paddle(
- torch_load(path), return_numpy, return_global_step)
+ state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step)
if return_is_torch_weight:
state_dict["is_torch_weight"] = True
return state_dict
except Exception:
- logger.info(
- f"Cant load file {name} with torch! We will try to load this with safetensors!"
- )
+ logger.info(f"Cant load file {name} with torch! We will try to load this with safetensors!")
try:
state_dict = paddle.load(path, return_numpy=return_numpy)
return state_dict
except Exception:
- logger.info(
- f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!"
- )
+ logger.info(f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!")
if state_dict is None:
- raise ValueError(
- f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!"
- )
+ raise ValueError(f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!")
diff --git a/ppdiffusers/ppdiffusers/utils/logging.py b/ppdiffusers/ppdiffusers/utils/logging.py
index 355cb16bd50bd..12b12c075d2ef 100644
--- a/ppdiffusers/ppdiffusers/utils/logging.py
+++ b/ppdiffusers/ppdiffusers/utils/logging.py
@@ -58,7 +58,8 @@ def _get_default_logging_level():
else:
logging.getLogger().warning(
f"Unknown option PPDIFFUSERS_VERBOSITY={env_level_str}, "
- f"has to be one of: { ', '.join(log_levels.keys()) }")
+ f"has to be one of: { ', '.join(log_levels.keys()) }"
+ )
return _default_log_level
@@ -104,7 +105,7 @@ def get_log_levels_dict():
return log_levels
-def get_logger(name: Optional[str]=None) -> logging.Logger:
+def get_logger(name: Optional[str] = None) -> logging.Logger:
"""
Return a logger with the specified name.
@@ -212,8 +213,7 @@ def remove_handler(handler: logging.Handler) -> None:
_configure_library_root_logger()
- assert handler is not None and handler not in _get_library_root_logger(
- ).handlers
+ assert handler is not None and handler not in _get_library_root_logger().handlers
_get_library_root_logger().removeHandler(handler)
@@ -247,8 +247,7 @@ def enable_explicit_format() -> None:
handlers = _get_library_root_logger().handlers
for handler in handlers:
- formatter = logging.Formatter(
- "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
+ formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
handler.setFormatter(formatter)
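
The logging changes above are cosmetic too. A minimal sketch of how the helpers are meant to be driven, assuming ppdiffusers is installed (the "info" value is one of the usual verbosity keys; the full set lives in `log_levels`):

import os

os.environ["PPDIFFUSERS_VERBOSITY"] = "info"  # read once, at import time

from ppdiffusers.utils import logging as pp_logging

logger = pp_logging.get_logger(__name__)
pp_logging.enable_explicit_format()  # "[LEVEL|file:line] time >> message" formatter from the hunk
logger.info("ppdiffusers logger configured")
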
diff --git a/ppdiffusers/ppdiffusers/utils/outputs.py b/ppdiffusers/ppdiffusers/utils/outputs.py
index cd319b7378749..b71ef22559c47 100644
--- a/ppdiffusers/ppdiffusers/utils/outputs.py
+++ b/ppdiffusers/ppdiffusers/utils/outputs.py
@@ -60,8 +60,7 @@ def __post_init__(self):
raise ValueError(f"{self.__class__.__name__} has no fields.")
first_field = getattr(self, class_fields[0].name)
- other_fields_are_none = all(
- getattr(self, field.name) is None for field in class_fields[1:])
+ other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
if other_fields_are_none and isinstance(first_field, dict):
for key, value in first_field.items():
@@ -73,23 +72,16 @@ def __post_init__(self):
self[field.name] = v
def __delitem__(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
def setdefault(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
def pop(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
def update(self, *args, **kwargs):
- raise Exception(
- f"You cannot use ``update`` on a {self.__class__.__name__} instance."
- )
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
def __getitem__(self, k):
if isinstance(k, str):
@@ -121,6 +113,6 @@ def to_tuple(self) -> Tuple[Any]:
for field in fields(self):
if getattr(self, field.name, None) is None:
continue
- tuples = tuples + (getattr(self, field.name), )
+ tuples = tuples + (getattr(self, field.name),)
return tuples
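
These hunks touch the dict-like output container every pipeline returns. A small sketch of its contract, assuming the class is exported as `BaseOutput` (that name mirrors the upstream diffusers layout and is an assumption here):

from dataclasses import dataclass
from typing import Optional

import numpy as np

from ppdiffusers.utils.outputs import BaseOutput  # assumed class name

@dataclass
class ToyOutput(BaseOutput):
    images: Optional[np.ndarray] = None
    nsfw_content_detected: Optional[list] = None

out = ToyOutput(images=np.zeros((1, 8, 8, 3)))
print(out["images"].shape)  # dict-style access works
print(len(out.to_tuple()))  # 1, since None fields are skipped per the to_tuple hunk
# out.pop("images")         # would raise: pop/update/setdefault/__delitem__ are all disabled
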
diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
index a59bfd24f7166..1fa9da783471b 100644
--- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
@@ -43,8 +43,7 @@ def manual_seed(self, seed, generator_name=None):
if generator_name is None:
generator_name = str(time.time())
if generator_name in self.states_:
- raise ValueError("state {} already exists".format(
- generator_name))
+ raise ValueError("state {} already exists".format(generator_name))
orig_rng_state = paddle.get_cuda_rng_state()
paddle.seed(seed)
self.states_[generator_name] = paddle.get_cuda_rng_state()
@@ -55,8 +54,7 @@ def manual_seed(self, seed, generator_name=None):
def rng_state(self, generator_name=None):
if generator_name is not None:
if generator_name not in self.states_:
- raise ValueError("state {} does not exist".format(
- generator_name))
+ raise ValueError("state {} does not exist".format(generator_name))
orig_cuda_rng_state = paddle.get_cuda_rng_state()
paddle.set_cuda_rng_state(self.states_[generator_name])
try:
@@ -81,16 +79,13 @@ def get_rng_state_tracker(*args, **kwargs):
@paddle.jit.not_to_static
def randn_pt(shape, dtype=None, name=None, **kwargs):
generator = kwargs.get("generator", None)
- is_bfloat16 = ("bfloat16" in str(dtype) or
- "bfloat16" in paddle.get_default_dtype())
+ is_bfloat16 = "bfloat16" in str(dtype) or "bfloat16" in paddle.get_default_dtype()
if is_bfloat16:
if generator is None:
- return randn(
- shape, dtype="float16", name=name).cast(paddle.bfloat16)
+ return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16)
else:
with get_rng_state_tracker().rng_state(generator):
- return randn(
- shape, dtype="float16", name=name).cast(paddle.bfloat16)
+ return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16)
else:
if generator is None:
return randn(shape, dtype=dtype, name=name)
@@ -108,24 +103,20 @@ def rand_pt(shape, dtype=None, name=None, **kwargs):
return rand(shape, dtype=dtype, name=name)
@paddle.jit.not_to_static
- def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None,
- **kwargs):
+ def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None, **kwargs):
generator = kwargs.get("generator", None)
if generator is None:
- return randint(
- low=low, high=high, shape=shape, dtype=dtype, name=name)
+ return randint(low=low, high=high, shape=shape, dtype=dtype, name=name)
else:
with get_rng_state_tracker().rng_state(generator):
- return randint(
- low=low, high=high, shape=shape, dtype=dtype, name=name)
+ return randint(low=low, high=high, shape=shape, dtype=dtype, name=name)
@paddle.jit.not_to_static
def randn_like_pt(x, dtype=None, name=None, **kwargs):
generator = kwargs.get("generator", None)
if dtype is None:
dtype = x.dtype
- return randn_pt(
- x.shape, dtype=dtype, generator=generator, name=name, **kwargs)
+ return randn_pt(x.shape, dtype=dtype, generator=generator, name=name, **kwargs)
paddle.randn = randn_pt
paddle.rand = rand_pt
@@ -133,23 +124,19 @@ def randn_like_pt(x, dtype=None, name=None, **kwargs):
paddle.randn_like = randn_like_pt
def randn_tensor(
- shape: Union[Tuple, List],
- generator: Optional[Union[List["paddle.Generator"],
- "paddle.Generator"]]=None,
- dtype: Optional["paddle.dtype"]=None,
- *kwargs, ):
+ shape: Union[Tuple, List],
+ generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None,
+ dtype: Optional["paddle.dtype"] = None,
+ *kwargs,
+ ):
"""This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When
passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor
will always be created on CPU.
"""
if isinstance(generator, (list, tuple)):
batch_size = shape[0]
- shape = (1, ) + tuple(shape[1:])
- latents = [
- randn_pt(
- shape, generator=generator[i], dtype=dtype)
- for i in range(batch_size)
- ]
+ shape = (1,) + tuple(shape[1:])
+ latents = [randn_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)]
latents = paddle.concat(latents, axis=0)
else:
latents = randn_pt(shape, generator=generator, dtype=dtype)
@@ -157,23 +144,19 @@ def randn_tensor(
return latents
def rand_tensor(
- shape: Union[Tuple, List],
- generator: Optional[Union[List["paddle.Generator"],
- "paddle.Generator"]]=None,
- dtype: Optional["paddle.dtype"]=None,
- *kwargs, ):
+ shape: Union[Tuple, List],
+ generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None,
+ dtype: Optional["paddle.dtype"] = None,
+ *kwargs,
+ ):
"""This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When
passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor
will always be created on CPU.
"""
if isinstance(generator, (list, tuple)):
batch_size = shape[0]
- shape = (1, ) + tuple(shape[1:])
- latents = [
- rand_pt(
- shape, generator=generator[i], dtype=dtype)
- for i in range(batch_size)
- ]
+ shape = (1,) + tuple(shape[1:])
+ latents = [rand_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)]
latents = paddle.concat(latents, axis=0)
else:
latents = rand_pt(shape, generator=generator, dtype=dtype)
@@ -181,18 +164,18 @@ def rand_tensor(
return latents
def randint_tensor(
- low=0,
- high=None,
- shape: Union[Tuple, List]=[1],
- generator: Optional["paddle.Generator"]=None,
- dtype: Optional["paddle.dtype"]=None,
- *kwargs, ):
+ low=0,
+ high=None,
+ shape: Union[Tuple, List] = [1],
+ generator: Optional["paddle.Generator"] = None,
+ dtype: Optional["paddle.dtype"] = None,
+ *kwargs,
+ ):
"""This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When
passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor
will always be created on CPU.
"""
- latents = randint_pt(
- low=low, high=high, shape=shape, dtype=dtype, generator=generator)
+ latents = randint_pt(low=low, high=high, shape=shape, dtype=dtype, generator=generator)
return latents
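
The batched branch of `randn_tensor`/`rand_tensor` splits the shape so every element of the batch is drawn under its own generator. A dependency-free sketch of that pattern in plain paddle (the real helpers additionally route through the CUDA RNG state tracker defined above):

import paddle

shape = (4, 4, 64, 64)
per_sample_shape = (1,) + tuple(shape[1:])

latents = []
for i in range(shape[0]):
    paddle.seed(1234 + i)  # one seed per batch element
    latents.append(paddle.randn(per_sample_shape, dtype="float32"))
latents = paddle.concat(latents, axis=0)
print(latents.shape)  # [4, 4, 64, 64]
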
diff --git a/ppdiffusers/ppdiffusers/utils/pil_utils.py b/ppdiffusers/ppdiffusers/utils/pil_utils.py
index 7d41b9c74c07a..bef4901a7e5f8 100644
--- a/ppdiffusers/ppdiffusers/utils/pil_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/pil_utils.py
@@ -18,8 +18,7 @@
from packaging import version
from PIL import Image
-if version.parse(version.parse(PIL.__version__).base_version) >= version.parse(
- "9.1.0"):
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
PIL_INTERPOLATION = {
"linear": PIL.Image.Resampling.BILINEAR,
"bilinear": PIL.Image.Resampling.BILINEAR,
@@ -60,10 +59,7 @@ def numpy_to_pil(images):
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
# special case for grayscale (single channel) images
- pil_images = [
- Image.fromarray(
- image.squeeze(), mode="L") for image in images
- ]
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
else:
pil_images = [Image.fromarray(image) for image in images]
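
`numpy_to_pil` maps float arrays in [0, 1] to uint8 PIL images and special-cases single-channel input. A standalone sketch of that conversion (numpy and Pillow only):

import numpy as np
from PIL import Image

images = np.random.rand(2, 64, 64, 1)  # NHWC, single channel
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
    pil_images = [Image.fromarray(img.squeeze(), mode="L") for img in images]  # grayscale path
else:
    pil_images = [Image.fromarray(img) for img in images]
print(pil_images[0].mode, pil_images[0].size)  # L (64, 64)
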
diff --git a/ppdiffusers/ppdiffusers/utils/testing_utils.py b/ppdiffusers/ppdiffusers/utils/testing_utils.py
index 88a8c7e167c47..02e03ca1e944a 100644
--- a/ppdiffusers/ppdiffusers/utils/testing_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/testing_utils.py
@@ -31,10 +31,16 @@
import PIL.ImageOps
import requests
-from .import_utils import (BACKENDS_MAPPING, is_compel_available,
- is_fastdeploy_available, is_note_seq_available,
- is_opencv_available, is_paddle_available,
- is_paddle_version, is_torch_available)
+from .import_utils import (
+ BACKENDS_MAPPING,
+ is_compel_available,
+ is_fastdeploy_available,
+ is_note_seq_available,
+ is_opencv_available,
+ is_paddle_available,
+ is_paddle_version,
+ is_torch_available,
+)
from .logging import get_logger
global_rng = random.Random()
@@ -51,7 +57,8 @@
if paddle_device not in available_backends:
raise ValueError(
f"unknown paddle backend for ppdiffusers tests: {paddle_device}. Available backends are:"
- f" {available_backends}")
+ f" {available_backends}"
+ )
logger.info(f"paddle_device overrode to {paddle_device}")
else:
paddle_device = "gpu" if paddle.is_compiled_with_cuda() else "cpu"
@@ -74,25 +81,19 @@ def paddle_all_close(a, b, *args, **kwargs):
if not is_paddle_available():
raise ValueError("Paddle needs to be installed to use this function.")
if not paddle.allclose(a, b, *args, **kwargs):
- assert (
- False
- ), f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}."
+ assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}."
return True
-def print_tensor_test(tensor,
- filename="test_corrections.txt",
- expected_tensor_name="expected_slice"):
+def print_tensor_test(tensor, filename="test_corrections.txt", expected_tensor_name="expected_slice"):
test_name = os.environ.get("PYTEST_CURRENT_TEST")
if not paddle.is_tensor(tensor):
tensor = paddle.to_tensor(tensor)
- tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace(
- "\n", "")
+ tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace("\n", "")
# format is usually:
# expected_slice = np.array([-0.5713, -0.3018, -0.9814, 0.04663, -0.879, 0.76, -1.734, 0.1044, 1.161])
- output_str = tensor_str.replace("tensor",
- f"{expected_tensor_name} = np.array")
+ output_str = tensor_str.replace("tensor", f"{expected_tensor_name} = np.array")
test_file, test_class, test_fn = test_name.split("::")
test_fn = test_fn.split()[0]
with open(filename, "a") as f:
@@ -182,27 +183,27 @@ def require_paddle_2_5(test_case):
"""
return unittest.skipUnless(
is_paddle_available() and is_paddle_version(">=", "2.5.0"),
- "test requires Paddle 2.5", )(test_case)
+ "test requires Paddle 2.5",
+ )(test_case)
def require_paddle(test_case):
"""
Decorator marking a test that requires Paddle. These tests are skipped when Paddle isn't installed.
"""
- return unittest.skipUnless(is_paddle_available(),
- "test requires Paddle")(test_case)
+ return unittest.skipUnless(is_paddle_available(), "test requires Paddle")(test_case)
def require_torch(test_case):
"""Decorator marking a test that requires TORCH."""
- return unittest.skipUnless(is_torch_available(),
- "test requires TORCH")(test_case)
+ return unittest.skipUnless(is_torch_available(), "test requires TORCH")(test_case)
def require_paddle_gpu(test_case):
"""Decorator marking a test that requires CUDA and Paddle."""
- return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu",
- "test requires Paddle+CUDA")(test_case)
+ return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu", "test requires Paddle+CUDA")(
+ test_case
+ )
def require_compel(test_case):
@@ -210,38 +211,32 @@ def require_compel(test_case):
Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when
the library is not installed.
"""
- return unittest.skipUnless(is_compel_available(),
- "test requires compel")(test_case)
+ return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case)
def require_fastdeploy(test_case):
"""
Decorator marking a test that requires fastdeploy. These tests are skipped when fastdeploy isn't installed.
"""
- return unittest.skipUnless(is_fastdeploy_available(),
- "test requires fastdeploy")(test_case)
+ return unittest.skipUnless(is_fastdeploy_available(), "test requires fastdeploy")(test_case)
def require_note_seq(test_case):
"""
Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed.
"""
- return unittest.skipUnless(is_note_seq_available(),
- "test requires note_seq")(test_case)
+ return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case)
-def load_numpy(arry: Union[str, np.ndarray],
- local_path: Optional[str]=None) -> np.ndarray:
+def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray:
if isinstance(arry, str):
# local_path = "/home/patrick_huggingface_co/"
if local_path is not None:
# local_path can be passed to correct images of tests
return os.path.join(
local_path,
- "/".join([
- arry.split("/")[-5], arry.split("/")[-2],
- arry.split("/")[-1]
- ]), )
+ "/".join([arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]]),
+ )
elif arry.startswith("http://") or arry.startswith("https://"):
response = requests.get(arry)
response.raise_for_status()
@@ -257,7 +252,8 @@ def load_numpy(arry: Union[str, np.ndarray],
else:
raise ValueError(
"Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a"
- " ndarray.")
+ " ndarray."
+ )
return arry
@@ -320,20 +316,17 @@ def preprocess_image(image: PIL.Image, batch_size: int):
return 2.0 * image - 1.0
-def export_to_video(video_frames: List[np.ndarray],
- output_video_path: str=None) -> str:
+def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str:
if is_opencv_available():
import cv2
else:
- raise ImportError(BACKENDS_MAPPING["opencv"][1].format(
- "export_to_video"))
+ raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video"))
if output_video_path is None:
output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
- fourcc = cv2.VideoWriter_fourcc(* "mp4v")
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
h, w, c = video_frames[0].shape
- video_writer = cv2.VideoWriter(
- output_video_path, fourcc, fps=8, frameSize=(w, h))
+ video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h))
for i in range(len(video_frames)):
img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
video_writer.write(img)
@@ -344,7 +337,8 @@ def load_hf_numpy(path) -> np.ndarray:
if not path.startswith("http://") or path.startswith("https://"):
path = os.path.join(
"https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main",
- urllib.parse.quote(path), )
+ urllib.parse.quote(path),
+ )
return load_numpy(path)
@@ -353,7 +347,8 @@ def load_ppnlp_numpy(path) -> np.ndarray:
if not path.startswith("http://") or path.startswith("https://"):
path = os.path.join(
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/diffusers-testing",
- urllib.parse.quote(path), )
+ urllib.parse.quote(path),
+ )
return load_numpy(path)
@@ -444,9 +439,7 @@ def pytest_terminal_summary_main(tr, id):
f.write("slowest durations\n")
for i, rep in enumerate(dlist):
if rep.duration < durations_min:
- f.write(
- f"{len(dlist)-i} durations < {durations_min} secs were omitted"
- )
+ f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted")
break
f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n")
@@ -460,8 +453,7 @@ def summary_failures_short(tr):
msg = tr._getfailureheadline(rep)
tr.write_sep("_", msg, red=True, bold=True)
# chop off the optional leading extra frames, leaving only the last one
- longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0,
- re.M | re.S)
+ longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S)
tr._tw.line(longrepr)
# note: not printing out any rep.sections to keep the report short
@@ -496,9 +488,7 @@ def summary_failures_short(tr):
tr.summary_warnings() # normal warnings
tr.summary_warnings() # final warnings
- tr.reportchars = (
- "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary())
- )
+ tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary())
with open(report_files["passes"], "w") as f:
tr._tw = create_terminal_writer(config, f)
tr.summary_passes()
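
`export_to_video` writes a list of HWC RGB frames to an mp4 with OpenCV. A standalone sketch of the same steps (opencv-python and numpy only; the temp-file path is throwaway):

import tempfile

import cv2
import numpy as np

video_frames = [np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8) for _ in range(8)]
output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
h, w, c = video_frames[0].shape
video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h))
for frame in video_frames:
    video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
video_writer.release()
print(output_video_path)
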
diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py
index 0940509378627..ea039412ef292 100644
--- a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py
+++ b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py
@@ -20,18 +20,8 @@
# This script references https://cocodataset.org/#keypoints-eval.
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument(
- "-g",
- "--gt",
- type=str,
- help="Assign the groud true path.",
- default=None)
- parser.add_argument(
- "-d",
- "--dt",
- type=str,
- help="Assign the detection result path.",
- default=None)
+ parser.add_argument("-g", "--gt", type=str, help="Assign the groud true path.", default=None)
+ parser.add_argument("-d", "--dt", type=str, help="Assign the detection result path.", default=None)
args = parser.parse_args()
cocoGt = COCO(args.gt)
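
The script above hands the two JSON files straight to pycocotools. A hedged sketch of the same keypoint evaluation ("gt.json" and "dt.json" are placeholder paths; the calls after `COCO(args.gt)` are not shown in this hunk and may differ slightly):

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

cocoGt = COCO("gt.json")            # ground-truth annotations (--gt)
cocoDt = cocoGt.loadRes("dt.json")  # detection results (--dt)
coco_eval = COCOeval(cocoGt, cocoDt, "keypoints")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()               # prints keypoint AP/AR
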
diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py
index 9e56042786a43..9679d0b744e9d 100644
--- a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py
+++ b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py
@@ -23,6 +23,7 @@
import paddle
import paddlehub as hub
from annotator.ppdet_hrnet.det_keypoint_unite_infer import PPDetPose
+
# import PIL
from PIL import Image
from tqdm import tqdm
@@ -46,10 +47,8 @@ def keypoint_to_openpose_kpts(coco_keypoints_list):
l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index]
r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index]
- neck_keypoint_y = int(
- (l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0)
- neck_keypoint_x = int(
- (l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0)
+ neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0)
+ neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0)
neck_keypoint = [
neck_keypoint_x,
neck_keypoint_y,
@@ -72,19 +71,19 @@ def __call__(self, oriImg, detect_resolution=512, hand=False):
img_scalarfactor = detect_resolution / min(oriImg.shape[:2])
result, poseres = self.ppdetpose_pred(oriImg)
result["candidate"] = result["candidate"] * img_scalarfactor
- oriImg = cv2.resize(
- oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor)
+ oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor)
canvas = oriImg.copy()
canvas.fill(0)
- canvas = self.body_estimation.draw_pose(canvas, result["candidate"],
- result["subset"])
+ canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"])
return (
canvas,
dict(
candidate=result["candidate"].tolist(),
- subset=result["subset"].tolist(), ),
- poseres, )
+ subset=result["subset"].tolist(),
+ ),
+ poseres,
+ )
def ppdetpose_pred(self, image, kpt_threshold=0.3):
poseres = self.ppdetpose.ppdet_hrnet_infer(image)
@@ -98,7 +97,12 @@ def ppdetpose_pred(self, image, kpt_threshold=0.3):
for idx, item in enumerate(openpose_kpts):
if item[2] > kpt_threshold:
subset[kptid][idx] = posnum
- kpt = np.array(item + [posnum, ])
+ kpt = np.array(
+ item
+ + [
+ posnum,
+ ]
+ )
candidate = np.vstack((candidate, kpt))
posnum += 1
return {"candidate": candidate, "subset": subset}, poseres
@@ -138,7 +142,8 @@ def resize_image(input_image, resolution):
img = cv2.resize(
input_image,
(W, H),
- interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, )
+ interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA,
+ )
return img
@@ -151,11 +156,7 @@ def get_keypoints_result_coco_format(paths, detector, do_gt):
out_dir_path = pathlib.Path(paths[2])
if not os.path.exists(out_dir_path):
os.makedirs(out_dir_path)
- files = sorted([
- file
- for ext in IMAGE_EXTENSIONS
- for file in in_dir_path.glob("*.{}".format(ext))
- ])
+ files = sorted([file for ext in IMAGE_EXTENSIONS for file in in_dir_path.glob("*.{}".format(ext))])
output = []
index = -1
for file in tqdm(files):
@@ -165,8 +166,7 @@ def get_keypoints_result_coco_format(paths, detector, do_gt):
input_image = HWC3(im)
canvas, keypoints_result, poseres = detector(input_image)
if len(paths) == 3:
- Image.fromarray(canvas).save(
- os.path.join(out_dir_path, os.path.basename(file)))
+ Image.fromarray(canvas).save(os.path.join(out_dir_path, os.path.basename(file)))
if len(poseres["keypoint"][0]) == 0:
sample_dict = {
"image_id": index,
@@ -209,76 +209,72 @@ def get_keypoints_result_coco_format(paths, detector, do_gt):
json.dumps(
{
"annotations": output,
- "images": [{
- "id": item
- } for item in list(range(index + 1))],
- "categories": [{
- "supercategory": "person",
- "id": 1,
- "name": "person",
- "keypoints": [
- "nose",
- "left_eye",
- "right_eye",
- "left_ear",
- "right_ear",
- "left_shoulder",
- "right_shoulder",
- "left_elbow",
- "right_elbow",
- "left_wrist",
- "right_wrist",
- "left_hip",
- "right_hip",
- "left_knee",
- "right_knee",
- "left_ankle",
- "right_ankle",
- ],
- "skeleton": [
- [16, 14],
- [14, 12],
- [17, 15],
- [15, 13],
- [12, 13],
- [6, 12],
- [7, 13],
- [6, 7],
- [6, 8],
- [7, 9],
- [8, 10],
- [9, 11],
- [2, 3],
- [1, 2],
- [1, 3],
- [2, 4],
- [3, 5],
- [4, 6],
- [5, 7],
- ],
- }],
+ "images": [{"id": item} for item in list(range(index + 1))],
+ "categories": [
+ {
+ "supercategory": "person",
+ "id": 1,
+ "name": "person",
+ "keypoints": [
+ "nose",
+ "left_eye",
+ "right_eye",
+ "left_ear",
+ "right_ear",
+ "left_shoulder",
+ "right_shoulder",
+ "left_elbow",
+ "right_elbow",
+ "left_wrist",
+ "right_wrist",
+ "left_hip",
+ "right_hip",
+ "left_knee",
+ "right_knee",
+ "left_ankle",
+ "right_ankle",
+ ],
+ "skeleton": [
+ [16, 14],
+ [14, 12],
+ [17, 15],
+ [15, 13],
+ [12, 13],
+ [6, 12],
+ [7, 13],
+ [6, 7],
+ [6, 8],
+ [7, 9],
+ [8, 10],
+ [9, 11],
+ [2, 3],
+ [1, 2],
+ [1, 3],
+ [2, 4],
+ [3, 5],
+ [4, 6],
+ [5, 7],
+ ],
+ }
+ ],
},
- indent=4, ))
+ indent=4,
+ )
+ )
else:
json_file.write(json.dumps(output, indent=4))
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
-parser.add_argument(
- "--do_gt",
- action="store_true",
- help="whether to predict unseen future data")
+parser.add_argument("--do_gt", action="store_true", help="whether to predict unseen future data")
parser.add_argument(
"path",
type=str,
nargs=3,
- help=(
- "Paths to the input images dir, output json file, and output openpose images dir"
- ), )
+ help=("Paths to the input images dir, output json file, and output openpose images dir"),
+)
-IMAGE_EXTENSIONS = {
- "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"
-}
+IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"}
if __name__ == "__main__":
args = parser.parse_args()
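
`keypoint_to_openpose_kpts` inserts a synthetic neck joint as the midpoint of the two shoulder keypoints when converting COCO keypoints to the OpenPose layout. A standalone sketch of just that step (coordinates are made up; the confidence rule is an assumption, since the hunk truncates before it):

l_shoulder_keypoint = [210.0, 118.0, 0.93]  # x, y, confidence
r_shoulder_keypoint = [154.0, 122.0, 0.91]

neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0)
neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0)
neck_confidence = min(l_shoulder_keypoint[2], r_shoulder_keypoint[2])  # assumed rule
print([neck_keypoint_x, neck_keypoint_y, neck_confidence])  # [182, 120, 0.91]
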
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py
index 015f143827f2b..d5fb70e8d90c9 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py
@@ -16,17 +16,24 @@
import paddle
import torch
-from diffusers import \
- StableDiffusionImageVariationPipeline as \
- DiffusersStableDiffusionImageVariationPipeline
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPVisionConfig,
- CLIPVisionModelWithProjection)
-
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
-from ppdiffusers import \
- StableDiffusionImageVariationPipeline as \
- PPDiffusersStableDiffusionImageVariationPipeline
+from diffusers import (
+ StableDiffusionImageVariationPipeline as DiffusersStableDiffusionImageVariationPipeline,
+)
+from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPVisionConfig,
+ CLIPVisionModelWithProjection,
+)
+
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ppdiffusers import (
+ StableDiffusionImageVariationPipeline as PPDiffusersStableDiffusionImageVariationPipeline,
+)
from ppdiffusers import UNet2DConditionModel
from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
@@ -47,10 +54,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32"):
return new_vae_or_unet
-def convert_hf_clip_to_ppnlp_clip(clip,
- dtype="float32",
- is_text_encoder=True,
- need_prefix=False):
+def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False):
new_model_state = {}
transformers2ppnlp = {
".encoder.": ".transformer.",
@@ -69,9 +73,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -85,7 +87,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name and need_prefix:
name = "clip." + name
@@ -123,8 +125,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
"vision_heads": clip.config.num_attention_heads,
"vision_embed_dim": clip.config.hidden_size,
"vision_patch_size": clip.config.patch_size,
- "vision_mlp_ratio":
- clip.config.intermediate_size // clip.config.hidden_size,
+ "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size,
"vision_hidden_act": clip.config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
@@ -148,17 +149,19 @@ def check_keys(model, state_dict):
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!")
-def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
- output_path=None):
+def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersStableDiffusionImageVariationPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False)
+ diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False
+ )
safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True)
+ diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True
+ )
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -173,18 +176,14 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
# make sure
vision_config.update({"projection_dim": pp_unet.config.cross_attention_dim})
- safety_checker_config.update({
- "projection_dim": pp_unet.config.cross_attention_dim
- })
+ safety_checker_config.update({"projection_dim": pp_unet.config.cross_attention_dim})
# 3. image_encoder
- image_encoder = CLIPVisionModelWithProjection(
- CLIPVisionConfig.from_dict(vision_config))
+ image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config))
image_encoder.set_dict(image_encoder_state_dict)
check_keys(image_encoder, image_encoder_state_dict)
# 4. safety_checker
- pp_safety_checker = StableDiffusionSafetyChecker(
- CLIPVisionConfig.from_dict(safety_checker_config))
+ pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config))
pp_safety_checker.set_dict(safety_checker_state_dict)
check_keys(pp_safety_checker, safety_checker_state_dict)
# 5. scheduler
@@ -200,12 +199,10 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "lms" in scheduler_type:
- pp_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif "ddim" in scheduler_type:
pp_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -214,12 +211,12 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
- pp_feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-v1-4/feature_extractor")
+ pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor")
# 7. create ppdiffusers pipe
paddle_pipe = PPDiffusersStableDiffusionImageVariationPipeline(
@@ -228,15 +225,15 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
unet=pp_unet,
safety_checker=pp_safety_checker,
feature_extractor=pp_feature_extractor,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
# 8. save_pretrained
paddle_pipe.save_pretrained(output_path)
return paddle_pipe
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -247,7 +244,7 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
"--output_path",
type=str,
default="sd-image-variations-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
- ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path)
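
Besides the CLI, the conversion entry point can be driven from Python. A hedged sketch, assuming the script is run from its own directory with diffusers, torch, paddlenlp, and ppdiffusers installed; the source checkpoint id is an assumption, not taken from this diff:

from convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers import (
    convert_diffusers_to_ppdiffusers,
)

paddle_pipe = convert_diffusers_to_ppdiffusers(
    "lambdalabs/sd-image-variations-diffusers",     # assumed source diffusers checkpoint
    output_path="sd-image-variations-ppdiffusers",  # matches the script's default
)
print(type(paddle_pipe).__name__)
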
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py
index 756e12bb3c97b..f0a64446ba1d7 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py
@@ -17,14 +17,21 @@
import paddle
import torch
-from diffusers import \
- StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline
+from diffusers import (
+ StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline,
+)
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- LMSDiscreteScheduler, PNDMScheduler)
-from ppdiffusers import \
- StableDiffusionUpscalePipeline as PPDiffusersStableDiffusionUpscalePipeline
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ppdiffusers import (
+ StableDiffusionUpscalePipeline as PPDiffusersStableDiffusionUpscalePipeline,
+)
from ppdiffusers import UNet2DConditionModel
paddle.set_device("cpu")
@@ -63,9 +70,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -79,7 +84,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name:
name = "clip." + name
@@ -104,23 +109,23 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
"vision_heads": clip.config.vision_config.num_attention_heads,
"vision_embed_dim": clip.config.vision_config.hidden_size,
"vision_patch_size": clip.config.vision_config.patch_size,
- "vision_mlp_ratio": clip.config.vision_config.intermediate_size //
- clip.config.vision_config.hidden_size,
+ "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size,
"vision_hidden_act": clip.config.vision_config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
return new_model_state, new_config
-def convert_diffusers_stable_diffusion_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersStableDiffusionUpscalePipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder, is_text_encoder=True)
+ diffusers_pipe.text_encoder, is_text_encoder=True
+ )
max_noise_level = diffusers_pipe.max_noise_level
# 1. vae
@@ -134,8 +139,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
pp_unet.set_dict(unet_state_dict)
# 3. text_encoder
- pp_text_encoder = CLIPTextModel(
- CLIPTextConfig.from_dict(text_encoder_config))
+ pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
# 4. scheduler
@@ -150,12 +154,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
beta_schedule=beta_schedule,
beta_start=beta_start,
num_train_timesteps=num_train_timesteps,
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "lms" in scheduler_type:
- pp_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule=beta_schedule)
+ pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule)
elif "ddim" in scheduler_type:
pp_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -164,7 +166,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
clip_sample=False,
prediction_type="v_prediction",
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
@@ -183,18 +186,19 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "ddpm" in scheduler_type:
pp_low_res_scheduler = DDPMScheduler(
beta_end=beta_end,
beta_schedule=beta_schedule,
beta_start=beta_start,
- num_train_timesteps=num_train_timesteps, )
+ num_train_timesteps=num_train_timesteps,
+ )
elif "lms" in scheduler_type:
pp_low_res_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule=beta_schedule)
+ beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule
+ )
elif "ddim" in scheduler_type:
pp_low_res_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -203,7 +207,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
@@ -219,7 +224,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
tokenizer=pp_tokenizer,
unet=pp_unet,
low_res_scheduler=pp_low_res_scheduler,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
# 9. save_pretrained
paddle_pipe.save_pretrained(output_path)
@@ -227,8 +233,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -239,7 +244,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
"--output_path",
type=str,
default="stable-diffusion-x4-upscaler-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py
index 4c29f3059b3a1..b3e0ece7e6a03 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py
@@ -17,16 +17,26 @@
import paddle
import torch
-from diffusers import \
- VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline
+from diffusers import VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline
from paddlenlp.transformers import (
- CLIPFeatureExtractor, CLIPTextConfig, CLIPTextModelWithProjection,
- CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection)
+ CLIPFeatureExtractor,
+ CLIPTextConfig,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionConfig,
+ CLIPVisionModelWithProjection,
+)
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler, UNet2DConditionModel)
-from ppdiffusers import \
- VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
+from ppdiffusers import (
+ VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline,
+)
from ppdiffusers.pipelines.versatile_diffusion import UNetFlatConditionModel
paddle.set_device("cpu")
@@ -46,10 +56,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32"):
return new_vae_or_unet
-def convert_hf_clip_to_ppnlp_clip(clip,
- dtype="float32",
- is_text_encoder=True,
- need_prefix=False):
+def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False):
new_model_state = {}
transformers2ppnlp = {
".encoder.": ".transformer.",
@@ -68,9 +75,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -84,7 +89,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name and need_prefix:
name = "clip." + name
@@ -122,8 +127,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
"vision_heads": clip.config.num_attention_heads,
"vision_embed_dim": clip.config.hidden_size,
"vision_patch_size": clip.config.patch_size,
- "vision_mlp_ratio":
- clip.config.intermediate_size // clip.config.hidden_size,
+ "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size,
"vision_hidden_act": clip.config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
@@ -147,20 +151,22 @@ def check_keys(model, state_dict):
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!")
-def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
- output_path=None):
+def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersVersatileDiffusionPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
image_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.image_unet)
text_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_unet)
text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False)
+ diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False
+ )
image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False)
+ diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False
+ )
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -179,14 +185,12 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
check_keys(pp_text_unet, text_unet_state_dict)
# 4. image_encoder
- pp_image_encoder = CLIPVisionModelWithProjection(
- CLIPVisionConfig.from_dict(vision_config))
+ pp_image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config))
pp_image_encoder.set_dict(image_encoder_state_dict)
check_keys(pp_image_encoder, image_encoder_state_dict)
# 5. text_encoder
- pp_text_encoder = CLIPTextModelWithProjection(
- CLIPTextConfig.from_dict(text_config))
+ pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
check_keys(pp_text_encoder, text_encoder_state_dict)
@@ -203,12 +207,10 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "lms" in scheduler_type:
- pp_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif "ddim" in scheduler_type:
pp_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -217,13 +219,13 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
with tempfile.TemporaryDirectory() as tmpdirname:
- pp_feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-v1-4/feature_extractor")
+ pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor")
# 7. tokenizer
diffusers_pipe.tokenizer.save_pretrained(tmpdirname)
pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname)
@@ -236,15 +238,15 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
image_unet=pp_image_unet,
text_unet=pp_text_unet,
vae=pp_vae,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
# 9. save_pretrained
paddle_pipe.save_pretrained(output_path)
return paddle_pipe
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -255,7 +257,7 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path,
"--output_path",
type=str,
default="versatile-diffusion-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
- ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path)
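
Note: the hunks above only re-wrap the VersatileDiffusion converter; its logic is unchanged. For readers skimming the diff, the heart of convert_to_ppdiffusers() is the torch-to-Paddle weight-layout change. A minimal sketch of that idea, assuming only torch.nn.Linear weights need transposing (conv kernels keep their layout); the helper name is illustrative, not the script's:

import torch


def torch_to_paddle_state(model: torch.nn.Module, dtype: str = "float32") -> dict:
    # Collect fully-qualified names of Linear weights; only those are transposed.
    linear_weight_names = set()
    for module_name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prefix = module_name + "." if module_name else ""
            linear_weight_names.add(prefix + "weight")

    new_state = {}
    for name, value in model.state_dict().items():
        array = value.detach().cpu().numpy().astype(dtype)
        if name in linear_weight_names:
            # torch Linear stores weights as [out, in]; paddle.nn.Linear expects [in, out]
            array = array.T
        new_state[name] = array
    return new_state

The resulting dict is what these scripts then hand to the Paddle module via set_dict(...).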
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py
index 62de6daa072d9..ff8c68985a249 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py
@@ -18,17 +18,28 @@
import paddle
import torch
from diffusers import AltDiffusionPipeline as DiffusersAltDiffusionPipeline
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPVisionConfig,
- XLMRobertaTokenizer)
+from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPVisionConfig,
+ XLMRobertaTokenizer,
+)
from ppdiffusers import AltDiffusionPipeline as PPDiffusersAltDiffusionPipeline
from ppdiffusers import (
- AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler,
- UNet2DConditionModel)
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
- RobertaSeriesConfig, RobertaSeriesModelWithTransformation)
+ RobertaSeriesConfig,
+ RobertaSeriesModelWithTransformation,
+)
from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
paddle.set_device("cpu")
@@ -67,9 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -83,7 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name:
name = "clip." + name
@@ -108,8 +117,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
"vision_heads": clip.config.vision_config.num_attention_heads,
"vision_embed_dim": clip.config.vision_config.hidden_size,
"vision_patch_size": clip.config.vision_config.patch_size,
- "vision_mlp_ratio": clip.config.vision_config.intermediate_size //
- clip.config.vision_config.hidden_size,
+ "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size,
"vision_hidden_act": clip.config.vision_config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
@@ -119,10 +127,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"):
new_model_state = {}
mappings = [
- [
- "embeddings.word_embeddings.weight",
- "embeddings.word_embeddings.weight"
- ],
+ ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"],
[
"embeddings.position_embeddings.weight",
"embeddings.position_embeddings.weight",
@@ -224,21 +229,17 @@ def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"):
hf_name = prefix + hf_name
pp_name = prefix + pp_name
if need_transpose:
- new_model_state[pp_name] = (
- state_dict[hf_name].t().cpu().numpy().astype(dtype))
+ new_model_state[pp_name] = state_dict[hf_name].t().cpu().numpy().astype(dtype)
else:
- new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype(
- dtype)
+ new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype(dtype)
new_config = xlm_roberta.config.to_dict()
return new_model_state, new_config
-def convert_diffusers_stable_diffusion_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
- diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True)
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
(
@@ -246,7 +247,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
text_encoder_config,
) = convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(diffusers_pipe.text_encoder)
safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.safety_checker, is_text_encoder=False)
+ diffusers_pipe.safety_checker, is_text_encoder=False
+ )
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -264,8 +266,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
pp_text_encoder.set_dict(text_encoder_state_dict)
# 4. safety_checker
- pp_safety_checker = StableDiffusionSafetyChecker(
- CLIPVisionConfig.from_dict(safety_checker_config))
+ pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config))
pp_safety_checker.set_dict(safety_checker_state_dict)
# 5. scheduler
@@ -281,7 +282,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
num_train_timesteps=num_train_timesteps,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
# make sure scheduler works correctly with DDIM
scheduler.register_to_config(clip_sample=False)
@@ -296,8 +298,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
elif scheduler_type == "euler":
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
elif scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- scheduler.config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
elif scheduler_type == "dpm":
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
elif scheduler_type == "ddim":
@@ -308,8 +309,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
with tempfile.TemporaryDirectory() as tmpdirname:
# 6. feature_extractor
# diffusers_pipe.feature_extractor.save_pretrained(tmpdirname)
- pp_feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-v1-4/feature_extractor")
+ pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor")
# 7. tokenizer
diffusers_pipe.tokenizer.save_pretrained(tmpdirname)
@@ -323,15 +323,15 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
unet=pp_unet,
safety_checker=pp_safety_checker,
feature_extractor=pp_feature_extractor,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
# 9. save_pretrained
paddle_pipe.save_pretrained(output_path)
return paddle_pipe
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -342,7 +342,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
"--output_path",
type=str,
default="AltDiffusion-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
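
The AltDiffusion converter above shares the convert_hf_clip_to_ppnlp_clip() pattern: drop buffer keys, transpose 2-D weights, rename HF key fragments to their PaddleNLP counterparts, and turn the 0-d logit_scale into a 1-d tensor. A minimal sketch of that idea; the rename table is a small illustrative subset and the function name is hypothetical:

import numpy as np

RENAMES = {
    ".encoder.": ".transformer.",
    ".layer_norm": ".norm",
    ".fc1.": ".linear1.",
    ".fc2.": ".linear2.",
}
DONOT_TRANSPOSE = ("embeddings", "norm", "concept_embeds", "special_care_embeds")


def remap_clip_state(hf_state, dtype="float32"):
    """hf_state: {name: numpy array}, e.g. {k: v.cpu().numpy() for k, v in clip.state_dict().items()}."""
    new_state = {}
    for name, value in hf_state.items():
        if "position_ids" in name:  # step1: buffers rebuilt on the Paddle side
            continue
        array = np.asarray(value).astype(dtype)
        # step2: transpose 2-D weights unless they belong to embeddings / norms / safety-checker buffers
        if array.ndim == 2 and not any(key in name for key in DONOT_TRANSPOSE):
            array = array.T
        # step3: rename HF key fragments to PaddleNLP names
        for hf_key, pp_key in RENAMES.items():
            name = name.replace(hf_key, pp_key)
        # step4: 0-d tensor -> 1-d tensor
        if name == "logit_scale":
            array = array.reshape((1,))
        new_state[name] = array
    return new_state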
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py
index 17aa70d3ef95a..bd8d3e8bbb152 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py
@@ -40,8 +40,7 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"):
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -52,11 +51,11 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"):
"--output_path",
type=str,
default="paddle_models/sd-controlnet-canny",
- help="The output path.", )
+ help="The output path.",
+ )
args = parser.parse_args()
- th_controlnet = DiffusersControlNetModel.from_pretrained(
- args.pretrained_model_name_or_path)
+ th_controlnet = DiffusersControlNetModel.from_pretrained(args.pretrained_model_name_or_path)
controlnet_state_dict = convert_to_ppdiffusers(th_controlnet)
pp_controlnet = PPDiffusersControlNetModel.from_config(th_controlnet.config)
pp_controlnet.set_dict(controlnet_state_dict)
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py
index 021da51309528..7cd30d3c3e077 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py
@@ -21,10 +21,8 @@
from paddlenlp.transformers import BertTokenizer
from ppdiffusers import AutoencoderKL, DDIMScheduler, LDMBertModel
-from ppdiffusers import \
- LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline
-from ppdiffusers import (LMSDiscreteScheduler, PNDMScheduler,
- UNet2DConditionModel)
+from ppdiffusers import LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline
+from ppdiffusers import LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
paddle.set_device("cpu")
@@ -87,15 +85,14 @@ def convert_hf_ldmbert_to_ppnlp_ldmbert(ldmbert, dtype="float32"):
return new_model_state, new_config
-def convert_diffusers_stable_diffusion_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersLDMTextToImagePipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
- bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert(
- diffusers_pipe.bert)
+ bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert(diffusers_pipe.bert)
# 1. vqvae
pp_vqvae = AutoencoderKL.from_config(diffusers_pipe.vqvae.config)
@@ -123,12 +120,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "lms" in scheduler_type:
- pp_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif "ddim" in scheduler_type:
pp_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -137,15 +132,15 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
with tempfile.TemporaryDirectory() as tmpdirname:
# 5. tokenizer
diffusers_pipe.tokenizer.save_pretrained(tmpdirname)
- pp_tokenizer = BertTokenizer.from_pretrained(
- tmpdirname, model_max_length=77)
+ pp_tokenizer = BertTokenizer.from_pretrained(tmpdirname, model_max_length=77)
# 6. create ppdiffusers pipe
paddle_pipe = PPDiffusersLDMTextToImagePipeline(
@@ -153,7 +148,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
bert=pp_bert,
tokenizer=pp_tokenizer,
unet=pp_unet,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
# 7. save_pretrained
paddle_pipe.save_pretrained(output_path)
@@ -161,8 +157,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -173,7 +168,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
"--output_path",
type=str,
default="ldm-text2im-large-256-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
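
The latent-diffusion converter above, like most scripts in this directory, dispatches the scheduler on a substring of scheduler_type. A compact sketch of that dispatch; the default beta values are assumptions here, whereas the real scripts read them from the source pipeline's scheduler config:

from ppdiffusers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler


def build_scheduler(scheduler_type, beta_start=0.00085, beta_end=0.012, num_train_timesteps=1000):
    if "pndm" in scheduler_type:
        return PNDMScheduler(
            beta_start=beta_start,
            beta_end=beta_end,
            beta_schedule="scaled_linear",
            num_train_timesteps=num_train_timesteps,
            skip_prk_steps=True,  # as the scripts note, keeps the scheduler compatible with PNDM checkpoints
        )
    if "lms" in scheduler_type:
        return LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
    if "ddim" in scheduler_type:
        return DDIMScheduler(
            beta_start=beta_start,
            beta_end=beta_end,
            beta_schedule="scaled_linear",
            clip_sample=False,        # keep DDIM compatible with these checkpoints
            set_alpha_to_one=False,
            steps_offset=1,
        )
    raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")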
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py
index 212808cd405fa..519d032808939 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py
@@ -18,15 +18,16 @@
import paddle
import torch
from diffusers import PaintByExamplePipeline as DiffusersPaintByExamplePipeline
+
# CLIPImageProcessor need paddlenlp latest
from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig
from ppdiffusers import AutoencoderKL
-from ppdiffusers import \
- PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline
+from ppdiffusers import PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline
from ppdiffusers import PNDMScheduler, UNet2DConditionModel
-from ppdiffusers.pipelines.paint_by_example.image_encoder import \
- PaintByExampleImageEncoder
+from ppdiffusers.pipelines.paint_by_example.image_encoder import (
+ PaintByExampleImageEncoder,
+)
paddle.set_device("cpu")
@@ -63,9 +64,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"):
".post_layernorm.": ".ln_post.",
}
ignore_value = ["position_ids", "mapper"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids and mapper
@@ -79,7 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
new_model_state[name] = value.cpu().numpy().astype(dtype)
@@ -93,8 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"):
"vision_heads": clip.config.num_attention_heads,
"vision_embed_dim": clip.config.hidden_size,
"vision_patch_size": clip.config.patch_size,
- "vision_mlp_ratio":
- clip.config.intermediate_size // clip.config.hidden_size,
+ "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size,
"vision_hidden_act": clip.config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
@@ -118,15 +116,14 @@ def check_keys(model, state_dict):
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!")
-def convert_diffusers_paintbyexample_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_paintbyexample_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersPaintByExamplePipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
- image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.image_encoder)
+ image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.image_encoder)
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -138,8 +135,7 @@ def convert_diffusers_paintbyexample_to_ppdiffusers(
check_keys(pp_unet, unet_state_dict)
# 3. image_encoder
- pp_image_encoder = PaintByExampleImageEncoder(
- CLIPVisionConfig.from_dict(image_encoder_config))
+ pp_image_encoder = PaintByExampleImageEncoder(CLIPVisionConfig.from_dict(image_encoder_config))
pp_image_encoder.set_dict(image_encoder_state_dict)
check_keys(pp_image_encoder, image_encoder_state_dict)
# 4. scheduler
@@ -158,7 +154,8 @@ def convert_diffusers_paintbyexample_to_ppdiffusers(
scheduler=pp_scheduler,
safety_checker=None,
feature_extractor=feature_extractor,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
# 6. save_pretrained
paddle_pipe.save_pretrained(output_path)
@@ -166,8 +163,7 @@ def convert_diffusers_paintbyexample_to_ppdiffusers(
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -178,7 +174,9 @@ def convert_diffusers_paintbyexample_to_ppdiffusers(
"--output_path",
type=str,
default="./Paint-by-Example",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_paintbyexample_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
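
Several converters, including the Paint-by-Example one above, follow each set_dict() call with a check_keys() pass. A rough sketch of what such a sanity check does, assuming it only needs to report key mismatches (the exact messages printed by the scripts may differ):

def check_keys(model, state_dict):
    cls_name = model.__class__.__name__
    expected = set(model.state_dict().keys())
    provided = set(state_dict.keys())
    missing = sorted(expected - provided)        # parameters the converted dict does not cover
    unexpected = sorted(provided - expected)     # keys the Paddle model does not expect
    if missing:
        print(f"{cls_name} Found missing_keys {', '.join(missing)}!")
    if unexpected:
        print(f"{cls_name} Found mismatched_keys {', '.join(unexpected)}!")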
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py
index f1d3d6bd2462f..fa189095cbb9d 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py
@@ -17,17 +17,22 @@
import paddle
import torch
-from diffusers import \
- StableDiffusionDepth2ImgPipeline as \
- DiffusersStableDiffusionDepth2ImgPipeline
-from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel,
- CLIPTokenizer, DPTConfig,
- DPTForDepthEstimation, DPTImageProcessor)
+from diffusers import (
+ StableDiffusionDepth2ImgPipeline as DiffusersStableDiffusionDepth2ImgPipeline,
+)
+from paddlenlp.transformers import (
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTokenizer,
+ DPTConfig,
+ DPTForDepthEstimation,
+ DPTImageProcessor,
+)
from ppdiffusers import AutoencoderKL, PNDMScheduler
-from ppdiffusers import \
- StableDiffusionDepth2ImgPipeline as \
- PPDiffusersStableDiffusionDepth2ImgPipeline
+from ppdiffusers import (
+ StableDiffusionDepth2ImgPipeline as PPDiffusersStableDiffusionDepth2ImgPipeline,
+)
from ppdiffusers import UNet2DConditionModel
paddle.set_device("cpu")
@@ -66,9 +71,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -82,7 +85,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
new_model_state[name] = value.cpu().numpy().astype(dtype)
@@ -117,17 +120,15 @@ def check_keys(model, state_dict):
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!")
-def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersStableDiffusionDepth2ImgPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
- depth_estimator_state_dict = convert_to_ppdiffusers(
- diffusers_pipe.depth_estimator)
- text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder)
+ depth_estimator_state_dict = convert_to_ppdiffusers(diffusers_pipe.depth_estimator)
+ text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.text_encoder)
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -138,8 +139,7 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(
pp_unet.set_dict(unet_state_dict)
check_keys(pp_unet, unet_state_dict)
# 3. text_encoder
- pp_text_encoder = CLIPTextModel(
- CLIPTextConfig.from_dict(text_encoder_config))
+ pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
check_keys(pp_text_encoder, text_encoder_state_dict)
# 4. scheduler
@@ -168,7 +168,8 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(
unet=pp_unet,
feature_extractor=pp_feature_extractor,
depth_estimator=pp_depth_estimator,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
# 9. save_pretrained
paddle_pipe.save_pretrained(output_path)
@@ -176,8 +177,7 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -188,7 +188,9 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(
"--output_path",
type=str,
default="stable-diffusion-2-depth",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py
index 9ec5f95b55248..bd8178c872874 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py
@@ -17,18 +17,27 @@
import paddle
import torch
-from diffusers import \
- StableDiffusionControlNetPipeline as \
- DiffusersStableDiffusionControlNetPipeline
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextConfig,
- CLIPTextModel, CLIPTokenizer,
- CLIPVisionConfig)
-
-from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler,
- LMSDiscreteScheduler, PNDMScheduler)
-from ppdiffusers import \
- StableDiffusionControlNetPipeline as \
- PPDiffusersStableDiffusionControlNetPipeline
+from diffusers import (
+ StableDiffusionControlNetPipeline as DiffusersStableDiffusionControlNetPipeline,
+)
+from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTokenizer,
+ CLIPVisionConfig,
+)
+
+from ppdiffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ppdiffusers import (
+ StableDiffusionControlNetPipeline as PPDiffusersStableDiffusionControlNetPipeline,
+)
from ppdiffusers import UNet2DConditionModel
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
@@ -69,9 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -85,7 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name:
name = "clip." + name
@@ -110,26 +117,25 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
"vision_heads": clip.config.vision_config.num_attention_heads,
"vision_embed_dim": clip.config.vision_config.hidden_size,
"vision_patch_size": clip.config.vision_config.patch_size,
- "vision_mlp_ratio": clip.config.vision_config.intermediate_size //
- clip.config.vision_config.hidden_size,
+ "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size,
"vision_hidden_act": clip.config.vision_config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
return new_model_state, new_config
-def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersStableDiffusionControlNetPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
- requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker",
- False)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
+ requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False)
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
controlnet_state_dict = convert_to_ppdiffusers(diffusers_pipe.controlnet)
text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder, is_text_encoder=True)
+ diffusers_pipe.text_encoder, is_text_encoder=True
+ )
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -142,14 +148,12 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
pp_unet.set_dict(unet_state_dict)
# 3. controlnet
- pp_controlnet = ControlNetModel.from_config(
- diffusers_pipe.controlnet.config)
+ pp_controlnet = ControlNetModel.from_config(diffusers_pipe.controlnet.config)
pp_controlnet.set_dict(controlnet_state_dict)
# 4. text_encoder
- pp_text_encoder = CLIPTextModel(
- CLIPTextConfig.from_dict(text_encoder_config))
+ pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
# 5. scheduler
@@ -165,12 +169,10 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "lms" in scheduler_type:
- pp_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif "ddim" in scheduler_type:
pp_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -179,7 +181,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
@@ -192,14 +195,14 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
# 7. feature_extractor
# diffusers_pipe.feature_extractor.save_pretrained(tmpdirname)
pp_feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-v1-4/feature_extractor")
+ "CompVis/stable-diffusion-v1-4/feature_extractor"
+ )
# 8. safety_checker
(
safety_checker_state_dict,
- safety_checker_config, ) = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.safety_checker, is_text_encoder=False)
- pp_safety_checker = StableDiffusionSafetyChecker(
- CLIPVisionConfig.from_dict(safety_checker_config))
+ safety_checker_config,
+ ) = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.safety_checker, is_text_encoder=False)
+ pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config))
pp_safety_checker.set_dict(safety_checker_state_dict)
# 9. create ppdiffusers pipe
paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline(
@@ -210,7 +213,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
controlnet=pp_controlnet,
safety_checker=pp_safety_checker,
feature_extractor=pp_feature_extractor,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
else:
# 9. create ppdiffusers pipe
paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline(
@@ -222,7 +226,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
safety_checker=None,
feature_extractor=None,
scheduler=pp_scheduler,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path:
_internal_dict = dict(paddle_pipe._internal_dict)
if _internal_dict["_ppdiffusers_version"] == "0.0.0":
@@ -234,8 +239,7 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -246,7 +250,9 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
"--output_path",
type=str,
default="control_sd15_canny-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
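
The ControlNet converter above can also be driven from Python instead of the argparse CLI in its __main__ block (--pretrained_model_name_or_path / --output_path). A usage sketch; the source path and output directory below are placeholders, not values from the repository:

from convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers import (
    convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers,
)

# Convert a diffusers SD+ControlNet pipeline to ppdiffusers format and save it.
paddle_pipe = convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
    "path/to/diffusers-sd-controlnet-pipeline",   # hypothetical source pipeline
    output_path="control_sd15_canny-ppdiffusers",
)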
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py
index 6d3811cc0bc82..a3374a432caa4 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py
@@ -17,16 +17,22 @@
import paddle
import torch
-from diffusers import \
- StableDiffusionPipeline as DiffusersStableDiffusionPipeline
-from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextConfig,
- CLIPTextModel, CLIPTokenizer,
- CLIPVisionConfig)
-
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler)
-from ppdiffusers import \
- StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline
+from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline
+from paddlenlp.transformers import (
+ CLIPFeatureExtractor,
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTokenizer,
+ CLIPVisionConfig,
+)
+
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline
from ppdiffusers import UNet2DConditionModel
from ppdiffusers.configuration_utils import FrozenDict
from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
@@ -67,9 +73,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -83,7 +87,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name:
name = "clip." + name
@@ -108,25 +112,24 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
"vision_heads": clip.config.vision_config.num_attention_heads,
"vision_embed_dim": clip.config.vision_config.hidden_size,
"vision_patch_size": clip.config.vision_config.patch_size,
- "vision_mlp_ratio": clip.config.vision_config.intermediate_size //
- clip.config.vision_config.hidden_size,
+ "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size,
"vision_hidden_act": clip.config.vision_config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
return new_model_state, new_config
-def convert_diffusers_stable_diffusion_to_ppdiffusers(
- pretrained_model_name_or_path, output_path=None):
+def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
diffusers_pipe = DiffusersStableDiffusionPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
- requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker",
- False)
+ pretrained_model_name_or_path, use_auth_token=True
+ )
+ requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False)
vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae)
unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet)
text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder, is_text_encoder=True)
+ diffusers_pipe.text_encoder, is_text_encoder=True
+ )
# 1. vae
pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config)
@@ -139,8 +142,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
pp_unet.set_dict(unet_state_dict)
# 3. text_encoder
- pp_text_encoder = CLIPTextModel(
- CLIPTextConfig.from_dict(text_encoder_config))
+ pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
# 4. scheduler
@@ -156,12 +158,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif "lms" in scheduler_type:
- pp_scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif "ddim" in scheduler_type:
pp_scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -170,7 +170,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
@@ -183,14 +184,14 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
# 6. feature_extractor
# diffusers_pipe.feature_extractor.save_pretrained(tmpdirname)
pp_feature_extractor = CLIPFeatureExtractor.from_pretrained(
- "CompVis/stable-diffusion-v1-4/feature_extractor")
+ "CompVis/stable-diffusion-v1-4/feature_extractor"
+ )
# 7. safety_checker
(
safety_checker_state_dict,
- safety_checker_config, ) = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.safety_checker, is_text_encoder=False)
- pp_safety_checker = StableDiffusionSafetyChecker(
- CLIPVisionConfig.from_dict(safety_checker_config))
+ safety_checker_config,
+ ) = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.safety_checker, is_text_encoder=False)
+ pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config))
pp_safety_checker.set_dict(safety_checker_state_dict)
# 8. create ppdiffusers pipe
paddle_pipe = PPDiffusersStableDiffusionPipeline(
@@ -200,7 +201,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
unet=pp_unet,
safety_checker=pp_safety_checker,
feature_extractor=pp_feature_extractor,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
else:
# 8. create ppdiffusers pipe
paddle_pipe = PPDiffusersStableDiffusionPipeline(
@@ -211,7 +213,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
safety_checker=None,
feature_extractor=None,
scheduler=pp_scheduler,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path:
_internal_dict = dict(paddle_pipe._internal_dict)
if _internal_dict["_ppdiffusers_version"] == "0.0.0":
@@ -223,8 +226,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -235,7 +237,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers(
"--output_path",
type=str,
default="stable-diffusion-v1-5-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py
index c5c28bfce9e02..204766187c39c 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py
@@ -18,8 +18,11 @@
import paddle
import torch
from diffusers import UnCLIPPipeline as DiffusersUnCLIPPipeline
-from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModelWithProjection,
- CLIPTokenizer)
+from paddlenlp.transformers import (
+ CLIPTextConfig,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+)
from ppdiffusers import PriorTransformer
from ppdiffusers import UnCLIPPipeline as PPDiffusersUnCLIPPipeline
@@ -43,10 +46,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32", prefix=""):
return new_vae_or_unet
-def convert_hf_clip_to_ppnlp_clip(clip,
- dtype="float32",
- is_text_encoder=True,
- need_prefix=False):
+def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False):
new_model_state = {}
transformers2ppnlp = {
".encoder.": ".transformer.",
@@ -65,9 +65,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -81,7 +79,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name and need_prefix:
name = "clip." + name
@@ -119,8 +117,7 @@ def convert_hf_clip_to_ppnlp_clip(clip,
"vision_heads": clip.config.num_attention_heads,
"vision_embed_dim": clip.config.hidden_size,
"vision_patch_size": clip.config.patch_size,
- "vision_mlp_ratio":
- clip.config.intermediate_size // clip.config.hidden_size,
+ "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size,
"vision_hidden_act": clip.config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
@@ -144,20 +141,17 @@ def check_keys(model, state_dict):
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!")
-def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path,
- output_path=None):
+def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
- diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True)
prior_state_dict = convert_to_ppdiffusers(diffusers_pipe.prior)
decoder_state_dict = convert_to_ppdiffusers(diffusers_pipe.decoder)
text_proj_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_proj)
- super_res_first_state_dict = convert_to_ppdiffusers(
- diffusers_pipe.super_res_first)
- super_res_last_state_dict = convert_to_ppdiffusers(
- diffusers_pipe.super_res_last)
+ super_res_first_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_first)
+ super_res_last_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_last)
text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False)
+ diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False
+ )
pp_prior = PriorTransformer.from_config(diffusers_pipe.prior.config)
pp_prior.set_dict(prior_state_dict)
@@ -167,32 +161,25 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path,
pp_decoder.set_dict(decoder_state_dict)
check_keys(pp_decoder, decoder_state_dict)
- pp_text_proj = UnCLIPTextProjModel.from_config(
- diffusers_pipe.text_proj.config)
+ pp_text_proj = UnCLIPTextProjModel.from_config(diffusers_pipe.text_proj.config)
pp_text_proj.set_dict(text_proj_state_dict)
check_keys(pp_text_proj, text_proj_state_dict)
- pp_super_res_first = UNet2DModel.from_config(
- diffusers_pipe.super_res_first.config)
+ pp_super_res_first = UNet2DModel.from_config(diffusers_pipe.super_res_first.config)
pp_super_res_first.set_dict(super_res_first_state_dict)
check_keys(pp_super_res_first, super_res_first_state_dict)
- pp_super_res_last = UNet2DModel.from_config(
- diffusers_pipe.super_res_last.config)
+ pp_super_res_last = UNet2DModel.from_config(diffusers_pipe.super_res_last.config)
pp_super_res_last.set_dict(super_res_last_state_dict)
check_keys(pp_super_res_last, super_res_last_state_dict)
- pp_text_encoder = CLIPTextModelWithProjection(
- CLIPTextConfig.from_dict(text_config))
+ pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
check_keys(pp_text_encoder, text_encoder_state_dict)
- pp_prior_scheduler = UnCLIPScheduler.from_config(
- diffusers_pipe.prior_scheduler.config)
- pp_decoder_scheduler = UnCLIPScheduler.from_config(
- diffusers_pipe.decoder_scheduler.config)
- pp_super_res_scheduler = UnCLIPScheduler.from_config(
- diffusers_pipe.super_res_scheduler.config)
+ pp_prior_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.prior_scheduler.config)
+ pp_decoder_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.decoder_scheduler.config)
+ pp_super_res_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.super_res_scheduler.config)
with tempfile.TemporaryDirectory() as tmpdirname:
# 5. feature_extractor
@@ -209,15 +196,15 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path,
super_res_last=pp_super_res_last,
prior_scheduler=pp_prior_scheduler,
decoder_scheduler=pp_decoder_scheduler,
- super_res_scheduler=pp_super_res_scheduler, )
+ super_res_scheduler=pp_super_res_scheduler,
+ )
# 6. save_pretrained
paddle_pipe.save_pretrained(output_path)
return paddle_pipe
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -228,7 +215,7 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path,
"--output_path",
type=str,
default="./karlo-v1-alpha",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
- ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path)
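
The UnCLIP converter above repeats one pattern per sub-model: rebuild the Paddle module from the diffusers module's config, then load the converted weights and validate them. A generic sketch of that pattern; rebuild_in_paddle is an illustrative helper, not part of the scripts:

def rebuild_in_paddle(pp_cls, diffusers_module, converted_state_dict):
    # pp_cls is any ppdiffusers class exposing from_config/set_dict
    # (PriorTransformer, UNet2DModel, AutoencoderKL, ...).
    pp_module = pp_cls.from_config(diffusers_module.config)  # reuse the architecture hyper-parameters
    pp_module.set_dict(converted_state_dict)                 # load the torch->paddle converted weights
    return pp_module


# e.g. pp_prior = rebuild_in_paddle(PriorTransformer, diffusers_pipe.prior, prior_state_dict)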
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py
index eb8c950cc052e..d5c0fad1746bf 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py
@@ -23,8 +23,7 @@
from ppdiffusers import Transformer2DModel
from ppdiffusers import VQDiffusionPipeline as PPDiffusersVQDiffusionPipeline
from ppdiffusers import VQDiffusionScheduler, VQModel
-from ppdiffusers.pipelines.vq_diffusion import \
- LearnedClassifierFreeSamplingEmbeddings
+from ppdiffusers.pipelines.vq_diffusion import LearnedClassifierFreeSamplingEmbeddings
paddle.set_device("cpu")
@@ -62,9 +61,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
# step1: ignore position_ids
@@ -78,7 +75,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
if "vision_model" in name:
name = "clip." + name
@@ -103,20 +100,17 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True):
"vision_heads": clip.config.vision_config.num_attention_heads,
"vision_embed_dim": clip.config.vision_config.hidden_size,
"vision_patch_size": clip.config.vision_config.patch_size,
- "vision_mlp_ratio": clip.config.vision_config.intermediate_size //
- clip.config.vision_config.hidden_size,
+ "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size,
"vision_hidden_act": clip.config.vision_config.hidden_act,
"projection_dim": clip.config.projection_dim,
}
return new_model_state, new_config
-def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path,
- output_path=None):
+def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None):
# 0. load diffusers pipe and convert to ppdiffusers weights format
- diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained(
- pretrained_model_name_or_path, use_auth_token=True)
+ diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True)
# 1. vqvae
vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae)
@@ -124,35 +118,33 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path,
transformer_state_dict = convert_to_ppdiffusers(diffusers_pipe.transformer)
# 3. learned_classifier_free_sampling_embeddings
learned_classifier_free_sampling_embeddings_state_dict = convert_to_ppdiffusers(
- diffusers_pipe.learned_classifier_free_sampling_embeddings)
+ diffusers_pipe.learned_classifier_free_sampling_embeddings
+ )
# 4.text_encoder
text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(
- diffusers_pipe.text_encoder, is_text_encoder=True)
+ diffusers_pipe.text_encoder, is_text_encoder=True
+ )
# 1. vqvae
pp_vqvae = VQModel.from_config(diffusers_pipe.vqvae.config)
pp_vqvae.set_dict(vqvae_state_dict)
# 2. transformer
- pp_transformer = Transformer2DModel.from_config(
- diffusers_pipe.transformer.config)
+ pp_transformer = Transformer2DModel.from_config(diffusers_pipe.transformer.config)
pp_transformer.set_dict(transformer_state_dict)
# 3. pp_learned_classifier_free_sampling_embeddings
- pp_learned_classifier_free_sampling_embeddings = (
- LearnedClassifierFreeSamplingEmbeddings.from_config(
- diffusers_pipe.learned_classifier_free_sampling_embeddings.config))
- pp_learned_classifier_free_sampling_embeddings.set_dict(
- learned_classifier_free_sampling_embeddings_state_dict)
+ pp_learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings.from_config(
+ diffusers_pipe.learned_classifier_free_sampling_embeddings.config
+ )
+ pp_learned_classifier_free_sampling_embeddings.set_dict(learned_classifier_free_sampling_embeddings_state_dict)
# 4. text_encoder
- pp_text_encoder = CLIPTextModel(
- CLIPTextConfig.from_dict(text_encoder_config))
+ pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config))
pp_text_encoder.set_dict(text_encoder_state_dict)
# 5. scheduler
- pp_scheduler = VQDiffusionScheduler.from_config(
- diffusers_pipe.scheduler.config)
+ pp_scheduler = VQDiffusionScheduler.from_config(diffusers_pipe.scheduler.config)
with tempfile.TemporaryDirectory() as tmpdirname:
# 6. tokenizer
@@ -166,7 +158,8 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path,
tokenizer=pp_tokenizer,
transformer=pp_transformer,
learned_classifier_free_sampling_embeddings=pp_learned_classifier_free_sampling_embeddings,
- scheduler=pp_scheduler, )
+ scheduler=pp_scheduler,
+ )
# 8. save_pretrained
paddle_pipe.save_pretrained(output_path)
@@ -174,8 +167,7 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path,
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Pytorch model weights to Paddle model weights.")
+ parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -186,7 +178,9 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path,
"--output_path",
type=str,
default="microsoft/vq-diffusion-ithq-ppdiffusers",
- help="The model output path.", )
+ help="The model output path.",
+ )
args = parser.parse_args()
ppdiffusers_pipe = convert_diffusers_vq_diffusion_to_ppdiffusers(
- args.pretrained_model_name_or_path, args.output_path)
+ args.pretrained_model_name_or_path, args.output_path
+ )
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py
index 41b5460d10922..b57a9ef31149d 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py
@@ -30,10 +30,17 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from ppdiffusers import (
- AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler,
- StableDiffusionPipeline, UNet2DConditionModel)
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
paddle.set_device("cpu")
MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30
@@ -116,8 +123,7 @@ def get_data_iostream(file: str, file_name="data.pkl"):
FILENAME = f"archive/{file_name}".encode("latin")
padding_size_plus_fbxx = 4 + 14
data_iostream = []
- offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(
- FILENAME) + padding_size_plus_fbxx
+ offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx
with open(file, "rb") as r:
r.seek(offset)
for bytes_data in io.BytesIO(r.read()):
@@ -130,8 +136,7 @@ def get_data_iostream(file: str, file_name="data.pkl"):
return out, offset + len(out)
-def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad,
- backward_hooks):
+def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks):
if isinstance(storage, TensorMeta):
storage.size = size
return storage
@@ -169,8 +174,7 @@ def persistent_load_stage1(saved_id):
data_iostream, pre_offset = get_data_iostream(path, file_name="data.pkl")
# 1. read the structure of storage
- unpickler_stage1 = UnpicklerWrapperStage(
- io.BytesIO(data_iostream), **pickle_load_args)
+ unpickler_stage1 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args)
unpickler_stage1.persistent_load = persistent_load_stage1
result_stage1 = unpickler_stage1.load()
@@ -202,17 +206,15 @@ def extract_maybe_dict(result):
# `MZ_ZIP_LOCAL_DIR_HEADER_SIZE` is from: https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/inline_container.cc#L186
# `16` is the fixed characters size from binary file.
# `filename_with_fb` is the length of dynamic data key name
- file_handler.seek(
- MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1)
+ file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1)
- padding_offset = np.frombuffer(
- file_handler.read(2)[:1], dtype=np.uint8)[0]
+ padding_offset = np.frombuffer(file_handler.read(2)[:1], dtype=np.uint8)[0]
file_handler.read(padding_offset)
# save the tensor info in result to re-use memory
stage1_key_to_tensor[key] = np.frombuffer(
- file_handler.read(tensor_meta.nbytes),
- dtype=tensor_meta.dtype).reshape(tensor_meta.size)
+ file_handler.read(tensor_meta.nbytes), dtype=tensor_meta.dtype
+ ).reshape(tensor_meta.size)
def persistent_load_stage2(saved_id):
assert isinstance(saved_id, tuple)
@@ -220,8 +222,7 @@ def persistent_load_stage2(saved_id):
return stage1_key_to_tensor[key]
# 4. read the structure of storage
- unpickler_stage2 = UnpicklerWrapperStage(
- io.BytesIO(data_iostream), **pickle_load_args)
+ unpickler_stage2 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args)
unpickler_stage2.persistent_load = persistent_load_stage2
result_stage2 = unpickler_stage2.load()
@@ -253,8 +254,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("emb_layers.1", "time_emb_proj")
new_item = new_item.replace("skip_connection", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -270,8 +270,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -313,8 +312,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -322,12 +320,13 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming
to them. It splits attention layers, and takes into account additional replacements
@@ -335,9 +334,7 @@ def assign_to_checkpoint(
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -345,13 +342,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = np.split(old_tensor, 3, axis=1)
@@ -363,8 +358,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -374,8 +368,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -403,34 +396,28 @@ def create_unet_diffusers_config(original_config, image_size: int):
unet_params = original_config.model.params.unet_config.params
vae_params = original_config.model.params.first_stage_config.params.ddconfig
- block_out_channels = [
- unet_params.model_channels * mult for mult in unet_params.channel_mult
- ]
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
down_block_types = []
resolution = 1
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnDownBlock2D"
- if resolution in unet_params.attention_resolutions else
- "DownBlock2D")
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
down_block_types.append(block_type)
if i != len(block_out_channels) - 1:
resolution *= 2
up_block_types = []
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnUpBlock2D"
- if resolution in unet_params.attention_resolutions else
- "UpBlock2D")
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
up_block_types.append(block_type)
resolution //= 2
- vae_scale_factor = 2**(len(vae_params.ch_mult) - 1)
+ vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
head_dim = unet_params.num_heads if "num_heads" in unet_params else None
- use_linear_projection = (unet_params.use_linear_in_transformer
- if "use_linear_in_transformer" in unet_params else
- False)
+ use_linear_projection = (
+ unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+ )
if use_linear_projection:
# stable diffusion 2-base-512 and 2-768
if head_dim is None:
@@ -446,7 +433,8 @@ def create_unet_diffusers_config(original_config, image_size: int):
layers_per_block=unet_params.num_res_blocks,
cross_attention_dim=unet_params.context_dim,
attention_head_dim=head_dim,
- use_linear_projection=use_linear_projection, )
+ use_linear_projection=use_linear_projection,
+ )
return config
@@ -470,7 +458,8 @@ def create_vae_diffusers_config(original_config, image_size: int):
up_block_types=tuple(up_block_types),
block_out_channels=tuple(block_out_channels),
latent_channels=vae_params.z_channels,
- layers_per_block=vae_params.num_res_blocks, )
+ layers_per_block=vae_params.num_res_blocks,
+ )
return config
@@ -479,14 +468,12 @@ def create_diffusers_schedular(original_config):
num_train_timesteps=original_config.model.params.timesteps,
beta_start=original_config.model.params.linear_start,
beta_end=original_config.model.params.linear_end,
- beta_schedule="scaled_linear", )
+ beta_schedule="scaled_linear",
+ )
return schedular
-def convert_ldm_unet_checkpoint(checkpoint,
- config,
- path=None,
- extract_ema=False):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -507,8 +494,7 @@ def convert_ldm_unet_checkpoint(checkpoint,
for key in keys:
if key.startswith("model.diffusion_model"):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
print(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
@@ -521,17 +507,12 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint = {}
- new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
- "time_embed.0.weight"]
- new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
- "time_embed.0.bias"]
- new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
- "time_embed.2.weight"]
- new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
- "time_embed.2.bias"]
-
- new_checkpoint["conv_in.weight"] = unet_state_dict[
- "input_blocks.0.0.weight"]
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
@@ -540,35 +521,23 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
# Retrieves the keys for the input blocks only
- num_input_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "input_blocks" in layer
- })
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
input_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
for layer_id in range(num_input_blocks)
}
# Retrieves the keys for the middle blocks only
- num_middle_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "middle_block" in layer
- })
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
middle_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
for layer_id in range(num_middle_blocks)
}
# Retrieves the keys for the output blocks only
- num_output_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "output_blocks" in layer
- })
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
output_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
for layer_id in range(num_output_blocks)
}
@@ -577,21 +546,17 @@ def convert_ldm_unet_checkpoint(checkpoint,
layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
resnets = [
- key for key in input_blocks[i]
- if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in
- key
- ]
- attentions = [
- key for key in input_blocks[i] if f"input_blocks.{i}.1" in key
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
]
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.weight")
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.bias")
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.weight"
+ )
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.bias"
+ )
paths = renew_resnet_paths(resnets)
meta_path = {
@@ -603,7 +568,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if len(attentions):
paths = renew_attention_paths(attentions)
@@ -616,19 +582,18 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
resnet_0 = middle_blocks[0]
attentions = middle_blocks[1]
resnet_1 = middle_blocks[2]
resnet_0_paths = renew_resnet_paths(resnet_0)
- assign_to_checkpoint(
- resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
resnet_1_paths = renew_resnet_paths(resnet_1)
- assign_to_checkpoint(
- resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
attentions_paths = renew_attention_paths(attentions)
meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
@@ -637,14 +602,13 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
for i in range(num_output_blocks):
block_id = i // (config["layers_per_block"] + 1)
layer_in_block_id = i % (config["layers_per_block"] + 1)
- output_block_layers = [
- shave_segments(name, 2) for name in output_blocks[i]
- ]
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
output_block_list = {}
for layer in output_block_layers:
@@ -655,12 +619,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
output_block_list[layer_id] = [layer_name]
if len(output_block_list) > 1:
- resnets = [
- key for key in output_blocks[i] if f"output_blocks.{i}.0" in key
- ]
- attentions = [
- key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
- ]
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
resnet_0_paths = renew_resnet_paths(resnets)
paths = renew_resnet_paths(resnets)
@@ -674,31 +634,30 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if ["conv.weight", "conv.bias"] in output_block_list.values():
- index = list(output_block_list.values()).index(
- ["conv.weight", "conv.bias"])
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.weight"]
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.bias"]
+ index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.weight"
+ ]
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.bias"
+ ]
# Clear attentions as they have been attributed above.
if len(attentions) == 2:
attentions = []
if ["conv.bias", "conv.weight"] in output_block_list.values():
- index = list(output_block_list.values()).index(
- ["conv.bias", "conv.weight"])
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.weight"]
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.bias"]
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.weight"
+ ]
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.bias"
+ ]
# Clear attentions as they have been attributed above.
if len(attentions) == 2:
@@ -708,27 +667,28 @@ def convert_ldm_unet_checkpoint(checkpoint,
paths = renew_attention_paths(attentions)
meta_path = {
"old": f"output_blocks.{i}.1",
- "new":
- f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
}
assign_to_checkpoint(
paths,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
else:
- resnet_0_paths = renew_resnet_paths(
- output_block_layers, n_shave_prefix_segments=1)
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
for path in resnet_0_paths:
old_path = ".".join(["output_blocks", str(i), path["old"]])
- new_path = ".".join([
- "up_blocks",
- str(block_id),
- "resnets",
- str(layer_in_block_id),
- path["new"],
- ])
+ new_path = ".".join(
+ [
+ "up_blocks",
+ str(block_id),
+ "resnets",
+ str(layer_in_block_id),
+ path["new"],
+ ]
+ )
new_checkpoint[new_path] = unet_state_dict[old_path]
@@ -746,107 +706,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -854,58 +781,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -913,14 +832,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint,
- dtype="float32"):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -955,7 +873,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
clip = {}
for key in checkpoint.keys():
if key.startswith("cond_stage_model.transformer"):
- clip[key[len("cond_stage_model.transformer."):]] = checkpoint[key]
+ clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
new_model_state = {}
transformers2ppnlp = {
@@ -975,9 +893,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.items():
# step1: ignore position_ids
if any(i in name for i in ignore_value):
@@ -990,17 +906,14 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
new_model_state[name] = value.astype(dtype)
new_config = {
- "max_text_length":
- new_model_state["text_model.positional_embedding.weight"].shape[0],
- "vocab_size":
- new_model_state["text_model.token_embedding.weight"].shape[0],
- "text_embed_dim":
- new_model_state["text_model.token_embedding.weight"].shape[1],
+ "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0],
+ "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0],
+ "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1],
"text_heads": 12,
"text_layers": 12,
"text_hidden_act": "quick_gelu",
@@ -1019,7 +932,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--original_config_file",
default=None,
@@ -1045,13 +959,15 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
"Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
" or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
" higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
- ), )
+ ),
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
+ help="Path to the output model.",
+ )
args = parser.parse_args()
image_size = 512
@@ -1061,14 +977,14 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
if args.original_config_file is None:
get_path_from_url(
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/v1-inference.yaml",
- root_dir="./", )
+ root_dir="./",
+ )
args.original_config_file = "./v1-inference.yaml"
original_config = OmegaConf.load(args.original_config_file)
if args.num_in_channels is not None:
- original_config["model"]["params"]["unet_config"]["params"][
- "in_channels"] = args.num_in_channels
+ original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels
num_train_timesteps = original_config.model.params.timesteps
beta_start = original_config.model.params.linear_start
@@ -1081,7 +997,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
num_train_timesteps=num_train_timesteps,
steps_offset=1,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
# make sure scheduler works correctly with DDIM
scheduler.register_to_config(clip_sample=False)
@@ -1096,44 +1013,37 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
elif args.scheduler_type == "euler":
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
elif args.scheduler_type == "euler-ancestral":
- scheduler = EulerAncestralDiscreteScheduler.from_config(
- scheduler.config)
+ scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
elif args.scheduler_type == "dpm":
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
elif args.scheduler_type == "ddim":
scheduler = scheduler
else:
- raise ValueError(
- f"Scheduler of type {args.scheduler_type} doesn't exist!")
+ raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
# 1. Convert the UNet2DConditionModel model.
- diffusers_unet_config = create_unet_diffusers_config(
- original_config, image_size=image_size)
+ diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
diffusers_unet_checkpoint = convert_ldm_unet_checkpoint(
checkpoint,
diffusers_unet_config,
path=args.checkpoint_path,
- extract_ema=args.extract_ema, )
+ extract_ema=args.extract_ema,
+ )
unet = UNet2DConditionModel.from_config(diffusers_unet_config)
- ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- unet, diffusers_unet_checkpoint)
+ ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint)
check_keys(unet, ppdiffusers_unet_checkpoint)
unet.load_dict(ppdiffusers_unet_checkpoint)
# 2. Convert the VAE model.
- vae_config = create_vae_diffusers_config(
- original_config, image_size=image_size)
- diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint,
- vae_config)
+ vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+ diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL.from_config(vae_config)
- ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- vae, diffusers_vae_checkpoint)
+ ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint)
check_keys(vae, ppdiffusers_vae_checkpoint)
vae.load_dict(ppdiffusers_vae_checkpoint)
# 3. Convert the text_encoder model.
- text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(
- checkpoint, dtype="float32")
+ text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32")
text_model = CLIPTextModel(CLIPTextConfig.from_dict(text_config))
text_model.eval()
check_keys(text_model, text_model_state_dict)
@@ -1150,5 +1060,6 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"):
scheduler=scheduler,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
pipe.save_pretrained(args.dump_path)
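(End of this file's hunks. As a hedged smoke test, the pipeline saved by `pipe.save_pretrained(args.dump_path)` should be loadable back through ppdiffusers; the snippet below assumes `StableDiffusionPipeline.from_pretrained` mirrors the diffusers API, and the checkpoint path, dump path, and prompt are illustrative only.)

# Hypothetical smoke test -- not part of the patch above.
# Run the converter first, e.g. (paths illustrative):
#   python convert_orig_sd_ckpt_to_ppdiffusers.py \
#       --checkpoint_path sd-v1-4.ckpt --dump_path ./sd14-ppdiffusers
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("./sd14-ppdiffusers")  # your --dump_path
image = pipe("a photo of an astronaut riding a horse").images[0]      # assumes diffusers-style output object
image.save("astronaut.png")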
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py
index 96786f7bd3255..55fd755445702 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py
@@ -27,10 +27,15 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from transformers import CLIPTextModel as HFCLIPTextModel
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- EulerAncestralDiscreteScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
paddle.set_device("cpu")
@@ -60,8 +65,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("emb_layers.1", "time_emb_proj")
new_item = new_item.replace("skip_connection", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -77,8 +81,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
new_item = old_item
new_item = new_item.replace("nin_shortcut", "conv_shortcut")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -120,8 +123,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
- new_item = shave_segments(
- new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
mapping.append({"old": old_item, "new": new_item})
@@ -129,21 +131,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
def assign_to_checkpoint(
- paths,
- checkpoint,
- old_checkpoint,
- attention_paths_to_split=None,
- additional_replacements=None,
- config=None, ):
+ paths,
+ checkpoint,
+ old_checkpoint,
+ attention_paths_to_split=None,
+ additional_replacements=None,
+ config=None,
+):
"""
This does the final conversion step: take locally converted weights and apply a global renaming
to them. It splits attention layers, and takes into account additional replacements
that may arise.
Assigns the weights to the new checkpoint.
"""
- assert isinstance(
- paths,
- list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
# Splits the attention layers into three variables.
if attention_paths_to_split is not None:
@@ -151,13 +152,11 @@ def assign_to_checkpoint(
old_tensor = old_checkpoint[path]
channels = old_tensor.shape[0] // 3
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (
- -1)
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
- old_tensor = old_tensor.reshape((num_heads, 3 * channels //
- num_heads) + old_tensor.shape[1:])
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = old_tensor.split(channels // num_heads, dim=1)
checkpoint[path_map["query"]] = query.reshape(target_shape)
@@ -168,8 +167,7 @@ def assign_to_checkpoint(
new_path = path["new"]
# These have already been assigned
- if (attention_paths_to_split is not None and
- new_path in attention_paths_to_split):
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
continue
# Global renaming happens here
@@ -179,8 +177,7 @@ def assign_to_checkpoint(
if additional_replacements is not None:
for replacement in additional_replacements:
- new_path = new_path.replace(replacement["old"],
- replacement["new"])
+ new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
if "proj_attn.weight" in new_path:
@@ -207,25 +204,19 @@ def create_unet_diffusers_config(original_config):
"""
unet_params = original_config.model.params.unet_config.params
- block_out_channels = [
- unet_params.model_channels * mult for mult in unet_params.channel_mult
- ]
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
down_block_types = []
resolution = 1
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnDownBlock2D"
- if resolution in unet_params.attention_resolutions else
- "DownBlock2D")
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
down_block_types.append(block_type)
if i != len(block_out_channels) - 1:
resolution *= 2
up_block_types = []
for i in range(len(block_out_channels)):
- block_type = ("CrossAttnUpBlock2D"
- if resolution in unet_params.attention_resolutions else
- "UpBlock2D")
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
up_block_types.append(block_type)
resolution //= 2
@@ -242,7 +233,8 @@ def create_unet_diffusers_config(original_config):
block_out_channels=tuple(block_out_channels),
layers_per_block=unet_params.num_res_blocks,
cross_attention_dim=unet_params.context_dim,
- attention_head_dim=attention_head_dim, )
+ attention_head_dim=attention_head_dim,
+ )
return config
@@ -266,14 +258,12 @@ def create_vae_diffusers_config(original_config):
up_block_types=tuple(up_block_types),
block_out_channels=tuple(block_out_channels),
latent_channels=vae_params.z_channels,
- layers_per_block=vae_params.num_res_blocks, )
+ layers_per_block=vae_params.num_res_blocks,
+ )
return config
-def convert_ldm_unet_checkpoint(checkpoint,
- config,
- path=None,
- extract_ema=False):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
"""
Takes a state dict and a config, and returns a converted checkpoint.
"""
@@ -294,8 +284,7 @@ def convert_ldm_unet_checkpoint(checkpoint,
for key in keys:
if key.startswith("model.diffusion_model"):
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(
- flat_ema_key)
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
print(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
@@ -308,17 +297,12 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint = {}
- new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[
- "time_embed.0.weight"]
- new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[
- "time_embed.0.bias"]
- new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[
- "time_embed.2.weight"]
- new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[
- "time_embed.2.bias"]
-
- new_checkpoint["conv_in.weight"] = unet_state_dict[
- "input_blocks.0.0.weight"]
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
@@ -327,35 +311,23 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
# Retrieves the keys for the input blocks only
- num_input_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "input_blocks" in layer
- })
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
input_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
for layer_id in range(num_input_blocks)
}
# Retrieves the keys for the middle blocks only
- num_middle_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "middle_block" in layer
- })
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
middle_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
for layer_id in range(num_middle_blocks)
}
# Retrieves the keys for the output blocks only
- num_output_blocks = len({
- ".".join(layer.split(".")[:2])
- for layer in unet_state_dict if "output_blocks" in layer
- })
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
output_blocks = {
- layer_id:
- [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
for layer_id in range(num_output_blocks)
}
@@ -364,21 +336,17 @@ def convert_ldm_unet_checkpoint(checkpoint,
layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
resnets = [
- key for key in input_blocks[i]
- if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in
- key
- ]
- attentions = [
- key for key in input_blocks[i] if f"input_blocks.{i}.1" in key
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
]
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.weight")
- new_checkpoint[
- f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
- f"input_blocks.{i}.0.op.bias")
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.weight"
+ )
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+ f"input_blocks.{i}.0.op.bias"
+ )
paths = renew_resnet_paths(resnets)
meta_path = {
@@ -390,7 +358,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if len(attentions):
paths = renew_attention_paths(attentions)
@@ -403,19 +372,18 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
resnet_0 = middle_blocks[0]
attentions = middle_blocks[1]
resnet_1 = middle_blocks[2]
resnet_0_paths = renew_resnet_paths(resnet_0)
- assign_to_checkpoint(
- resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
resnet_1_paths = renew_resnet_paths(resnet_1)
- assign_to_checkpoint(
- resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
attentions_paths = renew_attention_paths(attentions)
meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
@@ -424,14 +392,13 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
for i in range(num_output_blocks):
block_id = i // (config["layers_per_block"] + 1)
layer_in_block_id = i % (config["layers_per_block"] + 1)
- output_block_layers = [
- shave_segments(name, 2) for name in output_blocks[i]
- ]
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
output_block_list = {}
for layer in output_block_layers:
@@ -442,12 +409,8 @@ def convert_ldm_unet_checkpoint(checkpoint,
output_block_list[layer_id] = [layer_name]
if len(output_block_list) > 1:
- resnets = [
- key for key in output_blocks[i] if f"output_blocks.{i}.0" in key
- ]
- attentions = [
- key for key in output_blocks[i] if f"output_blocks.{i}.1" in key
- ]
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
resnet_0_paths = renew_resnet_paths(resnets)
paths = renew_resnet_paths(resnets)
@@ -461,17 +424,17 @@ def convert_ldm_unet_checkpoint(checkpoint,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
if ["conv.weight", "conv.bias"] in output_block_list.values():
- index = list(output_block_list.values()).index(
- ["conv.weight", "conv.bias"])
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.weight"]
- new_checkpoint[
- f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
- f"output_blocks.{i}.{index}.conv.bias"]
+ index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.weight"
+ ]
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+ f"output_blocks.{i}.{index}.conv.bias"
+ ]
# Clear attentions as they have been attributed above.
if len(attentions) == 2:
@@ -481,27 +444,28 @@ def convert_ldm_unet_checkpoint(checkpoint,
paths = renew_attention_paths(attentions)
meta_path = {
"old": f"output_blocks.{i}.1",
- "new":
- f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
}
assign_to_checkpoint(
paths,
new_checkpoint,
unet_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
else:
- resnet_0_paths = renew_resnet_paths(
- output_block_layers, n_shave_prefix_segments=1)
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
for path in resnet_0_paths:
old_path = ".".join(["output_blocks", str(i), path["old"]])
- new_path = ".".join([
- "up_blocks",
- str(block_id),
- "resnets",
- str(layer_in_block_id),
- path["new"],
- ])
+ new_path = ".".join(
+ [
+ "up_blocks",
+ str(block_id),
+ "resnets",
+ str(layer_in_block_id),
+ path["new"],
+ ]
+ )
new_checkpoint[new_path] = unet_state_dict[old_path]
@@ -519,107 +483,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint = {}
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
- "encoder.conv_in.weight"]
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
- "encoder.conv_in.bias"]
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
- "encoder.conv_out.weight"]
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
- "encoder.conv_out.bias"]
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
- "encoder.norm_out.weight"]
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
- "encoder.norm_out.bias"]
-
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
- "decoder.conv_in.weight"]
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
- "decoder.conv_in.bias"]
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
- "decoder.conv_out.weight"]
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
- "decoder.conv_out.bias"]
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
- "decoder.norm_out.weight"]
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
- "decoder.norm_out.bias"]
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
- "post_quant_conv.weight"]
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
- "post_quant_conv.bias"]
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
# Retrieves the keys for the encoder down blocks only
- num_down_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "encoder.down" in layer
- })
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
down_blocks = {
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
- for layer_id in range(num_down_blocks)
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
}
# Retrieves the keys for the decoder up blocks only
- num_up_blocks = len({
- ".".join(layer.split(".")[:3])
- for layer in vae_state_dict if "decoder.up" in layer
- })
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
up_blocks = {
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
- for layer_id in range(num_up_blocks)
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
}
for i in range(num_down_blocks):
- resnets = [
- key for key in down_blocks[i]
- if f"down.{i}" in key and f"down.{i}.downsample" not in key
- ]
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.weight")
- new_checkpoint[
- f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
- f"encoder.down.{i}.downsample.conv.bias")
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.weight"
+ )
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+ f"encoder.down.{i}.downsample.conv.bias"
+ )
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"down.{i}.block",
- "new": f"down_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"encoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "encoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -627,58 +558,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
for i in range(num_up_blocks):
block_id = num_up_blocks - 1 - i
resnets = [
- key for key in up_blocks[block_id]
- if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
]
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.weight"]
- new_checkpoint[
- f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
- f"decoder.up.{block_id}.upsample.conv.bias"]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.weight"
+ ]
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+ f"decoder.up.{block_id}.upsample.conv.bias"
+ ]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"up.{block_id}.block",
- "new": f"up_blocks.{i}.resnets"
- }
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
num_mid_res_blocks = 2
for i in range(1, num_mid_res_blocks + 1):
- resnets = [
- key for key in mid_resnets if f"decoder.mid.block_{i}" in key
- ]
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
paths = renew_vae_resnet_paths(resnets)
- meta_path = {
- "old": f"mid.block_{i}",
- "new": f"mid_block.resnets.{i - 1}"
- }
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
assign_to_checkpoint(
paths,
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
- mid_attentions = [
- key for key in vae_state_dict if "decoder.mid.attn" in key
- ]
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
paths = renew_vae_attention_paths(mid_attentions)
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
assign_to_checkpoint(
@@ -686,14 +609,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
new_checkpoint,
vae_state_dict,
additional_replacements=[meta_path],
- config=config, )
+ config=config,
+ )
conv_attn_to_linear(new_checkpoint)
return new_checkpoint
-def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet,
- diffusers_vae_unet_checkpoint,
- dtype="float32"):
+def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"):
need_transpose = []
for k, v in vae_or_unet.named_sublayers(include_self=True):
if isinstance(v, paddle.nn.Linear):
@@ -745,9 +667,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
".vision_model.": ".",
}
ignore_value = ["position_ids"]
- donot_transpose = [
- "embeddings", "norm", "concept_embeds", "special_care_embeds"
- ]
+ donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"]
for name, value in clip.state_dict().items():
if f".{layer_need_to_ignore}." in name:
continue
@@ -762,7 +682,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
name = name.replace(hf_name, ppnlp_name)
# step4: 0d tensor -> 1d tensor
if name == "logit_scale":
- value = value.reshape((1, ))
+ value = value.reshape((1,))
# step5: safety_checker need prefix "clip."
new_model_state[name] = value.cpu().numpy().astype(dtype)
@@ -788,7 +708,8 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
default=None,
type=str,
required=True,
- help="Path to the checkpoint to convert.", )
+ help="Path to the checkpoint to convert.",
+ )
parser.add_argument(
"--original_config_file",
default="v2-inference.yaml",
@@ -809,13 +730,15 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
"Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
" or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
" higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
- ), )
+ ),
+ )
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
- help="Path to the output model.", )
+ help="Path to the output model.",
+ )
args = parser.parse_args()
@@ -836,26 +759,23 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
checkpoint,
diffusers_unet_config,
path=args.checkpoint_path,
- extract_ema=args.extract_ema, )
+ extract_ema=args.extract_ema,
+ )
unet = UNet2DConditionModel(**diffusers_unet_config)
- ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- unet, diffusers_unet_checkpoint)
+ ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint)
check_keys(unet, ppdiffusers_unet_checkpoint)
unet.load_dict(ppdiffusers_unet_checkpoint)
# 2. Convert the VAE model.
vae_config = create_vae_diffusers_config(original_config)
- diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint,
- vae_config)
+ diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL(**vae_config)
- ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(
- vae, diffusers_vae_checkpoint)
+ ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint)
check_keys(vae, ppdiffusers_vae_checkpoint)
vae.load_dict(ppdiffusers_vae_checkpoint)
# 3. Convert the text model.
- text_model_type = original_config.model.params.cond_stage_config.target.split(
- ".")[-1]
+ text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
layer = original_config.model.params.cond_stage_config.params.layer
if layer == "last":
layer_idx = 0
@@ -867,19 +787,16 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
if text_model_type != "FrozenOpenCLIPEmbedder":
print("We only support FrozenOpenCLIPEmbedder as text_encoder!")
- clip = HFCLIPTextModel.from_pretrained(
- "laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
- ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip(
- clip, layer_idx)
+ clip = HFCLIPTextModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+ ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip(clip, layer_idx)
text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(clip_config))
text_encoder.load_dict(ppdiffusers_clip_checkpoint)
# 5. load tokenizer.
pp_tokenizer = CLIPTokenizer.from_pretrained(
- "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
- pad_token="!",
- model_max_length=77)
+ "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", pad_token="!", model_max_length=77
+ )
# 6. Convert scheduler.
num_train_timesteps = original_config.model.params.timesteps
@@ -894,17 +811,14 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
set_alpha_to_one=False,
steps_offset=1,
# Make sure the scheduler compatible with PNDM
- skip_prk_steps=True, )
+ skip_prk_steps=True,
+ )
elif args.scheduler_type == "lms":
- scheduler = LMSDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
elif args.scheduler_type == "euler-ancestral":
scheduler = EulerAncestralDiscreteScheduler(
- beta_start=beta_start,
- beta_end=beta_end,
- beta_schedule="scaled_linear")
+ beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+ )
elif args.scheduler_type == "ddim":
scheduler = DDIMScheduler(
beta_start=beta_start,
@@ -913,10 +827,10 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
# Make sure the scheduler compatible with DDIM
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
else:
- raise ValueError(
- f"Scheduler of type {args.scheduler_type} doesn't exist!")
+ raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
pipe = StableDiffusionPipeline(
vae=vae,
@@ -926,6 +840,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"):
scheduler=scheduler,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
pipe.save_pretrained(args.dump_path)
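Note (illustration, not part of the patch): the convert_diffusers_vae_unet_to_ppdiffusers helper reformatted above collects every paddle.nn.Linear sublayer into need_transpose because the two frameworks store dense weights in opposite layouts, so after the key renaming each Linear weight must also be transposed. A minimal sketch of that layout difference, assuming PyTorch's [out_features, in_features] convention versus Paddle's [in_features, out_features]; the array below is a toy stand-in, not a real checkpoint tensor.

import numpy as np
import paddle

# Toy torch/diffusers-style weight: [out_features, in_features].
torch_style_weight = np.random.randn(8, 4).astype("float32")

linear = paddle.nn.Linear(in_features=4, out_features=8)
assert tuple(linear.weight.shape) == (4, 8)  # Paddle stores [in_features, out_features]

# Transposing makes the layouts agree, which is what need_transpose records per key.
linear.weight.set_value(paddle.to_tensor(torch_style_weight.T))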
diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py
index 7caddb24c95d2..b7bed2a4b3b35 100644
--- a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py
+++ b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py
@@ -19,22 +19,20 @@
import paddle
-from ppdiffusers import (FastDeployStableDiffusionInpaintPipeline,
- FastDeployStableDiffusionMegaPipeline,
- StableDiffusionPipeline)
+from ppdiffusers import (
+ FastDeployStableDiffusionInpaintPipeline,
+ FastDeployStableDiffusionMegaPipeline,
+ StableDiffusionPipeline,
+)
from ppdiffusers.fastdeploy_utils import FastDeployRuntimeModel
-def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str,
- output_path: str,
- mode: bool=False):
- pipeline = StableDiffusionPipeline.from_pretrained(
- model_path, safety_checker=None, feature_extractor=None)
+def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, output_path: str, mode: bool = False):
+ pipeline = StableDiffusionPipeline.from_pretrained(model_path, safety_checker=None, feature_extractor=None)
output_path = Path(output_path)
# get arguments
- cross_attention_dim = (
- pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
unet_channels = pipeline.unet.config.in_channels # 4 or 9
vae_in_channels = pipeline.vae.config.in_channels # 3
vae_latent_channels = pipeline.vae.config.latent_channels # 4
@@ -42,14 +40,12 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str,
f"cross_attention_dim: {cross_attention_dim}\n",
f"unet_in_channels: {unet_channels}\n",
f"vae_encoder_in_channels: {vae_in_channels}\n",
- f"vae_decoder_latent_channels: {vae_latent_channels}", )
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
+ )
# 1. Convert text_encoder
text_encoder = paddle.jit.to_static(
pipeline.text_encoder,
- input_spec=[
- paddle.static.InputSpec(
- shape=[None, None], dtype="int64", name="input_ids")
- ], # input_ids
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
)
save_path = os.path.join(args.output_path, "text_encoder", "inference")
paddle.jit.save(text_encoder, save_path)
@@ -60,17 +56,15 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str,
unet = paddle.jit.to_static(
pipeline.unet,
input_spec=[
- paddle.static.InputSpec(
- shape=[None, unet_channels, None, None],
- dtype="float32",
- name="sample"), # sample
- paddle.static.InputSpec(
- shape=[1], dtype="int64", name="timestep"), # timestep
+ paddle.static.InputSpec(shape=[None, unet_channels, None, None], dtype="float32", name="sample"), # sample
+ paddle.static.InputSpec(shape=[1], dtype="int64", name="timestep"), # timestep
paddle.static.InputSpec(
shape=[None, None, cross_attention_dim],
dtype="float32",
- name="encoder_hidden_states", ), # encoder_hidden_states
- ], )
+ name="encoder_hidden_states",
+ ), # encoder_hidden_states
+ ],
+ )
save_path = os.path.join(args.output_path, "unet", "inference")
paddle.jit.save(unet, save_path)
print(f"Save unet model in {save_path} successfully.")
@@ -87,8 +81,7 @@ def forward_vae_encoder_sample(self, z):
if mode:
vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
else:
- vae_encoder.forward = MethodType(forward_vae_encoder_sample,
- vae_encoder)
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
vae_encoder = paddle.jit.to_static(
vae_encoder,
@@ -98,7 +91,8 @@ def forward_vae_encoder_sample(self, z):
dtype="float32",
name="sample", # N, C, H, W
), # latent
- ], )
+ ],
+ )
# Save vae_encoder in static graph model.
save_path = os.path.join(args.output_path, "vae_encoder", "inference")
paddle.jit.save(vae_encoder, save_path)
@@ -117,8 +111,10 @@ def forward_vae_decoder(self, z):
paddle.static.InputSpec(
shape=[None, vae_latent_channels, None, None],
dtype="float32",
- name="latent_sample", ), # latent_sample
- ], )
+ name="latent_sample",
+ ), # latent_sample
+ ],
+ )
# Save vae_decoder in static graph model.
save_path = os.path.join(args.output_path, "vae_decoder", "inference")
paddle.jit.save(vae_decoder, save_path)
@@ -131,18 +127,16 @@ def forward_vae_decoder(self, z):
fd_pipe_cls = FastDeployStableDiffusionMegaPipeline
fastdeploy_pipeline = fd_pipe_cls(
- vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path /
- "vae_encoder"),
- vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path /
- "vae_decoder"),
- text_encoder=FastDeployRuntimeModel.from_pretrained(output_path /
- "text_encoder"),
+ vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"),
+ vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"),
+ text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"),
unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"),
tokenizer=pipeline.tokenizer,
scheduler=pipeline.scheduler,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
fastdeploy_pipeline.save_pretrained(output_path)
print("FastDeploy pipeline saved to", output_path)
@@ -174,17 +168,13 @@ def forward_vae_decoder(self, z):
required=True,
help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
)
- parser.add_argument(
- "--output_path",
- type=str,
- required=True,
- help="Path to the output model.")
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
parser.add_argument(
"--mode",
action="store_true",
default=False,
- help="Export the vae encoder in mode or sample", )
+ help="Export the vae encoder in mode or sample",
+ )
args = parser.parse_args()
- convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(
- args.model_path, args.output_path, args.mode)
+ convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(args.model_path, args.output_path, args.mode)
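Aside (illustration only): the export above relies on paddle.jit.to_static plus InputSpec to trace each sub-model into a static graph, and paddle.jit.save to write the inference files that FastDeployRuntimeModel.from_pretrained later loads. A self-contained sketch with a toy layer standing in for the UNet or text encoder; the names and shapes here are assumptions, not the script's real ones.

import paddle


class TinyLayer(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 4)

    def forward(self, x):
        return self.linear(x)


# Trace against a declared input signature; the batch dimension is left dynamic as None.
static_layer = paddle.jit.to_static(
    TinyLayer(),
    input_spec=[paddle.static.InputSpec(shape=[None, 16], dtype="float32", name="x")],
)
# Writes inference.pdmodel / inference.pdiparams under ./tiny/, mirroring the script's save_path pattern.
paddle.jit.save(static_layer, "./tiny/inference")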
diff --git a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py
index e8def2f35e60a..6a27ffff944e8 100644
--- a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py
+++ b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py
@@ -53,9 +53,9 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64):
all_text_embeds = []
all_image_embeds = []
for text, image_path in tqdm(
- zip(
- batchify(texts, batch_size), batchify(images_path, batch_size)),
- total=math.ceil(len(texts) / batch_size), ):
+ zip(batchify(texts, batch_size), batchify(images_path, batch_size)),
+ total=math.ceil(len(texts) / batch_size),
+ ):
assert len(text) == len(image_path)
batch_inputs = processor(
text=text,
@@ -63,56 +63,52 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64):
return_tensors="pd",
max_length=processor.tokenizer.model_max_length,
padding="max_length",
- truncation=True, )
- text_embeds = model.get_text_features(
- input_ids=batch_inputs["input_ids"])
- image_embeds = model.get_image_features(
- pixel_values=batch_inputs["pixel_values"])
+ truncation=True,
+ )
+ text_embeds = model.get_text_features(input_ids=batch_inputs["input_ids"])
+ image_embeds = model.get_image_features(pixel_values=batch_inputs["pixel_values"])
all_text_embeds.append(text_embeds)
all_image_embeds.append(image_embeds)
all_text_embeds = paddle.concat(all_text_embeds)
all_image_embeds = paddle.concat(all_image_embeds)
- all_text_embeds = all_text_embeds / all_text_embeds.norm(
- axis=-1, keepdim=True)
- all_image_embeds = all_image_embeds / all_image_embeds.norm(
- axis=-1, keepdim=True)
- clip_score = (all_image_embeds *
- all_text_embeds).sum(-1) * model.logit_scale.exp()
+ all_text_embeds = all_text_embeds / all_text_embeds.norm(axis=-1, keepdim=True)
+ all_image_embeds = all_image_embeds / all_image_embeds.norm(axis=-1, keepdim=True)
+ clip_score = (all_image_embeds * all_text_embeds).sum(-1) * model.logit_scale.exp()
return clip_score
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument(
- "--image_path", default=None, nargs="+", type=str, help="image_path")
+ parser.add_argument("--image_path", default=None, nargs="+", type=str, help="image_path")
parser.add_argument(
"--output_file",
default="statistic_results.json",
type=str,
- help="output file name", )
+ help="output file name",
+ )
parser.add_argument(
"--text_file_name",
default="coco30k",
choices=["coco1k", "coco10k", "coco30k"],
type=str,
- help="text file.", )
+ help="text file.",
+ )
parser.add_argument(
"--clip_model_name_or_path",
default="openai/clip-vit-base-patch32",
type=str,
- help="clip_model_name_or_path", )
- parser.add_argument(
- "--fid_batch_size", default=32, type=int, help="fid_batch_size")
- parser.add_argument(
- "--clip_batch_size", default=64, type=int, help="clip_batch_size")
- parser.add_argument(
- "--resolution", default=256, type=int, help="resolution of images")
+ help="clip_model_name_or_path",
+ )
+ parser.add_argument("--fid_batch_size", default=32, type=int, help="fid_batch_size")
+ parser.add_argument("--clip_batch_size", default=64, type=int, help="clip_batch_size")
+ parser.add_argument("--resolution", default=256, type=int, help="resolution of images")
parser.add_argument("--device", default="gpu", type=str, help="device")
parser.add_argument(
"--only_fid",
action="store_true",
- help=("Only eval fid. "), )
+ help=("Only eval fid. "),
+ )
args = parser.parse_args()
paddle.set_device(args.device)
@@ -127,11 +123,9 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64):
else:
os.environ["FLAG_IMAGE_NUM"] = "1000"
dataset_name = f"coco_{args.resolution}_{image_num}.npz"
- fid_target_file = get_path_from_url(base_url + dataset_name,
- cache_path) + ".npz"
+ fid_target_file = get_path_from_url(base_url + dataset_name, cache_path) + ".npz"
- text_file = get_path_from_url(base_url + text_file_name + ".tsv",
- cache_path)
+ text_file = get_path_from_url(base_url + text_file_name + ".tsv", cache_path)
df = pd.read_csv(text_file, sep="\t")
texts = df["caption_en"].tolist()
if not args.only_fid:
@@ -149,18 +143,16 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64):
[fid_target_file, path],
batch_size=args.fid_batch_size,
dims=2048,
- num_workers=4, )
+ num_workers=4,
+ )
results["fid"].append(fid_value)
if not args.only_fid:
# clip score
- images_path = sorted([
- image_path
- for ext in IMAGE_EXTENSIONS
- for image_path in pathlib.Path(path).glob("*.{}".format(ext))
- ])
- clip_score = compute_clip_score(model, processor, texts,
- images_path, args.clip_batch_size)
+ images_path = sorted(
+ [image_path for ext in IMAGE_EXTENSIONS for image_path in pathlib.Path(path).glob("*.{}".format(ext))]
+ )
+ clip_score = compute_clip_score(model, processor, texts, images_path, args.clip_batch_size)
if "clip_score" not in results:
results["clip_score"] = []
_clip_score = clip_score.mean().item()
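Aside: compute_clip_score above reduces to a scaled cosine similarity once both embedding batches are L2-normalized. A small numeric sketch of that reduction with random toy embeddings; the logit scale below is an assumed typical trained CLIP value, whereas the script reads it from the loaded model.

import numpy as np

text_embeds = np.random.randn(2, 512).astype("float32")
image_embeds = np.random.randn(2, 512).astype("float32")
text_embeds /= np.linalg.norm(text_embeds, axis=-1, keepdims=True)
image_embeds /= np.linalg.norm(image_embeds, axis=-1, keepdims=True)

logit_scale_exp = 100.0  # assumed stand-in for model.logit_scale.exp()
clip_score = (image_embeds * text_embeds).sum(-1) * logit_scale_exp
print(clip_score.shape)  # one score per (caption, image) pair, averaged afterwards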
diff --git a/ppdiffusers/scripts/fid_clip_score/fid_score.py b/ppdiffusers/scripts/fid_clip_score/fid_score.py
index c73e4597015ad..9c6a81cb351c9 100755
--- a/ppdiffusers/scripts/fid_clip_score/fid_score.py
+++ b/ppdiffusers/scripts/fid_clip_score/fid_score.py
@@ -67,42 +67,37 @@ def tqdm(x):
from inception import InceptionV3
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
-parser.add_argument(
- "--batch-size", type=int, default=50, help="Batch size to use")
-parser.add_argument(
- "--resolution", type=int, default=None, help="The resolution to resize.")
+parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use")
+parser.add_argument("--resolution", type=int, default=None, help="The resolution to resize.")
parser.add_argument(
"--num-workers",
type=int,
- help=("Number of processes to use for data loading. "
- "Defaults to `min(8, num_cpus)`"), )
-parser.add_argument(
- "--device",
- type=str,
- default=None,
- help="Device to use. Like cuda, cuda:0 or cpu")
+ help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"),
+)
+parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu")
parser.add_argument(
"--dims",
type=int,
default=2048,
choices=list(InceptionV3.BLOCK_INDEX_BY_DIM),
- help=("Dimensionality of Inception features to use. "
- "By default, uses pool3 features"), )
+ help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"),
+)
parser.add_argument(
"--save-stats",
action="store_true",
- help=("Generate an npz archive from a directory of samples. "
- "The first path is used as input and the second as output."), )
+ help=(
+ "Generate an npz archive from a directory of samples. "
+ "The first path is used as input and the second as output."
+ ),
+)
parser.add_argument(
"path",
type=str,
nargs=2,
- help=("Paths to the generated images or "
- "to .npz statistic files"), )
+ help=("Paths to the generated images or " "to .npz statistic files"),
+)
-IMAGE_EXTENSIONS = {
- "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"
-}
+IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"}
class ImagePathDataset(paddle.io.Dataset):
@@ -125,12 +120,7 @@ def __getitem__(self, i):
return {"img": img}
-def get_activations(files,
- model,
- batch_size=50,
- dims=2048,
- num_workers=1,
- resolution=None):
+def get_activations(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None):
"""Calculates the activations of the pool_3 layer for all images.
Params:
@@ -152,18 +142,17 @@ def get_activations(files,
model.eval()
if batch_size > len(files):
- print(("Warning: batch size is bigger than the data size. "
- "Setting batch size to data size"))
+ print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size"))
batch_size = len(files)
- dataset = ImagePathDataset(
- files, transforms=TF.ToTensor(), resolution=resolution)
+ dataset = ImagePathDataset(files, transforms=TF.ToTensor(), resolution=resolution)
dataloader = paddle.io.DataLoader(
dataset,
batch_size=batch_size,
shuffle=False,
drop_last=False,
- num_workers=num_workers, )
+ num_workers=num_workers,
+ )
pred_arr = np.empty((len(files), dims))
@@ -181,7 +170,7 @@ def get_activations(files,
pred = pred.squeeze(3).squeeze(2).cpu().numpy()
- pred_arr[start_idx:start_idx + pred.shape[0]] = pred
+ pred_arr[start_idx : start_idx + pred.shape[0]] = pred
start_idx = start_idx + pred.shape[0]
@@ -216,18 +205,15 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
sigma1 = np.atleast_2d(sigma1)
sigma2 = np.atleast_2d(sigma2)
- assert (mu1.shape == mu2.shape
- ), "Training and test mean vectors have different lengths"
- assert (sigma1.shape == sigma2.shape
- ), "Training and test covariances have different dimensions"
+ assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
+ assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"
diff = mu1 - mu2
# Product might be almost singular
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
if not np.isfinite(covmean).all():
- msg = ("fid calculation produces singular product; "
- "adding %s to diagonal of cov estimates") % eps
+ msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps
print(msg)
offset = np.eye(sigma1.shape[0]) * eps
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
@@ -244,12 +230,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
-def calculate_activation_statistics(files,
- model,
- batch_size=50,
- dims=2048,
- num_workers=1,
- resolution=None):
+def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None):
"""Calculation of the statistics used by the FID.
Params:
-- files : List of image files paths
@@ -266,43 +247,28 @@ def calculate_activation_statistics(files,
-- sigma : The covariance matrix of the activations of the pool_3 layer of
the inception model.
"""
- act = get_activations(
- files, model, batch_size, dims, num_workers, resolution=resolution)
+ act = get_activations(files, model, batch_size, dims, num_workers, resolution=resolution)
mu = np.mean(act, axis=0)
sigma = np.cov(act, rowvar=False)
return mu, sigma
-def compute_statistics_of_path(path,
- model,
- batch_size,
- dims,
- num_workers=1,
- resolution=None):
+def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1, resolution=None):
if path.endswith(".npz"):
with np.load(path) as f:
m, s = f["mu"][:], f["sigma"][:]
else:
path = pathlib.Path(path)
- files = sorted([
- file
- for ext in IMAGE_EXTENSIONS
- for file in path.glob("*.{}".format(ext))
- ])
+ files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))])
FLAG_IMAGE_NUM = os.getenv("FLAG_IMAGE_NUM", None)
if FLAG_IMAGE_NUM is not None:
- files = files[:int(FLAG_IMAGE_NUM)]
- m, s = calculate_activation_statistics(
- files, model, batch_size, dims, num_workers, resolution=resolution)
+ files = files[: int(FLAG_IMAGE_NUM)]
+ m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers, resolution=resolution)
return m, s
-def calculate_fid_given_paths(paths,
- batch_size,
- dims,
- num_workers=1,
- resolution=None):
+def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1, resolution=None):
"""Calculates the FID of two paths"""
for p in paths:
if not os.path.exists(p):
@@ -312,11 +278,9 @@ def calculate_fid_given_paths(paths,
model = InceptionV3([block_idx])
- m1, s1 = compute_statistics_of_path(
- paths[0], model, batch_size, dims, num_workers, resolution=resolution)
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution)
- m2, s2 = compute_statistics_of_path(
- paths[1], model, batch_size, dims, num_workers, resolution=resolution)
+ m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers, resolution=resolution)
fid_value = calculate_frechet_distance(m1, s1, m2, s2)
@@ -337,8 +301,7 @@ def save_fid_stats(paths, batch_size, dims, num_workers=1, resolution=None):
print(f"Saving statistics for {paths[0]}")
- m1, s1 = compute_statistics_of_path(
- paths[0], model, batch_size, dims, num_workers, resolution=resolution)
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution)
np.savez_compressed(paths[1], mu=m1, sigma=s1)
@@ -367,15 +330,13 @@ def main():
args.batch_size,
args.dims,
num_workers,
- resolution=args.resolution, )
+ resolution=args.resolution,
+ )
return
fid_value = calculate_fid_given_paths(
- args.path,
- args.batch_size,
- args.dims,
- num_workers,
- resolution=args.resolution)
+ args.path, args.batch_size, args.dims, num_workers, resolution=args.resolution
+ )
print("FID: ", fid_value)
diff --git a/ppdiffusers/scripts/fid_clip_score/inception.py b/ppdiffusers/scripts/fid_clip_score/inception.py
index 9aecdf265779a..bbdff9a933432 100644
--- a/ppdiffusers/scripts/fid_clip_score/inception.py
+++ b/ppdiffusers/scripts/fid_clip_score/inception.py
@@ -21,7 +21,8 @@
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL = (
"https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams",
- "8e2ae24c34c5c8b81d45167bb9361f4c", )
+ "8e2ae24c34c5c8b81d45167bb9361f4c",
+)
WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams"
@@ -47,17 +48,18 @@ class ConvNormActivation(nn.Sequential):
"""
def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=None,
- groups=1,
- norm_layer=nn.BatchNorm2D,
- activation_layer=nn.ReLU,
- dilation=1,
- bias=None, ):
+ self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=None,
+ groups=1,
+ norm_layer=nn.BatchNorm2D,
+ activation_layer=nn.ReLU,
+ dilation=1,
+ bias=None,
+ ):
if padding is None:
padding = (kernel_size - 1) // 2 * dilation
if bias is None:
@@ -71,7 +73,8 @@ def __init__(
padding,
dilation=dilation,
groups=groups,
- bias_attr=bias, )
+ bias_attr=bias,
+ )
]
if norm_layer is not None:
# The hyperparameter of BatchNorm2D is different from PaddlePaddle.
@@ -97,12 +100,13 @@ class InceptionV3(nn.Layer):
}
def __init__(
- self,
- output_blocks=(DEFAULT_BLOCK_INDEX, ),
- resize_input=True,
- normalize_input=True,
- requires_grad=False,
- use_fid_inception=True, ):
+ self,
+ output_blocks=(DEFAULT_BLOCK_INDEX,),
+ resize_input=True,
+ normalize_input=True,
+ requires_grad=False,
+ use_fid_inception=True,
+ ):
"""Build pretrained InceptionV3
Parameters
@@ -211,8 +215,7 @@ def forward(self, inp):
outp = []
x = inp
if self.resize_input:
- x = F.interpolate(
- x, size=(299, 299), mode="bilinear", align_corners=False)
+ x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False)
if self.normalize_input:
x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
@@ -235,8 +238,7 @@ def hack_bn_layer(layer):
def _inception_v3(*args, **kwargs):
"""Wraps `paddle.vision.models.inception_v3`"""
- return paddle.vision.models.inception_v3(*args,
- **kwargs).apply(hack_bn_layer)
+ return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer)
def fid_inception_v3():
@@ -248,8 +250,7 @@ def fid_inception_v3():
This method first constructs paddle.vision's Inception and then patches the
necessary parts that are different in the FID Inception model.
"""
- inception = _inception_v3(
- num_classes=1008, with_pool=True, pretrained=False)
+ inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False)
inception.inception_block_list[0] = InceptionA(192, pool_features=32)
inception.inception_block_list[1] = InceptionA(256, pool_features=64)
inception.inception_block_list[2] = InceptionA(288, pool_features=64)
@@ -260,8 +261,7 @@ def fid_inception_v3():
inception.inception_block_list[9] = InceptionE_1(1280)
inception.inception_block_list[10] = InceptionE_2(2048)
- weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0],
- FID_WEIGHTS_URL[1])
+ weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1])
state_dict = paddle.load(weight_path)
inception.set_state_dict(state_dict)
return inception
@@ -275,49 +275,55 @@ def __init__(self, num_channels, pool_features):
out_channels=64,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch5x5_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=48,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch5x5_2 = ConvNormActivation(
in_channels=48,
out_channels=64,
kernel_size=5,
padding=2,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=64,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_2 = ConvNormActivation(
in_channels=64,
out_channels=96,
kernel_size=3,
padding=1,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_3 = ConvNormActivation(
in_channels=96,
out_channels=96,
kernel_size=3,
padding=1,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
# Patch: Tensorflow's average pool does not use the padded zero's in
# its average calculation
- self.branch_pool = nn.AvgPool2D(
- kernel_size=3, stride=1, padding=1, exclusive=True)
+ self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)
self.branch_pool_conv = ConvNormActivation(
in_channels=num_channels,
out_channels=pool_features,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
def forward(self, x):
branch1x1 = self.branch1x1(x)
@@ -330,8 +336,7 @@ def forward(self, x):
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
return x
@@ -343,7 +348,8 @@ def __init__(self, num_channels, channels_7x7):
out_channels=192,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7_1 = ConvNormActivation(
in_channels=num_channels,
@@ -351,62 +357,70 @@ def __init__(self, num_channels, channels_7x7):
kernel_size=1,
stride=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7_2 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(1, 7),
stride=1,
padding=(0, 3),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7_3 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=192,
kernel_size=(7, 1),
stride=1,
padding=(3, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=channels_7x7,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_2 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(7, 1),
padding=(3, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_3 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(1, 7),
padding=(0, 3),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_4 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=channels_7x7,
kernel_size=(7, 1),
padding=(3, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch7x7dbl_5 = ConvNormActivation(
in_channels=channels_7x7,
out_channels=192,
kernel_size=(1, 7),
padding=(0, 3),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
# Patch: Tensorflow's average pool does not use the padded zero's in
# its average calculation
- self.branch_pool = nn.AvgPool2D(
- kernel_size=3, stride=1, padding=1, exclusive=True)
+ self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)
self.branch_pool_conv = ConvNormActivation(
in_channels=num_channels,
out_channels=192,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
def forward(self, x):
branch1x1 = self.branch1x1(x)
@@ -424,8 +438,7 @@ def forward(self, x):
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
return x
@@ -438,61 +451,69 @@ def __init__(self, num_channels):
out_channels=320,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=384,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3_2a = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(1, 3),
padding=(0, 1),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3_2b = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(3, 1),
padding=(1, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_1 = ConvNormActivation(
in_channels=num_channels,
out_channels=448,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_2 = ConvNormActivation(
in_channels=448,
out_channels=384,
kernel_size=3,
padding=1,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_3a = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(1, 3),
padding=(0, 1),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
self.branch3x3dbl_3b = ConvNormActivation(
in_channels=384,
out_channels=384,
kernel_size=(3, 1),
padding=(1, 0),
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
# Patch: Tensorflow's average pool does not use the padded zero's in
# its average calculation
- self.branch_pool = nn.AvgPool2D(
- kernel_size=3, stride=1, padding=1, exclusive=True)
+ self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)
self.branch_pool_conv = ConvNormActivation(
in_channels=num_channels,
out_channels=192,
kernel_size=1,
padding=0,
- activation_layer=nn.ReLU, )
+ activation_layer=nn.ReLU,
+ )
def forward(self, x):
branch1x1 = self.branch1x1(x)
@@ -515,8 +536,7 @@ def forward(self, x):
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
return x
@@ -549,6 +569,5 @@ def forward(self, x):
branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool_conv(branch_pool)
- x = paddle.concat(
- [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
+ x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
return x
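Aside: the "Patch: Tensorflow's average pool ..." comments in this file are what the exclusive=True argument implements. A toy check of the behavioural difference at an image border (my own example, not repository code).

import paddle

x = paddle.ones([1, 1, 4, 4])
inclusive = paddle.nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=False)(x)
exclusive = paddle.nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)(x)
print(float(inclusive[0, 0, 0, 0]))  # ~0.444: the 5 padded zeros are counted in the average
print(float(exclusive[0, 0, 0, 0]))  # 1.0: only the 4 real pixels are averaged, as in TensorFlow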
diff --git a/ppdiffusers/setup.py b/ppdiffusers/setup.py
index bb412f60fc4f4..a5d0f3cf3b5e9 100644
--- a/ppdiffusers/setup.py
+++ b/ppdiffusers/setup.py
@@ -57,10 +57,7 @@ def read_requirements():
keywords=["ppdiffusers", "paddle", "paddlemix"],
install_requires=REQUIRED_PACKAGES,
python_requires=">=3.6",
- entry_points={
- "console_scripts":
- ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]
- },
+ entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]},
classifiers=[
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
@@ -70,4 +67,5 @@ def read_requirements():
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
],
- license="Apache 2.0", )
+ license="Apache 2.0",
+)
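Note: the console_scripts entry point above is what makes the ppdiffusers-cli command available after installation; conceptually it is a thin wrapper around the declared module:function target (illustration only, not repository code).

from ppdiffusers.commands.ppdiffusers_cli import main

if __name__ == "__main__":
    main()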
diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py
index 7f987b99141b8..aa10a342c68d4 100644
--- a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py
+++ b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py
@@ -38,13 +38,14 @@ def __init__(self, unet, scheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- generator: Optional[paddle.Generator]=None,
- num_inference_steps: int=50,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- **kwargs, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ generator: Optional[paddle.Generator] = None,
+ num_inference_steps: int = 50,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -74,8 +75,10 @@ def __call__(
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size,
- self.unet.config.sample_size, ),
- generator=generator, )
+ self.unet.config.sample_size,
+ ),
+ generator=generator,
+ )
# set step values
self.scheduler.set_timesteps(num_inference_steps)
@@ -95,6 +98,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, ), "This is a local test"
+ return (image,), "This is a local test"
return ImagePipelineOutput(images=image), "This is a local test"
diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py
index d562cd9e580cc..ebdc7650dafd2 100644
--- a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py
+++ b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py
@@ -38,13 +38,14 @@ def __init__(self, unet, scheduler):
@paddle.no_grad()
def __call__(
- self,
- batch_size: int=1,
- generator: Optional[paddle.Generator]=None,
- num_inference_steps: int=50,
- output_type: Optional[str]="pil",
- return_dict: bool=True,
- **kwargs, ) -> Union[ImagePipelineOutput, Tuple]:
+ self,
+ batch_size: int = 1,
+ generator: Optional[paddle.Generator] = None,
+ num_inference_steps: int = 50,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):
@@ -74,8 +75,10 @@ def __call__(
batch_size,
self.unet.config.in_channels,
self.unet.config.sample_size,
- self.unet.config.sample_size, ),
- generator=generator, )
+ self.unet.config.sample_size,
+ ),
+ generator=generator,
+ )
# set step values
self.scheduler.set_timesteps(num_inference_steps)
@@ -95,6 +98,6 @@ def __call__(
image = self.numpy_to_pil(image)
if not return_dict:
- return (image, ), "This is a local test"
+ return (image,), "This is a local test"
return ImagePipelineOutput(images=image), "This is a local test"
diff --git a/ppdiffusers/tests/models/test_attention_processor.py b/ppdiffusers/tests/models/test_attention_processor.py
index 84b2d1e9263cb..f47ddfa4abb1d 100644
--- a/ppdiffusers/tests/models/test_attention_processor.py
+++ b/ppdiffusers/tests/models/test_attention_processor.py
@@ -16,12 +16,11 @@
import paddle
-from ppdiffusers.models.attention_processor import (Attention,
- AttnAddedKVProcessor)
+from ppdiffusers.models.attention_processor import Attention, AttnAddedKVProcessor
class AttnAddedKVProcessorTests(unittest.TestCase):
- def get_constructor_arguments(self, only_cross_attention: bool=False):
+ def get_constructor_arguments(self, only_cross_attention: bool = False):
query_dim = 10
if only_cross_attention:
@@ -59,8 +58,7 @@ def test_only_cross_attention(self):
paddle.seed(0)
- constructor_args = self.get_constructor_arguments(
- only_cross_attention=False)
+ constructor_args = self.get_constructor_arguments(only_cross_attention=False)
attn = Attention(**constructor_args)
self.assertTrue(attn.to_k is not None)
@@ -68,7 +66,8 @@ def test_only_cross_attention(self):
forward_args = self.get_forward_arguments(
query_dim=constructor_args["query_dim"],
- added_kv_proj_dim=constructor_args["added_kv_proj_dim"], )
+ added_kv_proj_dim=constructor_args["added_kv_proj_dim"],
+ )
self_and_cross_attn_out = attn(**forward_args)
@@ -76,8 +75,7 @@ def test_only_cross_attention(self):
paddle.seed(0)
- constructor_args = self.get_constructor_arguments(
- only_cross_attention=True)
+ constructor_args = self.get_constructor_arguments(only_cross_attention=True)
attn = Attention(**constructor_args)
self.assertTrue(attn.to_k is None)
@@ -85,7 +83,8 @@ def test_only_cross_attention(self):
forward_args = self.get_forward_arguments(
query_dim=constructor_args["query_dim"],
- added_kv_proj_dim=constructor_args["added_kv_proj_dim"], )
+ added_kv_proj_dim=constructor_args["added_kv_proj_dim"],
+ )
only_cross_attn_out = attn(**forward_args)
diff --git a/ppdiffusers/tests/models/test_layers_utils.py b/ppdiffusers/tests/models/test_layers_utils.py
index 6bfcd5b37fbab..32480c6e215df 100644
--- a/ppdiffusers/tests/models/test_layers_utils.py
+++ b/ppdiffusers/tests/models/test_layers_utils.py
@@ -19,8 +19,12 @@
import paddle
import paddle.nn
-from ppdiffusers.models.attention import (GEGLU, AdaLayerNorm, ApproximateGELU,
- AttentionBlock)
+from ppdiffusers.models.attention import (
+ GEGLU,
+ AdaLayerNorm,
+ ApproximateGELU,
+ AttentionBlock,
+)
from ppdiffusers.models.embeddings import get_timestep_embedding
from ppdiffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
from ppdiffusers.models.transformer_2d import Transformer2DModel
@@ -31,8 +35,8 @@ def test_timestep_embeddings(self):
embedding_dim = 256
timesteps = paddle.arange(start=16)
t1 = get_timestep_embedding(timesteps, embedding_dim)
- assert (t1[0, :embedding_dim // 2] - 0).abs().sum() < 1e-05
- assert (t1[0, embedding_dim // 2:] - 1).abs().sum() < 1e-05
+ assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-05
+ assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-05
assert (t1[:, -1] - 1).abs().sum() < 1e-05
grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1)
prev_grad = 0.0
@@ -49,72 +53,59 @@ def test_timestep_defaults(self):
embedding_dim,
flip_sin_to_cos=False,
downscale_freq_shift=1,
- max_period=10000, )
+ max_period=10000,
+ )
assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01)
def test_timestep_flip_sin_cos(self):
embedding_dim = 16
timesteps = paddle.arange(start=10)
- t1 = get_timestep_embedding(
- timesteps, embedding_dim, flip_sin_to_cos=True)
- t1 = paddle.concat(
- x=[t1[:, embedding_dim // 2:], t1[:, :embedding_dim // 2]], axis=-1)
- t2 = get_timestep_embedding(
- timesteps, embedding_dim, flip_sin_to_cos=False)
+ t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True)
+ t1 = paddle.concat(x=[t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], axis=-1)
+ t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False)
assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01)
def test_timestep_downscale_freq_shift(self):
embedding_dim = 16
timesteps = paddle.arange(start=10)
- t1 = get_timestep_embedding(
- timesteps, embedding_dim, downscale_freq_shift=0)
- t2 = get_timestep_embedding(
- timesteps, embedding_dim, downscale_freq_shift=1)
- cosine_half = (t1 - t2)[:, embedding_dim // 2:]
+ t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0)
+ t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1)
+ cosine_half = (t1 - t2)[:, embedding_dim // 2 :]
assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-05
def test_sinoid_embeddings_hardcoded(self):
embedding_dim = 64
timesteps = paddle.arange(start=128)
- t1 = get_timestep_embedding(
- timesteps,
- embedding_dim,
- downscale_freq_shift=1,
- flip_sin_to_cos=False)
- t2 = get_timestep_embedding(
- timesteps,
- embedding_dim,
- downscale_freq_shift=0,
- flip_sin_to_cos=True)
+ t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False)
+ t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True)
t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000)
assert paddle.allclose(
t1[23:26, 47:50].flatten().cpu(),
- paddle.to_tensor([
- 0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769,
- 0.9872
- ]),
- atol=0.01, )
+ paddle.to_tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]),
+ atol=0.01,
+ )
assert paddle.allclose(
t2[23:26, 47:50].flatten().cpu(),
- paddle.to_tensor([
- 0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474,
- 0.1864
- ]),
- atol=0.01, )
+ paddle.to_tensor([0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474, 0.1864]),
+ atol=0.01,
+ )
assert paddle.allclose(
t3[23:26, 47:50].flatten().cpu(),
- paddle.to_tensor([
- -0.9801,
- -0.9464,
- -0.9349,
- -0.3952,
- 0.8887,
- -0.9709,
- 0.5299,
- -0.2853,
- -0.9927,
- ]),
- atol=0.01, )
+ paddle.to_tensor(
+ [
+ -0.9801,
+ -0.9464,
+ -0.9349,
+ -0.3952,
+ 0.8887,
+ -0.9709,
+ 0.5299,
+ -0.2853,
+ -0.9927,
+ ]
+ ),
+ atol=0.01,
+ )
class Upsample2DBlockTests(unittest.TestCase):
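Aside: the hardcoded expectations in the timestep-embedding tests above follow the usual sinusoidal construction, half sine terms followed by half cosine terms over geometrically spaced frequencies. A rough numpy rendering under those assumed conventions; the exact scale and flip_sin_to_cos handling lives in ppdiffusers' get_timestep_embedding itself.

import numpy as np


def timestep_embedding(timesteps, dim, max_period=10000.0, downscale_freq_shift=1.0):
    half = dim // 2
    freqs = np.exp(-np.log(max_period) * np.arange(half) / (half - downscale_freq_shift))
    args = timesteps[:, None].astype("float32") * freqs[None, :]
    return np.concatenate([np.sin(args), np.cos(args)], axis=-1)


emb = timestep_embedding(np.arange(16), 256)
# At t=0 the sine half is ~0 and the cosine half is ~1, matching test_timestep_embeddings.
print(np.abs(emb[0, :128]).sum() < 1e-5, np.abs(emb[0, 128:] - 1).sum() < 1e-5)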
@@ -126,19 +117,20 @@ def test_upsample_default(self):
upsampled = upsample(sample)
assert tuple(upsampled.shape) == (1, 32, 64, 64)
output_slice = upsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -1.50215650,
- -0.12905766,
- -0.12905766,
- -1.97015178,
- 0.78776687,
- 0.78776687,
- -1.97015178,
- 0.78776687,
- 0.78776687,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -1.50215650,
+ -0.12905766,
+ -0.12905766,
+ -1.97015178,
+ 0.78776687,
+ 0.78776687,
+ -1.97015178,
+ 0.78776687,
+ 0.78776687,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_upsample_with_conv(self):
paddle.seed(0)
@@ -148,19 +140,20 @@ def test_upsample_with_conv(self):
upsampled = upsample(sample)
assert tuple(upsampled.shape) == (1, 32, 64, 64)
output_slice = upsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 0.4583871364593506,
- -0.8221798539161682,
- -0.8228907585144043,
- 0.3325321078300476,
- -0.24422502517700195,
- 1.344732642173767,
- 0.5239212512969971,
- -0.4814918637275696,
- 0.17928099632263184,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 0.4583871364593506,
+ -0.8221798539161682,
+ -0.8228907585144043,
+ 0.3325321078300476,
+ -0.24422502517700195,
+ 1.344732642173767,
+ 0.5239212512969971,
+ -0.4814918637275696,
+ 0.17928099632263184,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_upsample_with_conv_out_dim(self):
paddle.seed(0)
@@ -170,42 +163,43 @@ def test_upsample_with_conv_out_dim(self):
upsampled = upsample(sample)
assert tuple(upsampled.shape) == (1, 64, 64, 64)
output_slice = upsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 0.9049283266067505,
- -1.6125869750976562,
- -1.0837469100952148,
- 0.24520659446716309,
- -0.6669139266014099,
- 0.5660533905029297,
- 1.1056761741638184,
- 2.1717309951782227,
- 0.7197026610374451,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 0.9049283266067505,
+ -1.6125869750976562,
+ -1.0837469100952148,
+ 0.24520659446716309,
+ -0.6669139266014099,
+ 0.5660533905029297,
+ 1.1056761741638184,
+ 2.1717309951782227,
+ 0.7197026610374451,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_upsample_with_transpose(self):
paddle.seed(0)
sample = paddle.randn(shape=[1, 32, 32, 32])
- upsample = Upsample2D(
- channels=32, use_conv=False, use_conv_transpose=True)
+ upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True)
with paddle.no_grad():
upsampled = upsample(sample)
assert tuple(upsampled.shape) == (1, 32, 64, 64)
output_slice = upsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.05951342731714249,
- 0.26951998472213745,
- 0.2600363492965698,
- 1.12237548828125,
- -0.07744798064231873,
- 0.006375734228640795,
- 0.6678807735443115,
- 0.44324278831481934,
- -0.10978640615940094,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.05951342731714249,
+ 0.26951998472213745,
+ 0.2600363492965698,
+ 1.12237548828125,
+ -0.07744798064231873,
+ 0.006375734228640795,
+ 0.6678807735443115,
+ 0.44324278831481934,
+ -0.10978640615940094,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
class Downsample2DBlockTests(unittest.TestCase):
@@ -217,17 +211,19 @@ def test_downsample_default(self):
downsampled = downsample(sample)
assert tuple(downsampled.shape) == (1, 32, 32, 32)
output_slice = downsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.24012964963912964,
- -0.034197285771369934,
- -1.0328047275543213,
- 0.7861506938934326,
- -0.2086063176393509,
- -0.3999312222003937,
- 0.25081655383110046,
- -0.23891538381576538,
- -1.4398303031921387,
- ])
+ expected_slice = paddle.to_tensor(
+ [
+ -0.24012964963912964,
+ -0.034197285771369934,
+ -1.0328047275543213,
+ 0.7861506938934326,
+ -0.2086063176393509,
+ -0.3999312222003937,
+ 0.25081655383110046,
+ -0.23891538381576538,
+ -1.4398303031921387,
+ ]
+ )
max_diff = (output_slice.flatten() - expected_slice).abs().sum().item()
assert max_diff <= 0.001
@@ -239,19 +235,20 @@ def test_downsample_with_conv(self):
downsampled = downsample(sample)
assert tuple(downsampled.shape) == (1, 32, 32, 32)
output_slice = downsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.009430217556655407,
- 0.8657761216163635,
- 1.7985490560531616,
- -0.61894291639328,
- -2.5752196311950684,
- 1.2352519035339355,
- 0.6046919822692871,
- -1.6499173641204834,
- -1.5272349119186401,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.009430217556655407,
+ 0.8657761216163635,
+ 1.7985490560531616,
+ -0.61894291639328,
+ -2.5752196311950684,
+ 1.2352519035339355,
+ 0.6046919822692871,
+ -1.6499173641204834,
+ -1.5272349119186401,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_downsample_with_conv_pad1(self):
paddle.seed(0)
@@ -261,19 +258,20 @@ def test_downsample_with_conv_pad1(self):
downsampled = downsample(sample)
assert tuple(downsampled.shape) == (1, 32, 32, 32)
output_slice = downsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.009430217556655407,
- 0.8657761216163635,
- 1.7985490560531616,
- -0.61894291639328,
- -2.5752196311950684,
- 1.2352519035339355,
- 0.6046919822692871,
- -1.6499173641204834,
- -1.5272349119186401,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.009430217556655407,
+ 0.8657761216163635,
+ 1.7985490560531616,
+ -0.61894291639328,
+ -2.5752196311950684,
+ 1.2352519035339355,
+ 0.6046919822692871,
+ -1.6499173641204834,
+ -1.5272349119186401,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_downsample_with_conv_out_dim(self):
paddle.seed(0)
@@ -283,19 +281,20 @@ def test_downsample_with_conv_out_dim(self):
downsampled = downsample(sample)
assert tuple(downsampled.shape) == (1, 16, 32, 32)
output_slice = downsampled[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 0.10819266736507416,
- 0.43043053150177,
- -0.7322822213172913,
- -1.923148512840271,
- 1.0195047855377197,
- 0.48796477913856506,
- 1.6765365600585938,
- -4.072991847991943,
- 0.8763526082038879,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 0.10819266736507416,
+ 0.43043053150177,
+ -0.7322822213172913,
+ -1.923148512840271,
+ 1.0195047855377197,
+ 0.48796477913856506,
+ 1.6765365600585938,
+ -4.072991847991943,
+ 0.8763526082038879,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
class ResnetBlock2DTests(unittest.TestCase):
@@ -308,43 +307,44 @@ def test_resnet_default(self):
output_tensor = resnet_block(sample, temb)
assert tuple(output_tensor.shape) == (1, 32, 64, 64)
output_slice = output_tensor[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 1.9816107749938965,
- 1.4443503618240356,
- -1.0354782342910767,
- 0.23985600471496582,
- -1.0868161916732788,
- -1.5830397605895996,
- -0.041037797927856445,
- -1.2574901580810547,
- -0.5504958629608154,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 1.9816107749938965,
+ 1.4443503618240356,
+ -1.0354782342910767,
+ 0.23985600471496582,
+ -1.0868161916732788,
+ -1.5830397605895996,
+ -0.041037797927856445,
+ -1.2574901580810547,
+ -0.5504958629608154,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_restnet_with_use_in_shortcut(self):
paddle.seed(0)
sample = paddle.randn(shape=[1, 32, 64, 64])
temb = paddle.randn(shape=[1, 128])
- resnet_block = ResnetBlock2D(
- in_channels=32, temb_channels=128, use_in_shortcut=True)
+ resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True)
with paddle.no_grad():
output_tensor = resnet_block(sample, temb)
assert tuple(output_tensor.shape) == (1, 32, 64, 64)
output_slice = output_tensor[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.9861348867416382,
- -1.097771406173706,
- 0.268703430891037,
- 0.40997087955474854,
- -4.26219367980957,
- 1.758486270904541,
- -0.8979732990264893,
- 0.30774950981140137,
- 3.2780206203460693,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.9861348867416382,
+ -1.097771406173706,
+ 0.268703430891037,
+ 0.40997087955474854,
+ -4.26219367980957,
+ 1.758486270904541,
+ -0.8979732990264893,
+ 0.30774950981140137,
+ 3.2780206203460693,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_resnet_up(self):
paddle.seed(0)
@@ -355,91 +355,92 @@ def test_resnet_up(self):
output_tensor = resnet_block(sample, temb)
assert tuple(output_tensor.shape) == (1, 32, 128, 128)
output_slice = output_tensor[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 0.2874237298965454,
- -2.6432056427001953,
- -2.1900298595428467,
- -0.48899877071380615,
- -1.1637755632400513,
- -1.084446907043457,
- -1.1333439350128174,
- 0.2726985812187195,
- -0.014697253704071045,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 0.2874237298965454,
+ -2.6432056427001953,
+ -2.1900298595428467,
+ -0.48899877071380615,
+ -1.1637755632400513,
+ -1.084446907043457,
+ -1.1333439350128174,
+ 0.2726985812187195,
+ -0.014697253704071045,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_resnet_down(self):
paddle.seed(0)
sample = paddle.randn(shape=[1, 32, 64, 64])
temb = paddle.randn(shape=[1, 128])
- resnet_block = ResnetBlock2D(
- in_channels=32, temb_channels=128, down=True)
+ resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True)
with paddle.no_grad():
output_tensor = resnet_block(sample, temb)
assert tuple(output_tensor.shape) == (1, 32, 32, 32)
output_slice = output_tensor[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 1.54087495803833,
- 0.26700693368911743,
- -0.540952742099762,
- 2.7190208435058594,
- -0.09766747057437897,
- 0.23407122492790222,
- 0.47980907559394836,
- 0.6348602771759033,
- -0.75424242019653322,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 1.54087495803833,
+ 0.26700693368911743,
+ -0.540952742099762,
+ 2.7190208435058594,
+ -0.09766747057437897,
+ 0.23407122492790222,
+ 0.47980907559394836,
+ 0.6348602771759033,
+ -0.75424242019653322,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_restnet_with_kernel_fir(self):
paddle.seed(0)
sample = paddle.randn(shape=[1, 32, 64, 64])
temb = paddle.randn(shape=[1, 128])
- resnet_block = ResnetBlock2D(
- in_channels=32, temb_channels=128, kernel="fir", down=True)
+ resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True)
with paddle.no_grad():
output_tensor = resnet_block(sample, temb)
assert tuple(output_tensor.shape) == (1, 32, 32, 32)
output_slice = output_tensor[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 0.9914248585700989,
- 0.4773162007331848,
- -0.021942138671875,
- 2.482321262359619,
- 0.18839354813098907,
- 0.1516135334968567,
- 0.7221578359603882,
- 0.3920581340789795,
- -0.24661940336227417,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 0.9914248585700989,
+ 0.4773162007331848,
+ -0.021942138671875,
+ 2.482321262359619,
+ 0.18839354813098907,
+ 0.1516135334968567,
+ 0.7221578359603882,
+ 0.3920581340789795,
+ -0.24661940336227417,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_restnet_with_kernel_sde_vp(self):
paddle.seed(0)
sample = paddle.randn(shape=[1, 32, 64, 64])
temb = paddle.randn(shape=[1, 128])
- resnet_block = ResnetBlock2D(
- in_channels=32, temb_channels=128, kernel="sde_vp", down=True)
+ resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True)
with paddle.no_grad():
output_tensor = resnet_block(sample, temb)
assert tuple(output_tensor.shape) == (1, 32, 32, 32)
output_slice = output_tensor[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 1.54087495803833,
- 0.26700693368911743,
- -0.540952742099762,
- 2.7190208435058594,
- -0.09766747057437897,
- 0.23407122492790222,
- 0.47980907559394836,
- 0.6348602771759033,
- -0.7542424201965332,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 1.54087495803833,
+ 0.26700693368911743,
+ -0.540952742099762,
+ 2.7190208435058594,
+ -0.09766747057437897,
+ 0.23407122492790222,
+ 0.47980907559394836,
+ 0.6348602771759033,
+ -0.7542424201965332,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
class AttentionBlockTests(unittest.TestCase):
@@ -451,50 +452,49 @@ def test_attention_block_default(self):
num_head_channels=1,
rescale_output_factor=1.0,
eps=1e-06,
- norm_num_groups=32, )
+ norm_num_groups=32,
+ )
with paddle.no_grad():
attention_scores = attentionBlock(sample)
assert attention_scores.shape == [1, 32, 64, 64]
output_slice = attention_scores[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 1.638939619064331,
- -0.15776772797107697,
- -1.1130025386810303,
- -0.8540273904800415,
- -0.5696781873703003,
- -2.0493741035461426,
- -0.3732607960700989,
- -1.740313172340393,
- -0.5271167755126953,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 1.638939619064331,
+ -0.15776772797107697,
+ -1.1130025386810303,
+ -0.8540273904800415,
+ -0.5696781873703003,
+ -2.0493741035461426,
+ -0.3732607960700989,
+ -1.740313172340393,
+ -0.5271167755126953,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_attention_block_sd(self):
paddle.seed(0)
sample = paddle.randn(shape=[1, 512, 64, 64])
- attentionBlock = AttentionBlock(
- channels=512,
- rescale_output_factor=1.0,
- eps=1e-06,
- norm_num_groups=32)
+ attentionBlock = AttentionBlock(channels=512, rescale_output_factor=1.0, eps=1e-06, norm_num_groups=32)
with paddle.no_grad():
attention_scores = attentionBlock(sample)
assert attention_scores.shape == [1, 512, 64, 64]
output_slice = attention_scores[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.8007570505142212,
- -0.770350992679596,
- -3.5278191566467285,
- -2.0540268421173096,
- -0.7711739540100098,
- -0.8278288245201111,
- -0.48292720317840576,
- 1.6039936542510986,
- 0.626724362373352,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.8007570505142212,
+ -0.770350992679596,
+ -3.5278191566467285,
+ -2.0540268421173096,
+ -0.7711739540100098,
+ -0.8278288245201111,
+ -0.48292720317840576,
+ 1.6039936542510986,
+ 0.626724362373352,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
class Transformer2DModelTests(unittest.TestCase):
@@ -506,24 +506,26 @@ def test_spatial_transformer_default(self):
num_attention_heads=1,
attention_head_dim=32,
dropout=0.0,
- cross_attention_dim=None, )
+ cross_attention_dim=None,
+ )
with paddle.no_grad():
attention_scores = spatial_transformer_block(sample).sample
assert attention_scores.shape == [1, 32, 64, 64]
output_slice = attention_scores[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 2.6310853958129883,
- 5.990478515625,
- 0.5715246200561523,
- -2.5269505977630615,
- -2.853764057159424,
- -5.163403511047363,
- 0.2880846858024597,
- -5.925153732299805,
- 2.316770076751709,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 2.6310853958129883,
+ 5.990478515625,
+ 0.5715246200561523,
+ -2.5269505977630615,
+ -2.853764057159424,
+ -5.163403511047363,
+ 0.2880846858024597,
+ -5.925153732299805,
+ 2.316770076751709,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_spatial_transformer_cross_attention_dim(self):
paddle.seed(0)
@@ -533,25 +535,27 @@ def test_spatial_transformer_cross_attention_dim(self):
num_attention_heads=2,
attention_head_dim=32,
dropout=0.0,
- cross_attention_dim=64, )
+ cross_attention_dim=64,
+ )
with paddle.no_grad():
context = paddle.randn(shape=[1, 4, 64])
attention_scores = spatial_transformer_block(sample, context).sample
assert attention_scores.shape == [1, 64, 64, 64]
output_slice = attention_scores[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- -0.08756911754608154,
- -3.94197940826416,
- -0.25678586959838867,
- 2.1481714248657227,
- 2.327033042907715,
- 0.29948690533638,
- 1.3845969438552856,
- 0.7825677394866943,
- 1.4856826066970825,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.08756911754608154,
+ -3.94197940826416,
+ -0.25678586959838867,
+ 2.1481714248657227,
+ 2.327033042907715,
+ 0.29948690533638,
+ 1.3845969438552856,
+ 0.7825677394866943,
+ 1.4856826066970825,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_spatial_transformer_timestep(self):
paddle.seed(0)
@@ -563,44 +567,45 @@ def test_spatial_transformer_timestep(self):
attention_head_dim=32,
dropout=0.0,
cross_attention_dim=64,
- num_embeds_ada_norm=num_embeds_ada_norm, )
+ num_embeds_ada_norm=num_embeds_ada_norm,
+ )
with paddle.no_grad():
timestep_1 = paddle.to_tensor(1, dtype="int64")
timestep_2 = paddle.to_tensor(2, dtype="int64")
- attention_scores_1 = spatial_transformer_block(
- sample, timestep=timestep_1).sample
- attention_scores_2 = spatial_transformer_block(
- sample, timestep=timestep_2).sample
+ attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample
+ attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample
assert tuple(attention_scores_1.shape) == (1, 64, 64, 64)
assert tuple(attention_scores_2.shape) == (1, 64, 64, 64)
output_slice_1 = attention_scores_1[0, -1, -3:, -3:]
output_slice_2 = attention_scores_2[0, -1, -3:, -3:]
- expected_slice_1 = paddle.to_tensor([
- -0.15322405099868774,
- -1.265586018562317,
- -5.424124717712402,
- -0.7333418130874634,
- -0.5904415249824524,
- 0.9293081760406494,
- 1.1033945083618164,
- -5.200987815856934,
- -0.7598087787628174,
- ])
- expected_slice_2 = paddle.to_tensor([
- 0.12572699785232544,
- -1.0498149394989014,
- -5.207070350646973,
- -0.41757693886756897,
- -0.25374162197113037,
- 1.152648687362671,
- 1.422953724861145,
- -4.933906078338623,
- -0.564710259437561,
- ])
- assert paddle.allclose(
- output_slice_1.flatten(), expected_slice_1, atol=0.01)
- assert paddle.allclose(
- output_slice_2.flatten(), expected_slice_2, atol=0.01)
+ expected_slice_1 = paddle.to_tensor(
+ [
+ -0.15322405099868774,
+ -1.265586018562317,
+ -5.424124717712402,
+ -0.7333418130874634,
+ -0.5904415249824524,
+ 0.9293081760406494,
+ 1.1033945083618164,
+ -5.200987815856934,
+ -0.7598087787628174,
+ ]
+ )
+ expected_slice_2 = paddle.to_tensor(
+ [
+ 0.12572699785232544,
+ -1.0498149394989014,
+ -5.207070350646973,
+ -0.41757693886756897,
+ -0.25374162197113037,
+ 1.152648687362671,
+ 1.422953724861145,
+ -4.933906078338623,
+ -0.564710259437561,
+ ]
+ )
+ assert paddle.allclose(output_slice_1.flatten(), expected_slice_1, atol=0.01)
+ assert paddle.allclose(output_slice_2.flatten(), expected_slice_2, atol=0.01)
def test_spatial_transformer_dropout(self):
paddle.seed(0)
@@ -610,24 +615,26 @@ def test_spatial_transformer_dropout(self):
num_attention_heads=2,
attention_head_dim=16,
dropout=0.3,
- cross_attention_dim=None, ).eval()
+ cross_attention_dim=None,
+ ).eval()
with paddle.no_grad():
attention_scores = spatial_transformer_block(sample).sample
assert attention_scores.shape == [1, 32, 64, 64]
output_slice = attention_scores[0, -1, -3:, -3:]
- expected_slice = paddle.to_tensor([
- 2.535370349884033,
- 6.2350993156433105,
- 0.8244613409042358,
- -2.6684911251068115,
- -2.758057117462158,
- -5.176937103271484,
- 0.3372979760169983,
- -5.837750434875488,
- 2.3483340740203857,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ 2.535370349884033,
+ 6.2350993156433105,
+ 0.8244613409042358,
+ -2.6684911251068115,
+ -2.758057117462158,
+ -5.176937103271484,
+ 0.3372979760169983,
+ -5.837750434875488,
+ 2.3483340740203857,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_spatial_transformer_discrete(self):
paddle.seed(0)
@@ -637,99 +644,75 @@ def test_spatial_transformer_discrete(self):
num_attention_heads=1,
attention_head_dim=32,
num_vector_embeds=num_embed,
- sample_size=16, ).eval()
+ sample_size=16,
+ ).eval()
with paddle.no_grad():
attention_scores = spatial_transformer_block(sample).sample
assert attention_scores.shape == [1, num_embed - 1, 32]
output_slice = attention_scores[0, -2:, -3:]
- expected_slice = paddle.to_tensor([
- -0.14130862057209015,
- -0.14278407394886017,
- -0.498604953289032,
- -3.2408740520477295,
- -3.852043390274048,
- -2.099970579147339,
- ])
- assert paddle.allclose(
- output_slice.flatten(), expected_slice, atol=0.01)
+ expected_slice = paddle.to_tensor(
+ [
+ -0.14130862057209015,
+ -0.14278407394886017,
+ -0.498604953289032,
+ -3.2408740520477295,
+ -3.852043390274048,
+ -2.099970579147339,
+ ]
+ )
+ assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01)
def test_spatial_transformer_default_norm_layers(self):
- spatial_transformer_block = Transformer2DModel(
- num_attention_heads=1, attention_head_dim=32, in_channels=32)
- assert (spatial_transformer_block.transformer_blocks[0].norm1.__class__
- == paddle.nn.LayerNorm)
- assert (spatial_transformer_block.transformer_blocks[0].norm3.__class__
- == paddle.nn.LayerNorm)
+ spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32)
+ assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == paddle.nn.LayerNorm
+ assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm
def test_spatial_transformer_ada_norm_layers(self):
spatial_transformer_block = Transformer2DModel(
num_attention_heads=1,
attention_head_dim=32,
in_channels=32,
- num_embeds_ada_norm=5, )
- assert (spatial_transformer_block.transformer_blocks[0].norm1.__class__
- == AdaLayerNorm)
- assert (spatial_transformer_block.transformer_blocks[0].norm3.__class__
- == paddle.nn.LayerNorm)
+ num_embeds_ada_norm=5,
+ )
+ assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm
+ assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm
def test_spatial_transformer_default_ff_layers(self):
- spatial_transformer_block = Transformer2DModel(
- num_attention_heads=1, attention_head_dim=32, in_channels=32)
- assert (
- spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__
- == GEGLU)
- assert (
- spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__
- == paddle.nn.Dropout)
- assert (
- spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__
- == paddle.nn.Linear)
+ spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32)
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear
dim = 32
inner_dim = 128
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[0]
- .proj.weight.shape[0] == dim)
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[0]
- .proj.weight.shape[1] == inner_dim * 2)
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[2]
- .weight.shape[0] == inner_dim)
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[2]
- .weight.shape[1] == dim)
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim * 2
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim
def test_spatial_transformer_geglu_approx_ff_layers(self):
spatial_transformer_block = Transformer2DModel(
num_attention_heads=1,
attention_head_dim=32,
in_channels=32,
- activation_fn="geglu-approximate", )
- assert (
- spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__
- == ApproximateGELU)
- assert (
- spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__
- == paddle.nn.Dropout)
- assert (
- spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__
- == paddle.nn.Linear)
+ activation_fn="geglu-approximate",
+ )
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear
dim = 32
inner_dim = 128
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[0]
- .proj.weight.shape[0] == dim)
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[0]
- .proj.weight.shape[1] == inner_dim)
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[2]
- .weight.shape[0] == inner_dim)
- assert (spatial_transformer_block.transformer_blocks[0].ff.net[2]
- .weight.shape[1] == dim)
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim
+ assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim
def test_spatial_transformer_attention_bias(self):
spatial_transformer_block = Transformer2DModel(
num_attention_heads=1,
attention_head_dim=32,
in_channels=32,
- attention_bias=True, )
- assert (spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias
- is not None)
- assert (spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias
- is not None)
- assert (spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias
- is not None)
+ attention_bias=True,
+ )
+ assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None
+ assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None
+ assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None
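[Editor's note, not part of the patch] The hunks above only reformat the tests; the underlying pattern they all share is: seed, run a forward pass under no_grad, take the last 3x3 slice of the last channel, and compare it to a hard-coded reference with paddle.allclose. Below is a minimal, self-contained sketch of that pattern using only core paddle APIs; the Conv2D layer is a hypothetical stand-in for ResnetBlock2D / AttentionBlock, and the reference slice is computed on the fly rather than hard-coded.

import paddle


def check_output_slice(layer, sample, expected_slice, atol=0.01):
    # Forward pass without gradients, then compare the last 3x3 patch of the
    # last channel against a reference slice, mirroring the tests above.
    with paddle.no_grad():
        output = layer(sample)
    output_slice = output[0, -1, -3:, -3:]
    return bool(paddle.allclose(output_slice.flatten(), expected_slice, atol=atol))


paddle.seed(0)
layer = paddle.nn.Conv2D(32, 32, kernel_size=3, padding=1)  # stand-in, not ResnetBlock2D
sample = paddle.randn(shape=[1, 32, 64, 64])
with paddle.no_grad():
    reference = layer(sample)[0, -1, -3:, -3:].flatten()
assert check_output_slice(layer, sample, reference)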
diff --git a/ppdiffusers/tests/models/test_lora_layers.py b/ppdiffusers/tests/models/test_lora_layers.py
index 14c192e1e5ea8..97335fe48e3b5 100644
--- a/ppdiffusers/tests/models/test_lora_layers.py
+++ b/ppdiffusers/tests/models/test_lora_layers.py
@@ -20,8 +20,12 @@
import paddle.nn as nn
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, StableDiffusionPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin
from ppdiffusers.models.attention_processor import LoRAAttnProcessor
from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, floats_tensor
@@ -30,19 +34,16 @@
def create_unet_lora_layers(unet: nn.Layer):
lora_attn_procs = {}
for name in unet.attn_processors.keys():
- cross_attention_dim = (None if name.endswith("attn1.processor") else
- unet.config.cross_attention_dim)
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(unet.config.block_out_channels))[
- block_id]
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
- lora_attn_procs[name] = LoRAAttnProcessor(
- hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+ lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
unet_lora_layers = AttnProcsLayers(lora_attn_procs)
return lora_attn_procs, unet_lora_layers
@@ -52,8 +53,8 @@ def create_text_encoder_lora_layers(text_encoder: nn.Layer):
for name, module in text_encoder.named_sublayers(include_self=True):
if name.endswith(TEXT_ENCODER_ATTN_MODULE):
text_lora_attn_procs[name] = LoRAAttnProcessor(
- hidden_size=module.out_proj.weight.shape[1],
- cross_attention_dim=None)
+ hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None
+ )
text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
return text_encoder_lora_layers
@@ -70,14 +71,16 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -85,7 +88,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
@@ -95,11 +99,11 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config)
text_encoder.eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder)
@@ -128,11 +132,7 @@ def get_dummy_inputs(self):
generator = paddle.Generator().manual_seed(0)
noise = floats_tensor((batch_size, num_channels) + sizes)
- input_ids = paddle.randint(
- 1,
- sequence_length,
- size=(batch_size, sequence_length),
- generator=generator)
+ input_ids = paddle.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
pipeline_inputs = {
"prompt": "A painting of a squirrel eating a burger",
@@ -158,22 +158,17 @@ def test_lora_save_load(self):
LoraLoaderMixin.save_lora_weights(
save_directory=tmpdirname,
unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components[
- "text_encoder_lora_layers"],
- to_diffusers=False, )
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ to_diffusers=False,
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False)
lora_images = sd_pipe(**pipeline_inputs).images
lora_image_slice = lora_images[0, -3:, -3:, -1]
# Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- paddle.to_tensor(orig_image_slice),
- paddle.to_tensor(lora_image_slice)))
+ self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice)))
def test_lora_save_load_safetensors(self):
pipeline_components, lora_components = self.get_dummy_components()
@@ -189,24 +184,18 @@ def test_lora_save_load_safetensors(self):
LoraLoaderMixin.save_lora_weights(
save_directory=tmpdirname,
unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components[
- "text_encoder_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
safe_serialization=True,
- to_diffusers=True, )
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname,
- "pytorch_lora_weights.safetensors")))
+ to_diffusers=True,
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
sd_pipe.load_lora_weights(tmpdirname, from_diffusers=True)
lora_images = sd_pipe(**pipeline_inputs).images
lora_image_slice = lora_images[0, -3:, -3:, -1]
# Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- paddle.to_tensor(orig_image_slice),
- paddle.to_tensor(lora_image_slice)))
+ self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice)))
def test_lora_save_load_legacy(self):
pipeline_components, lora_components = self.get_dummy_components()
@@ -223,16 +212,11 @@ def test_lora_save_load_legacy(self):
unet = sd_pipe.unet
unet.set_attn_processor(unet_lora_attn_procs)
unet.save_attn_procs(tmpdirname, to_diffusers=False)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False)
lora_images = sd_pipe(**pipeline_inputs).images
lora_image_slice = lora_images[0, -3:, -3:, -1]
# Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- paddle.to_tensor(orig_image_slice),
- paddle.to_tensor(lora_image_slice)))
+ self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice)))
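[Editor's note, not part of the patch] The LoRA test helpers above derive the hidden size for each LoRAAttnProcessor from the attention-processor name. A small sketch of that mapping, lifted from the reformatted create_unet_lora_layers / create_lora_layers code and reduced to a plain function; block_out_channels here is a dummy two-entry config matching the tiny test UNet.

block_out_channels = (32, 64)  # dummy config, mirrors the tests' tiny UNet


def hidden_size_for(name: str) -> int:
    # "mid_block..." uses the deepest width, "up_blocks.N..." indexes the
    # reversed channel list, "down_blocks.N..." indexes it directly.
    if name.startswith("mid_block"):
        return block_out_channels[-1]
    if name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        return list(reversed(block_out_channels))[block_id]
    if name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        return block_out_channels[block_id]
    raise ValueError(f"unexpected processor name: {name}")


assert hidden_size_for("down_blocks.0.attentions.0.attn1.processor") == 32
assert hidden_size_for("up_blocks.1.attentions.0.attn2.processor") == 32
assert hidden_size_for("mid_block.attentions.0.attn1.processor") == 64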
diff --git a/ppdiffusers/tests/models/test_modeling_common.py b/ppdiffusers/tests/models/test_modeling_common.py
index 2224b1d99e300..8780b3abc746b 100644
--- a/ppdiffusers/tests/models/test_modeling_common.py
+++ b/ppdiffusers/tests/models/test_modeling_common.py
@@ -45,12 +45,14 @@ def test_cached_files_are_used_when_no_internet(self):
response_mock.raise_for_status.side_effect = HTTPError
response_mock.json.return_value = {}
orig_model = UNet2DConditionModel.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet")
+ "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet"
+ )
with mock.patch("requests.request", return_value=response_mock):
model = UNet2DConditionModel.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch",
subfolder="unet",
- local_files_only=True, )
+ local_files_only=True,
+ )
for p1, p2 in zip(orig_model.parameters(), model.parameters()):
if (p1 != p2).cast("int64").sum() > 0:
assert False, "Parameters not the same!"
@@ -67,13 +69,12 @@ def test_one_request_upon_cached(self):
subfolder="unet",
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
download_requests = [r.method for r in m.request_history]
- assert (download_requests.count("HEAD") == 2
- ), "2 HEAD requests one for config, one for model"
- assert (download_requests.count("GET") == 2
- ), "2 GET requests one for config, one for model"
+ assert download_requests.count("HEAD") == 2, "2 HEAD requests one for config, one for model"
+ assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model"
with requests_mock.mock(real_http=True) as m:
UNet2DConditionModel.from_pretrained(
@@ -81,7 +82,8 @@ def test_one_request_upon_cached(self):
subfolder="unet",
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
cache_requests = [r.method for r in m.request_history]
# TODO check this
@@ -92,15 +94,15 @@ def test_one_request_upon_cached(self):
ppdiffusers.utils.import_utils._safetensors_available = True
def test_weight_overwrite(self):
- with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(
- RuntimeError) as error_context:
+ with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(RuntimeError) as error_context:
UNet2DConditionModel.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch",
subfolder="unet",
cache_dir=tmpdirname,
in_channels=9,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
# make sure that error message states what keys are missing
assert "size mismatch" in str(error_context.exception)
@@ -114,7 +116,8 @@ def test_weight_overwrite(self):
low_cpu_mem_usage=False,
ignore_mismatched_sizes=True,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
assert model.config.in_channels == 9
@@ -139,8 +142,7 @@ def test_from_save_pretrained(self):
if isinstance(new_image, dict):
new_image = new_image.sample
max_diff = (image - new_image).abs().sum().item()
- self.assertLessEqual(max_diff, 5e-05,
- "Models give different forward passes")
+ self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes")
def test_getattr_is_correct(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -183,10 +185,7 @@ def test_getattr_is_correct(self):
with self.assertRaises(AttributeError) as error:
model.does_not_exist
- assert (
- str(error.exception) ==
- f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
- )
+ assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
def test_from_save_pretrained_variant(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -196,8 +195,7 @@ def test_from_save_pretrained_variant(self):
model.eval()
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, variant="fp16")
- new_model = self.model_class.from_pretrained(
- tmpdirname, variant="fp16")
+ new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
if hasattr(new_model, "set_default_attn_processor"):
new_model.set_default_attn_processor()
# non-variant cannot be loaded
@@ -208,8 +206,7 @@ def test_from_save_pretrained_variant(self):
# support diffusion_pytorch_model.bin and model_state.pdparams
assert "Error no file named model_state.pdparams found in directory" in str(
error_context.exception
- ) or "Error no file named diffusion_pytorch_model.bin found in directory" in str(
- error_context.exception)
+ ) or "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
with paddle.no_grad():
image = model(**inputs_dict)
@@ -219,8 +216,7 @@ def test_from_save_pretrained_variant(self):
if isinstance(new_image, dict):
new_image = new_image.sample
max_diff = (image - new_image).abs().sum().item()
- self.assertLessEqual(max_diff, 5e-05,
- "Models give different forward passes")
+ self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes")
def test_from_save_pretrained_dtype(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -231,11 +227,9 @@ def test_from_save_pretrained_dtype(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model.to(dtype=dtype)
model.save_pretrained(tmpdirname)
- new_model = self.model_class.from_pretrained(
- tmpdirname, paddle_dtype=dtype)
+ new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype)
assert new_model.dtype == dtype
- new_model = self.model_class.from_pretrained(
- tmpdirname, paddle_dtype=dtype)
+ new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype)
assert new_model.dtype == dtype
def test_determinism(self):
@@ -266,8 +260,7 @@ def test_output(self):
output = output.sample
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_forward_with_norm_groups(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -281,8 +274,7 @@ def test_forward_with_norm_groups(self):
output = output.sample
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_forward_signature(self):
init_dict, _ = self.prepare_init_args_and_inputs_for_common()
@@ -320,8 +312,7 @@ def test_training(self):
output = model(**inputs_dict)
if isinstance(output, dict):
output = output.sample
- noise = paddle.randn(
- shape=list((inputs_dict["sample"].shape[0], ) + self.output_shape))
+ noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape))
loss = paddle.nn.functional.mse_loss(input=output, label=noise)
loss.backward()
@@ -333,8 +324,7 @@ def test_ema_training(self):
output = model(**inputs_dict)
if isinstance(output, dict):
output = output.sample
- noise = paddle.randn(
- shape=list((inputs_dict["sample"].shape[0], ) + self.output_shape))
+ noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape))
loss = paddle.nn.functional.mse_loss(input=output, label=noise)
loss.backward()
ema_model.step(model.parameters())
@@ -346,12 +336,10 @@ def set_nan_tensor_to_zero(t):
def recursive_check(tuple_object, dict_object):
if isinstance(tuple_object, (List, Tuple)):
- for tuple_iterable_value, dict_iterable_value in zip(
- tuple_object, dict_object.values()):
+ for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
recursive_check(tuple_iterable_value, dict_iterable_value)
elif isinstance(tuple_object, Dict):
- for tuple_iterable_value, dict_iterable_value in zip(
- tuple_object.values(), dict_object.values()):
+ for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
recursive_check(tuple_iterable_value, dict_iterable_value)
elif tuple_object is None:
return
@@ -360,7 +348,8 @@ def recursive_check(tuple_object, dict_object):
paddle.allclose(
set_nan_tensor_to_zero(tuple_object),
set_nan_tensor_to_zero(dict_object),
- atol=1e-05, ),
+ atol=1e-05,
+ ),
msg=f"Tuple and dict output are not equal. Difference: {paddle.max(x=paddle.abs(x=tuple_object - dict_object))}. Tuple has `nan`: {paddle.isnan(x=tuple_object).any()} and `inf`: {paddle.isinf(x=tuple_object)}. Dict has `nan`: {paddle.isnan(x=dict_object).any()} and `inf`: {paddle.isinf(x=dict_object)}.",
)
@@ -384,8 +373,7 @@ def test_enable_disable_gradient_checkpointing(self):
self.assertFalse(model.is_gradient_checkpointing)
def test_deprecated_kwargs(self):
- has_kwarg_in_model_class = (
- "kwargs" in inspect.signature(self.model_class.__init__).parameters)
+ has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
if has_kwarg_in_model_class and not has_deprecated_kwarg:
raise ValueError(
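[Editor's note, not part of the patch] test_from_save_pretrained and its variants check that a model reloaded from disk reproduces the original forward pass within 5e-05. A minimal sketch of that round-trip check, assuming only paddle.save / paddle.load on a plain Linear layer rather than the ModelMixin save_pretrained / from_pretrained machinery used in the tests.

import tempfile

import paddle

paddle.seed(0)
layer = paddle.nn.Linear(8, 4)
sample = paddle.randn(shape=[2, 8])

with tempfile.TemporaryDirectory() as tmpdirname:
    path = f"{tmpdirname}/weights.pdparams"
    paddle.save(layer.state_dict(), path)      # save the weights
    reloaded = paddle.nn.Linear(8, 4)
    reloaded.set_state_dict(paddle.load(path))  # reload into a fresh layer

with paddle.no_grad():
    max_diff = (layer(sample) - reloaded(sample)).abs().sum().item()
# Identical weights must give numerically identical forward passes.
assert max_diff <= 5e-05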
diff --git a/ppdiffusers/tests/models/test_models_unet_1d.py b/ppdiffusers/tests/models/test_models_unet_1d.py
index 8ff48ee303f86..8d1339ed5c4dc 100644
--- a/ppdiffusers/tests/models/test_models_unet_1d.py
+++ b/ppdiffusers/tests/models/test_models_unet_1d.py
@@ -79,9 +79,9 @@ def prepare_init_args_and_inputs_for_common(self):
"DownResnetBlock1D",
"DownResnetBlock1D",
"DownResnetBlock1D",
- "DownResnetBlock1D", ),
- "up_block_types":
- ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"),
+ "DownResnetBlock1D",
+ ),
+ "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"),
"act_fn": "mish",
}
inputs_dict = self.dummy_input
@@ -91,38 +91,37 @@ def test_from_pretrained_hub(self):
model, loading_info = UNet1DModel.from_pretrained(
"bglick13/hopper-medium-v2-value-function-hor32",
output_loading_info=True,
- subfolder="unet", )
+ subfolder="unet",
+ )
self.assertIsNotNone(model)
self.assertEqual(len(loading_info["missing_keys"]), 0)
image = model(**self.dummy_input)
assert image is not None, "Make sure output is not None"
def test_output_pretrained(self):
- model = UNet1DModel.from_pretrained(
- "bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet")
+ model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet")
paddle.seed(0)
num_features = model.config.in_channels
seq_len = 16
- noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(
- perm=[0, 2, 1])
- time_step = paddle.full(shape=(num_features, ), fill_value=0)
+ noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1])
+ time_step = paddle.full(shape=(num_features,), fill_value=0)
with paddle.no_grad():
output = model(noise, time_step).sample.permute(0, 2, 1)
output_slice = output[0, -3:, -3:].flatten()
- expected_output_slice = paddle.to_tensor([
- -0.2857576608657837,
- -0.9908187389373779,
- 0.2976357340812683,
- -0.8677187561988831,
- -0.21778395771980286,
- 0.08095654845237732,
- -0.5871752500534058,
- 0.3299727439880371,
- -0.17421625554561615,
- ])
- self.assertTrue(
- paddle.allclose(
- output_slice, expected_output_slice, rtol=0.001))
+ expected_output_slice = paddle.to_tensor(
+ [
+ -0.2857576608657837,
+ -0.9908187389373779,
+ 0.2976357340812683,
+ -0.8677187561988831,
+ -0.21778395771980286,
+ 0.08095654845237732,
+ -0.5871752500534058,
+ 0.3299727439880371,
+ -0.17421625554561615,
+ ]
+ )
+ self.assertTrue(paddle.allclose(output_slice, expected_output_slice, rtol=0.001))
def test_forward_with_norm_groups(self):
pass
@@ -133,9 +132,9 @@ def test_unet_1d_maestro(self):
model_id = "harmonai/maestro-150k"
model = UNet1DModel.from_pretrained(model_id, subfolder="unet")
sample_size = 65536
- noise = paddle.sin(x=paddle.arange(
- start=sample_size,
- dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1]))
+ noise = paddle.sin(
+ x=paddle.arange(start=sample_size, dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1])
+ )
timestep = paddle.to_tensor([1.0]) # must cast float32
with paddle.no_grad():
output = model(noise, timestep).sample
@@ -187,8 +186,7 @@ def test_output(self):
output = output.sample
self.assertIsNotNone(output)
expected_shape = [inputs_dict["sample"].shape[0], 1]
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_ema_training(self):
pass
@@ -225,7 +223,8 @@ def test_from_pretrained_hub(self):
value_function, vf_loading_info = UNet1DModel.from_pretrained(
"bglick13/hopper-medium-v2-value-function-hor32",
output_loading_info=True,
- subfolder="value_function", )
+ subfolder="value_function",
+ )
self.assertIsNotNone(value_function)
self.assertEqual(len(vf_loading_info["missing_keys"]), 0)
image = value_function(**self.dummy_input)
@@ -235,19 +234,17 @@ def test_output_pretrained(self):
value_function, vf_loading_info = UNet1DModel.from_pretrained(
"bglick13/hopper-medium-v2-value-function-hor32",
output_loading_info=True,
- subfolder="value_function", )
+ subfolder="value_function",
+ )
paddle.seed(0)
num_features = value_function.config.in_channels
seq_len = 14
- noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(
- perm=[0, 2, 1])
- time_step = paddle.full(shape=(num_features, ), fill_value=0)
+ noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1])
+ time_step = paddle.full(shape=(num_features,), fill_value=0)
with paddle.no_grad():
output = value_function(noise, time_step).sample
expected_output_slice = paddle.to_tensor([291.51135254] * seq_len)
- self.assertTrue(
- paddle.allclose(
- output.squeeze(-1), expected_output_slice, rtol=0.001))
+ self.assertTrue(paddle.allclose(output.squeeze(-1), expected_output_slice, rtol=0.001))
def test_forward_with_norm_groups(self):
pass
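[Editor's note, not part of the patch] The UNet1D tests above build their dummy input by drawing a (batch, seq_len, channels) sample, transposing it to (batch, channels, seq_len), and pairing it with a constant timestep tensor. A short, self-contained sketch of that input construction, with the shapes taken from the value-function test.

import paddle

paddle.seed(0)
num_features, seq_len = 14, 16
noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1])
time_step = paddle.full(shape=(num_features,), fill_value=0)

assert tuple(noise.shape) == (1, num_features, seq_len)
assert tuple(time_step.shape) == (num_features,)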
diff --git a/ppdiffusers/tests/models/test_models_unet_2d.py b/ppdiffusers/tests/models/test_models_unet_2d.py
index 6473ab0323f19..15147e00742e8 100644
--- a/ppdiffusers/tests/models/test_models_unet_2d.py
+++ b/ppdiffusers/tests/models/test_models_unet_2d.py
@@ -97,22 +97,19 @@ def prepare_init_args_and_inputs_for_common(self):
return init_dict, inputs_dict
def test_from_pretrained_hub(self):
- model, loading_info = UNet2DModel.from_pretrained(
- "fusing/unet-ldm-dummy-update", output_loading_info=True)
+ model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
self.assertIsNotNone(model)
self.assertEqual(len(loading_info["missing_keys"]), 0)
image = model(**self.dummy_input).sample
assert image is not None, "Make sure output is not None"
def test_from_pretrained_accelerate(self):
- model, _ = UNet2DModel.from_pretrained(
- "fusing/unet-ldm-dummy-update", output_loading_info=True)
+ model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
image = model(**self.dummy_input).sample
assert image is not None, "Make sure output is not None"
def test_from_pretrained_accelerate_wont_change_results(self):
- model_accelerate, _ = UNet2DModel.from_pretrained(
- "fusing/unet-ldm-dummy-update", output_loading_info=True)
+ model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
model_accelerate
model_accelerate.eval()
noise = paddle.randn(
@@ -122,7 +119,8 @@ def test_from_pretrained_accelerate_wont_change_results(self):
model_accelerate.config.sample_size,
model_accelerate.config.sample_size,
],
- generator=paddle.Generator().manual_seed(0), )
+ generator=paddle.Generator().manual_seed(0),
+ )
time_step = paddle.to_tensor([10] * noise.shape[0])
arr_accelerate = model_accelerate(noise, time_step)["sample"]
del model_accelerate
@@ -130,7 +128,8 @@ def test_from_pretrained_accelerate_wont_change_results(self):
gc.collect()
model_normal_load, _ = UNet2DModel.from_pretrained(
"fusing/unet-ldm-dummy-update",
- output_loading_info=True, )
+ output_loading_info=True,
+ )
model_normal_load.eval()
arr_normal_load = model_normal_load(noise, time_step)["sample"]
assert paddle_all_close(arr_accelerate, arr_normal_load, rtol=0.001)
@@ -145,25 +144,26 @@ def test_output_pretrained(self):
model.config.sample_size,
model.config.sample_size,
],
- generator=paddle.Generator().manual_seed(0), )
+ generator=paddle.Generator().manual_seed(0),
+ )
time_step = paddle.to_tensor([10] * noise.shape[0])
with paddle.no_grad():
output = model(noise, time_step).sample
output_slice = output[0, -1, -3:, -3:].flatten().cpu()
- expected_output_slice = paddle.to_tensor([
- 0.43855608,
- -10.29346752,
- -9.60953522,
- -8.39902020,
- -16.29206276,
- -13.07511997,
- -9.30383205,
- -13.69859409,
- -10.52999401,
- ])
- self.assertTrue(
- paddle_all_close(
- output_slice, expected_output_slice, rtol=0.001))
+ expected_output_slice = paddle.to_tensor(
+ [
+ 0.43855608,
+ -10.29346752,
+ -9.60953522,
+ -8.39902020,
+ -16.29206276,
+ -13.07511997,
+ -9.30383205,
+ -13.69859409,
+ -10.52999401,
+ ]
+ )
+ self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.001))
class NCSNppModelTests(ModelTesterMixin, unittest.TestCase):
@@ -213,8 +213,7 @@ def prepare_init_args_and_inputs_for_common(self):
@slow
def test_from_pretrained_hub(self):
- model, loading_info = UNet2DModel.from_pretrained(
- "google/ncsnpp-celebahq-256", output_loading_info=True)
+ model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True)
self.assertIsNotNone(model)
self.assertEqual(len(loading_info["missing_keys"]), 0)
inputs = self.dummy_input
@@ -235,24 +234,23 @@ def test_output_pretrained_ve_mid(self):
with paddle.no_grad():
output = model(noise, time_step).sample
output_slice = output[0, -3:, -3:, -1].flatten().cpu()
- expected_output_slice = paddle.to_tensor([
- -4836.2231,
- -6487.1387,
- -3816.7969,
- -7964.9253,
- -10966.2842,
- -20043.6016,
- 8137.0571,
- 2340.3499,
- 544.6114,
- ])
- self.assertTrue(
- paddle_all_close(
- output_slice, expected_output_slice, rtol=0.01))
+ expected_output_slice = paddle.to_tensor(
+ [
+ -4836.2231,
+ -6487.1387,
+ -3816.7969,
+ -7964.9253,
+ -10966.2842,
+ -20043.6016,
+ 8137.0571,
+ 2340.3499,
+ 544.6114,
+ ]
+ )
+ self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01))
def test_output_pretrained_ve_large(self):
- model = UNet2DModel.from_pretrained(
- "fusing/ncsnpp-ffhq-ve-dummy-update")
+ model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update")
paddle.seed(0)
batch_size = 4
num_channels = 3
@@ -262,13 +260,10 @@ def test_output_pretrained_ve_large(self):
with paddle.no_grad():
output = model(noise, time_step).sample
output_slice = output[0, -3:, -3:, -1].flatten().cpu()
- expected_output_slice = paddle.to_tensor([
- -0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227,
- 0.0256
- ])
- self.assertTrue(
- paddle_all_close(
- output_slice, expected_output_slice, rtol=0.01))
+ expected_output_slice = paddle.to_tensor(
+ [-0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227, 0.0256]
+ )
+ self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01))
def test_forward_with_norm_groups(self):
pass
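[Editor's note, not part of the patch] The UNet2DConditionModel LoRA tests that follow rely on a "mock trained weights" trick: the up-projection weights of each freshly created LoRA processor are bumped by 1 so that forward passes with scale=0.0 and scale=0.5 are guaranteed to differ. A tiny sketch of that trick on a bare Linear layer, standing in for a LoRA up projection; the 1e-4 threshold matches the tests' comparisons.

import paddle

lora_up = paddle.nn.Linear(4, 8, bias_attr=False)  # stand-in for a LoRA up projection
x = paddle.ones(shape=[2, 4])

with paddle.no_grad():
    before = lora_up(x)
    lora_up.weight.set_value(lora_up.weight + 1)  # mimic "trained" weights
    after = lora_up(x)

# The perturbed weights must change the output measurably.
assert float((before - after).abs().max()) > 1e-4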
diff --git a/ppdiffusers/tests/models/test_models_unet_2d_condition.py b/ppdiffusers/tests/models/test_models_unet_2d_condition.py
index 085837f1fb0dd..6b9930399b0c3 100644
--- a/ppdiffusers/tests/models/test_models_unet_2d_condition.py
+++ b/ppdiffusers/tests/models/test_models_unet_2d_condition.py
@@ -24,9 +24,17 @@
from ppdiffusers import UNet2DConditionModel
from ppdiffusers.models.attention_processor import (
- CustomDiffusionAttnProcessor, LoRAAttnProcessor)
-from ppdiffusers.utils import (floats_tensor, load_ppnlp_numpy, logging,
- paddle_all_close, require_paddle_gpu, slow)
+ CustomDiffusionAttnProcessor,
+ LoRAAttnProcessor,
+)
+from ppdiffusers.utils import (
+ floats_tensor,
+ load_ppnlp_numpy,
+ logging,
+ paddle_all_close,
+ require_paddle_gpu,
+ slow,
+)
from ppdiffusers.utils.import_utils import is_ppxformers_available
from .test_modeling_common import ModelTesterMixin
@@ -34,50 +42,41 @@
logger = logging.get_logger(__name__)
-def create_lora_layers(model, mock_weights: bool=True):
+def create_lora_layers(model, mock_weights: bool = True):
lora_attn_procs = {}
for name in model.attn_processors.keys():
- cross_attention_dim = (None if name.endswith("attn1.processor") else
- model.config.cross_attention_dim)
+ cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = model.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(model.config.block_out_channels))[
- block_id]
+ hidden_size = list(reversed(model.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = model.config.block_out_channels[block_id]
- lora_attn_procs[name] = LoRAAttnProcessor(
- hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+ lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
if mock_weights:
with paddle.no_grad():
- lora_attn_procs[name].to_q_lora.up.weight.set_value(
- lora_attn_procs[name].to_q_lora.up.weight + 1)
- lora_attn_procs[name].to_k_lora.up.weight.set_value(
- lora_attn_procs[name].to_k_lora.up.weight + 1)
- lora_attn_procs[name].to_v_lora.up.weight.set_value(
- lora_attn_procs[name].to_v_lora.up.weight + 1)
- lora_attn_procs[name].to_out_lora.up.weight.set_value(
- lora_attn_procs[name].to_out_lora.up.weight + 1)
+ lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
+ lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
+ lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
+ lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
return lora_attn_procs
-def create_custom_ppdiffusion_layers(model, mock_weights: bool=True):
+def create_custom_ppdiffusion_layers(model, mock_weights: bool = True):
train_kv = True
train_q_out = True
custom_diffusion_attn_procs = {}
st = model.state_dict()
for name, _ in model.attn_processors.items():
- cross_attention_dim = (None if name.endswith("attn1.processor") else
- model.config.cross_attention_dim)
+ cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = model.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(model.config.block_out_channels))[
- block_id]
+ hidden_size = list(reversed(model.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = model.config.block_out_channels[block_id]
@@ -87,36 +86,33 @@ def create_custom_ppdiffusion_layers(model, mock_weights: bool=True):
"to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"],
}
if train_q_out:
- weights["to_q_custom_diffusion.weight"] = st[layer_name +
- ".to_q.weight"]
- weights["to_out_custom_diffusion.0.weight"] = st[layer_name +
- ".to_out.0.weight"]
- weights["to_out_custom_diffusion.0.bias"] = st[layer_name +
- ".to_out.0.bias"]
+ weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
+ weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
+ weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"]
if cross_attention_dim is not None:
custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
train_kv=train_kv,
train_q_out=train_q_out,
hidden_size=hidden_size,
- cross_attention_dim=cross_attention_dim, )
+ cross_attention_dim=cross_attention_dim,
+ )
custom_diffusion_attn_procs[name].load_dict(weights)
if mock_weights:
# add 1 to weights to mock trained weights
with paddle.no_grad():
- custom_diffusion_attn_procs[
- name].to_k_custom_diffusion.weight.set_value(
- custom_diffusion_attn_procs[
- name].to_k_custom_diffusion.weight + 1)
- custom_diffusion_attn_procs[
- name].to_v_custom_diffusion.weight.set_value(
- custom_diffusion_attn_procs[
- name].to_v_custom_diffusion.weight + 1)
+ custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight.set_value(
+ custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight + 1
+ )
+ custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight.set_value(
+ custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight + 1
+ )
else:
custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
train_kv=False,
train_q_out=False,
hidden_size=hidden_size,
- cross_attention_dim=cross_attention_dim, )
+ cross_attention_dim=cross_attention_dim,
+ )
del st
return custom_diffusion_attn_procs
@@ -165,9 +161,10 @@ def test_xformers_enable_works(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict)
model.enable_xformers_memory_efficient_attention()
- assert (model.mid_block.attentions[0].transformer_blocks[0]
- .attn1.processor.__class__.__name__ == "XFormersAttnProcessor"
- ), "xformers is not enabled"
+ assert (
+ model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+ == "XFormersAttnProcessor"
+ ), "xformers is not enabled"
def test_gradient_checkpointing(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -190,9 +187,7 @@ def test_gradient_checkpointing(self):
named_params = dict(model.named_parameters())
named_params_2 = dict(model_2.named_parameters())
for name, param in named_params.items():
- self.assertTrue(
- paddle_all_close(
- param.grad, named_params_2[name].grad, atol=5e-05))
+ self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-05))
def test_model_with_attention_head_dim_tuple(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -205,8 +200,7 @@ def test_model_with_attention_head_dim_tuple(self):
output = output.sample
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_model_with_use_linear_projection(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -219,8 +213,7 @@ def test_model_with_use_linear_projection(self):
output = output.sample
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_model_with_cross_attention_dim_tuple(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -238,8 +231,7 @@ def test_model_with_cross_attention_dim_tuple(self):
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_model_with_simple_projection(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -262,8 +254,7 @@ def test_model_with_simple_projection(self):
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_model_with_class_embeddings_concat(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -287,8 +278,7 @@ def test_model_with_class_embeddings_concat(self):
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
def test_model_attention_slicing(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -327,34 +317,32 @@ class AttnEasyProc(nn.Layer):
def __init__(self, num):
super().__init__()
self.weight = self.create_parameter(
- (1, ),
+ (1,),
dtype=paddle.get_default_dtype(),
- default_initializer=nn.initializer.Constant(num), )
+ default_initializer=nn.initializer.Constant(num),
+ )
self.is_run = False
self.number = 0
self.counter = 0
def __call__(
- self,
- attn,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- number=None, ):
+ self,
+ attn,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ number=None,
+ ):
batch_size, sequence_length, _ = hidden_states.shape
- attention_mask = attn.prepare_attention_mask(
- attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
- encoder_hidden_states = (encoder_hidden_states
- if encoder_hidden_states is not None
- else hidden_states)
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
query = attn.head_to_batch_dim(query)
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
- attention_probs = attn.get_attention_scores(query, key,
- attention_mask)
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
hidden_states = paddle.matmul(attention_probs, value)
hidden_states = attn.batch_to_head_dim(hidden_states)
hidden_states = attn.to_out[0](hidden_states)
@@ -385,12 +373,9 @@ def test_lora_processors(self):
model.set_attn_processor(lora_attn_procs)
model.set_attn_processor(model.attn_processors)
with paddle.no_grad():
- sample2 = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
- sample3 = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- sample4 = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
assert (sample1 - sample2).abs().max() < 0.0001
assert (sample3 - sample4).abs().max() < 0.0001
assert (sample2 - sample3).abs().max() > 0.0001
@@ -405,20 +390,16 @@ def test_lora_save_load(self):
lora_attn_procs = create_lora_layers(model)
model.set_attn_processor(lora_attn_procs)
with paddle.no_grad():
- sample = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_attn_procs(tmpdirname, to_diffusers=False)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
new_model.load_attn_procs(tmpdirname, from_diffusers=False)
with paddle.no_grad():
- new_sample = new_model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
assert (sample - new_sample).abs().max() < 1e-4
@@ -441,23 +422,16 @@ def test_lora_save_load_safetensors(self):
model.set_attn_processor(lora_attn_procs)
with paddle.no_grad():
- sample = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_attn_procs(
- tmpdirname, safe_serialization=True, to_diffusers=True)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname,
- "pytorch_lora_weights.safetensors")))
+ model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
- new_model.load_attn_procs(
- tmpdirname, from_diffusers=True, use_safetensors=True)
+ new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True)
with paddle.no_grad():
- new_sample = new_model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
assert (sample - new_sample).abs().max() < 0.0001
assert (sample - old_sample).abs().max() > 0.0001
@@ -475,16 +449,15 @@ def test_lora_save_safetensors_load_torch(self):
# Saving as torch, properly reloads with directly filename
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_attn_procs(tmpdirname, to_diffusers=True)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
new_model.load_attn_procs(
tmpdirname,
weight_name="pytorch_lora_weights.bin",
from_diffusers=True,
- use_safetensors=False, )
+ use_safetensors=False,
+ )
def test_lora_save_torch_force_load_safetensors_error(self):
pass
@@ -499,8 +472,7 @@ def test_lora_on_off(self):
lora_attn_procs = create_lora_layers(model)
model.set_attn_processor(lora_attn_procs)
with paddle.no_grad():
- sample = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
model.set_default_attn_processor()
with paddle.no_grad():
new_sample = model(**inputs_dict).sample
@@ -538,8 +510,7 @@ def test_custom_diffusion_processors(self):
with paddle.no_grad():
sample1 = model(**inputs_dict).sample
- custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(
- model, mock_weights=False)
+ custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False)
# make sure we can set a list of attention processors
model.set_attn_processor(custom_diffusion_attn_procs)
@@ -564,8 +535,7 @@ def test_custom_diffusion_save_load(self):
with paddle.no_grad():
old_sample = model(**inputs_dict).sample
- custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(
- model, mock_weights=False)
+ custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False)
model.set_attn_processor(custom_diffusion_attn_procs)
with paddle.no_grad():
@@ -573,16 +543,14 @@ def test_custom_diffusion_save_load(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_attn_procs(tmpdirname, to_diffusers=False)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname,
- "paddle_custom_diffusion_weights.pdparams")))
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_custom_diffusion_weights.pdparams")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
new_model.load_attn_procs(
tmpdirname,
weight_name="paddle_custom_diffusion_weights.pdparams",
- from_diffusers=False, )
+ from_diffusers=False,
+ )
with paddle.no_grad():
new_sample = new_model(**inputs_dict).sample
@@ -604,8 +572,7 @@ def test_custom_diffusion_xformers_on_off(self):
paddle.seed(0)
model = self.model_class(**init_dict)
- custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(
- model, mock_weights=False)
+ custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False)
model.set_attn_processor(custom_diffusion_attn_procs)
# default
@@ -634,20 +601,15 @@ def tearDown(self):
def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
dtype = paddle.float16 if fp16 else paddle.float32
- image = paddle.to_tensor(data=load_ppnlp_numpy(
- self.get_file_format(seed, shape))).cast(dtype)
+ image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype)
return image
- def get_unet_model(self,
- fp16=False,
- model_id="CompVis/stable-diffusion-v1-4"):
+ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
revision = "fp16" if fp16 else None
paddle_dtype = paddle.float16 if fp16 else paddle.float32
model = UNet2DConditionModel.from_pretrained(
- model_id,
- subfolder="unet",
- paddle_dtype=paddle_dtype,
- revision=revision)
+ model_id, subfolder="unet", paddle_dtype=paddle_dtype, revision=revision
+ )
model.eval()
return model
@@ -659,10 +621,7 @@ def test_set_attention_slice_auto(self):
encoder_hidden_states = self.get_encoder_hidden_states(33)
timestep = 1
with paddle.no_grad():
- _ = unet(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = paddle.device.cuda.memory_allocated()
assert mem_bytes < 5 * 10**9
@@ -674,10 +633,7 @@ def test_set_attention_slice_max(self):
encoder_hidden_states = self.get_encoder_hidden_states(33)
timestep = 1
with paddle.no_grad():
- _ = unet(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = paddle.device.cuda.memory_allocated()
assert mem_bytes < 5 * 10**9
@@ -689,10 +645,7 @@ def test_set_attention_slice_int(self):
encoder_hidden_states = self.get_encoder_hidden_states(33)
timestep = 1
with paddle.no_grad():
- _ = unet(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = paddle.device.cuda.memory_allocated()
assert mem_bytes < 5 * 10**9
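For context, these memory checks exercise attention slicing, which trades a little speed for lower peak GPU memory. A rough sketch of the same measurement outside the test class, assuming ppdiffusers keeps diffusers' `set_attention_slice` API:

import paddle
from ppdiffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
unet.eval()
unet.set_attention_slice("auto")  # assumption: also accepts "max", an int, or a per-block list

latents = paddle.randn([4, 4, 64, 64])
encoder_hidden_states = paddle.randn([4, 77, 768])
with paddle.no_grad():
    _ = unet(latents, timestep=1, encoder_hidden_states=encoder_hidden_states).sample
print(paddle.device.cuda.memory_allocated())  # the tests above expect this to stay below 5 GB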
@@ -705,49 +658,35 @@ def test_set_attention_slice_list(self):
encoder_hidden_states = self.get_encoder_hidden_states(33)
timestep = 1
with paddle.no_grad():
- _ = unet(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = paddle.device.cuda.memory_allocated()
assert mem_bytes < 5 * 10**9
def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
dtype = "float16" if fp16 else "float32"
- hidden_states = paddle.to_tensor(data=load_ppnlp_numpy(
- self.get_file_format(seed, shape))).cast(dtype)
+ hidden_states = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype)
return hidden_states
- @parameterized.expand([
- [
- 33, 4,
- [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435]
- ],
+ @parameterized.expand(
[
- 47,
- 0.55,
+ [33, 4, [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435]],
[
- -0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719,
- -0.0207
+ 47,
+ 0.55,
+ [-0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719, -0.0207],
],
- ],
- [
- 21,
- 0.89,
[
- -0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091,
- 0.1778
+ 21,
+ 0.89,
+ [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778],
],
- ],
- [
- 9,
- 1000,
[
- 0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241,
- -0.4424
+ 9,
+ 1000,
+ [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4")
@@ -755,93 +694,69 @@ def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
encoder_hidden_states = self.get_encoder_hidden_states(seed)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == latents.shape
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
+ @parameterized.expand(
[
- 83,
- 4,
[
- -0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125,
- -0.5806
+ 83,
+ 4,
+ [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806],
],
- ],
- [
- 17,
- 0.55,
[
- -0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743,
- 0.0701
+ 17,
+ 0.55,
+ [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701],
],
- ],
- [
- 8,
- 0.89,
[
- -0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839,
- 0.4639
+ 8,
+ 0.89,
+ [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639],
],
- ],
- [
- 3,
- 1000,
[
- -0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325,
- -1.0078
+ 3,
+ 1000,
+ [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
- model = self.get_unet_model(
- model_id="CompVis/stable-diffusion-v1-4", fp16=True)
+ model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True)
latents = self.get_latents(seed, fp16=True)
encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == latents.shape
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.005)
- @parameterized.expand([
- [
- 33, 4,
- [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]
- ],
+ @parameterized.expand(
[
- 47,
- 0.55,
+ [33, 4, [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]],
[
- -0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114,
- -0.0436
+ 47,
+ 0.55,
+ [-0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114, -0.0436],
],
- ],
- [
- 21,
- 0.89,
- [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175],
- ],
- [
- 9,
- 1000,
[
- 0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192,
- -0.4423
+ 21,
+ 0.89,
+ [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175],
],
- ],
- ])
+ [
+ 9,
+ 1000,
+ [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423],
+ ],
+ ]
+ )
@require_paddle_gpu
def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5")
@@ -849,199 +764,151 @@ def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
encoder_hidden_states = self.get_encoder_hidden_states(seed)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == latents.shape
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
+ @parameterized.expand(
[
- 83,
- 4,
[
- -0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395,
- -0.5972
+ 83,
+ 4,
+ [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972],
],
- ],
- [
- 17,
- 0.55,
[
- -0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669,
- 0.0322
+ 17,
+ 0.55,
+ [-0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322],
+ ],
+ [
+ 8,
+ 0.89,
+ [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319],
],
- ],
- [
- 8,
- 0.89,
- [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319],
- ],
- [
- 3,
- 1000,
[
- -0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.028,
- -1.002
+ 3,
+ 1000,
+ [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.028, -1.002],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
- model = self.get_unet_model(
- model_id="runwayml/stable-diffusion-v1-5", fp16=True)
+ model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True)
latents = self.get_latents(seed, fp16=True)
encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == latents.shape
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.005)
- @parameterized.expand([
+ @parameterized.expand(
[
- 33,
- 4,
[
- -0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085,
- -0.4858
+ 33,
+ 4,
+ [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858],
],
- ],
- [
- 47,
- 0.55,
[
- -0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169,
- 0.9073
+ 47,
+ 0.55,
+ [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073],
+ ],
+ [
+ 21,
+ 0.89,
+ [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043],
],
- ],
- [
- 21,
- 0.89,
- [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043],
- ],
- [
- 9,
- 1000,
[
- 0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149,
- -1.8931
+ 9,
+ 1000,
+ [0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149, -1.8931],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
- model = self.get_unet_model(
- model_id="runwayml/stable-diffusion-inpainting")
+ model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting")
latents = self.get_latents(seed, shape=(4, 9, 64, 64))
encoder_hidden_states = self.get_encoder_hidden_states(seed)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == [4, 4, 64, 64]
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
+ @parameterized.expand(
[
- 83,
- 4,
[
- -0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388,
- 1.1387
+ 83,
+ 4,
+ [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387],
],
- ],
- [
- 17,
- 0.55,
- [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026],
- ],
- [
- 8,
- 0.89,
[
- -0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395,
- -0.3486
+ 17,
+ 0.55,
+ [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026],
],
- ],
- [
- 3,
- 1000,
[
- 0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105,
- -0.9741
+ 8,
+ 0.89,
+ [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486],
],
- ],
- ])
+ [
+ 3,
+ 1000,
+ [0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741],
+ ],
+ ]
+ )
@require_paddle_gpu
def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice):
- model = self.get_unet_model(
- model_id="runwayml/stable-diffusion-inpainting", fp16=True)
+ model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True)
latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True)
encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == [4, 4, 64, 64]
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.005)
- @parameterized.expand([
- [
- 83, 4,
- [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231]
- ],
- [
- 17,
- 0.55,
- [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458],
- ],
+ @parameterized.expand(
[
- 8,
- 0.89,
+ [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231]],
[
- -0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996,
- 0.2139
+ 17,
+ 0.55,
+ [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458],
+ ],
+ [
+ 8,
+ 0.89,
+ [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139],
],
- ],
- [
- 3,
- 1000,
[
- 0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234,
- -0.0539
+ 3,
+ 1000,
+ [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234, -0.0539],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice):
- model = self.get_unet_model(
- model_id="stabilityai/stable-diffusion-2", fp16=True)
+ model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True)
latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True)
- encoder_hidden_states = self.get_encoder_hidden_states(
- seed, shape=(4, 77, 1024), fp16=True)
+ encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True)
timestep = paddle.to_tensor([timestep], dtype="int64")
with paddle.no_grad():
- sample = model(
- latents,
- timestep=timestep,
- encoder_hidden_states=encoder_hidden_states).sample
+ sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
assert sample.shape == latents.shape
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
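All of the CompVis/Stability regression tests above follow one pattern: run a single denoising step and compare a fixed output slice against reference values recorded from a known-good run. A condensed, hedged version of that pattern:

import paddle
from ppdiffusers import UNet2DConditionModel
from ppdiffusers.utils import paddle_all_close

def check_unet_slice(model_id, latents, encoder_hidden_states, timestep, expected_slice, atol=0.01):
    # Load the UNet, run one forward pass, and compare the last few output values to a reference slice.
    model = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
    model.eval()
    t = paddle.to_tensor([timestep], dtype="int64")
    with paddle.no_grad():
        sample = model(latents, timestep=t, encoder_hidden_states=encoder_hidden_states).sample
    output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
    return paddle_all_close(output_slice, paddle.to_tensor(expected_slice), atol=atol)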
diff --git a/ppdiffusers/tests/models/test_models_unet_3d_condition.py b/ppdiffusers/tests/models/test_models_unet_3d_condition.py
index 12479b35ac6f0..ca2f44b1edd9f 100644
--- a/ppdiffusers/tests/models/test_models_unet_3d_condition.py
+++ b/ppdiffusers/tests/models/test_models_unet_3d_condition.py
@@ -20,8 +20,7 @@
import paddle
from ppdiffusers.models import UNet3DConditionModel
-from ppdiffusers.models.attention_processor import (AttnProcessor,
- LoRAAttnProcessor)
+from ppdiffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor
from ppdiffusers.utils import floats_tensor, logging
from ppdiffusers.utils.import_utils import is_ppxformers_available
@@ -30,20 +29,18 @@
logger = logging.get_logger(__name__)
-def create_lora_layers(model, mock_weights: bool=True):
+def create_lora_layers(model, mock_weights: bool = True):
lora_attn_procs = {}
for name in model.attn_processors.keys():
has_cross_attention = name.endswith("attn2.processor") and not (
- name.startswith("transformer_in") or
- "temp_attentions" in name.split("."))
- cross_attention_dim = (model.config.cross_attention_dim
- if has_cross_attention else None)
+ name.startswith("transformer_in") or "temp_attentions" in name.split(".")
+ )
+ cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None
if name.startswith("mid_block"):
hidden_size = model.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(model.config.block_out_channels))[
- block_id]
+ hidden_size = list(reversed(model.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = model.config.block_out_channels[block_id]
@@ -51,20 +48,15 @@ def create_lora_layers(model, mock_weights: bool=True):
# Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148
hidden_size = 8 * model.config.attention_head_dim
- lora_attn_procs[name] = LoRAAttnProcessor(
- hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+ lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
if mock_weights:
# add 1 to weights to mock trained weights
with paddle.no_grad():
- lora_attn_procs[name].to_q_lora.up.weight.set_value(
- lora_attn_procs[name].to_q_lora.up.weight + 1)
- lora_attn_procs[name].to_k_lora.up.weight.set_value(
- lora_attn_procs[name].to_k_lora.up.weight + 1)
- lora_attn_procs[name].to_v_lora.up.weight.set_value(
- lora_attn_procs[name].to_v_lora.up.weight + 1)
- lora_attn_procs[name].to_out_lora.up.weight.set_value(
- lora_attn_procs[name].to_out_lora.up.weight + 1)
+ lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
+ lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
+ lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
+ lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
return lora_attn_procs
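A hedged usage sketch for the helper above: build one LoRA processor per attention module of a 3D UNet and attach the whole map in a single call. The checkpoint id below is only a placeholder, not a tested model:

from ppdiffusers.models import UNet3DConditionModel

unet = UNet3DConditionModel.from_pretrained("your-org/your-text-to-video-model", subfolder="unet")  # hypothetical checkpoint
lora_procs = create_lora_layers(unet, mock_weights=False)
unet.set_attn_processor(lora_procs)
print(f"replaced {len(lora_procs)} attention processors")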
@@ -99,7 +91,8 @@ def prepare_init_args_and_inputs_for_common(self):
"block_out_channels": (32, 64),
"down_block_types": (
"CrossAttnDownBlock3D",
- "DownBlock3D", ),
+ "DownBlock3D",
+ ),
"up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"),
"cross_attention_dim": 32,
"attention_head_dim": 8,
@@ -121,9 +114,10 @@ def test_xformers_enable_works(self):
model.enable_xformers_memory_efficient_attention()
- assert (model.mid_block.attentions[0].transformer_blocks[0]
- .attn1.processor.__class__.__name__ == "XFormersAttnProcessor"
- ), "xformers is not enabled"
+ assert (
+ model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+ == "XFormersAttnProcessor"
+ ), "xformers is not enabled"
# Overriding to set `norm_num_groups` needs to be different for this model.
def test_forward_with_norm_groups(self):
@@ -140,8 +134,7 @@ def test_forward_with_norm_groups(self):
output = output.sample
self.assertIsNotNone(output)
expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape,
- "Input and output shapes do not match")
+ self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
# Overriding since the UNet3D outputs a different structure.
def test_determinism(self):
@@ -199,12 +192,9 @@ def test_lora_processors(self):
model.set_attn_processor(model.attn_processors)
with paddle.no_grad():
- sample2 = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
- sample3 = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- sample4 = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
assert (sample1 - sample2).abs().max() < 1e-4
assert (sample3 - sample4).abs().max() < 1e-4
@@ -227,23 +217,20 @@ def test_lora_save_load(self):
model.set_attn_processor(lora_attn_procs)
with paddle.no_grad():
- sample = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_attn_procs(
tmpdirname,
- to_diffusers=False, )
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+ to_diffusers=False,
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
new_model.load_attn_procs(tmpdirname, from_diffusers=False)
with paddle.no_grad():
- new_sample = new_model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
assert (sample - new_sample).abs().max() < 1e-4
@@ -265,24 +252,17 @@ def test_lora_save_load_safetensors(self):
model.set_attn_processor(lora_attn_procs)
with paddle.no_grad():
- sample = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_attn_procs(
- tmpdirname, safe_serialization=True, to_diffusers=True)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname,
- "pytorch_lora_weights.safetensors")))
+ model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
- new_model.load_attn_procs(
- tmpdirname, use_safetensors=True, from_diffusers=True)
+ new_model.load_attn_procs(tmpdirname, use_safetensors=True, from_diffusers=True)
with paddle.no_grad():
- new_sample = new_model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
assert (sample - new_sample).abs().max() < 1e-4
@@ -303,16 +283,15 @@ def test_lora_save_safetensors_load_torch(self):
# Saving as torch; properly reloads directly from the filename
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_attn_procs(tmpdirname, to_diffusers=True)
- self.assertTrue(
- os.path.isfile(
- os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
paddle.seed(0)
new_model = self.model_class(**init_dict)
new_model.load_attn_procs(
tmpdirname,
weight_name="pytorch_lora_weights.bin",
use_safetensors=False,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
def test_lora_save_paddle_force_load_safetensors_error(self):
pass
@@ -332,8 +311,7 @@ def test_lora_on_off(self):
model.set_attn_processor(lora_attn_procs)
with paddle.no_grad():
- sample = model(
- **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
model.set_attn_processor(AttnProcessor())
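The on/off test above relies on processor swapping being non-destructive: installing `AttnProcessor()` everywhere restores the base behaviour without touching the UNet weights. A hedged sketch of that toggle:

import paddle
from ppdiffusers.models.attention_processor import AttnProcessor

def compare_lora_on_off(model, inputs_dict):
    # With scale=0.0 the LoRA path is inert, so the output should match the plain attention processors.
    with paddle.no_grad():
        lora_off = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
    model.set_attn_processor(AttnProcessor())  # reset every attention module to the default processor
    with paddle.no_grad():
        plain = model(**inputs_dict).sample
    return float((lora_off - plain).abs().max())  # expected to be ~0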
diff --git a/ppdiffusers/tests/models/test_models_vae.py b/ppdiffusers/tests/models/test_models_vae.py
index 8cc3c0794fbd8..c385339e1b134 100644
--- a/ppdiffusers/tests/models/test_models_vae.py
+++ b/ppdiffusers/tests/models/test_models_vae.py
@@ -20,8 +20,13 @@
from parameterized import parameterized
from ppdiffusers import AutoencoderKL
-from ppdiffusers.utils import (floats_tensor, load_ppnlp_numpy,
- paddle_all_close, require_paddle_gpu, slow)
+from ppdiffusers.utils import (
+ floats_tensor,
+ load_ppnlp_numpy,
+ paddle_all_close,
+ require_paddle_gpu,
+ slow,
+)
from .test_modeling_common import ModelTesterMixin
@@ -100,13 +105,10 @@ def test_gradient_checkpointing(self):
named_params_2 = dict(model_2.named_parameters())
with paddle.no_grad():
for name, param in named_params.items():
- self.assertTrue(
- paddle_all_close(
- param.grad, named_params_2[name].grad, atol=5e-5))
+ self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-5))
def test_from_pretrained_hub(self):
- model, loading_info = AutoencoderKL.from_pretrained(
- "fusing/autoencoder-kl-dummy", output_loading_info=True)
+ model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True)
self.assertIsNotNone(model)
self.assertEqual(len(loading_info["missing_keys"]), 0)
image = model(**self.dummy_input)
@@ -124,25 +126,25 @@ def test_output_pretrained(self):
model.config.sample_size,
model.config.sample_size,
],
- generator=paddle.Generator().manual_seed(0), )
+ generator=paddle.Generator().manual_seed(0),
+ )
with paddle.no_grad():
- output = model(
- image, sample_posterior=True, generator=generator).sample
+ output = model(image, sample_posterior=True, generator=generator).sample
output_slice = output[0, -1, -3:, -3:].flatten().cpu()
- expected_output_slice = paddle.to_tensor([
- -0.39049336,
- 0.34836933,
- 0.27105471,
- -0.02148458,
- 0.00975929,
- 0.27822807,
- -0.12224892,
- -0.02011922,
- 0.19761699,
- ])
- self.assertTrue(
- paddle_all_close(
- output_slice, expected_output_slice, rtol=0.01))
+ expected_output_slice = paddle.to_tensor(
+ [
+ -0.39049336,
+ 0.34836933,
+ 0.27105471,
+ -0.02148458,
+ 0.00975929,
+ 0.27822807,
+ -0.12224892,
+ -0.02011922,
+ 0.19761699,
+ ]
+ )
+ self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01))
@slow
@@ -157,115 +159,77 @@ def tearDown(self):
def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
dtype = paddle.float16 if fp16 else paddle.float32
- image = paddle.to_tensor(data=load_ppnlp_numpy(
- self.get_file_format(seed, shape))).cast(dtype)
+ image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype)
return image
- def get_sd_vae_model(self,
- model_id="CompVis/stable-diffusion-v1-4",
- fp16=False):
+ def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):
revision = "fp16" if fp16 else None
paddle_dtype = paddle.float16 if fp16 else paddle.float32
- model = AutoencoderKL.from_pretrained(
- model_id,
- subfolder="vae",
- paddle_dtype=paddle_dtype,
- revision=revision)
+ model = AutoencoderKL.from_pretrained(model_id, subfolder="vae", paddle_dtype=paddle_dtype, revision=revision)
model.eval()
return model
def get_generator(self, seed=0):
return paddle.Generator().manual_seed(seed)
- @parameterized.expand([
- [
- 33,
- [
- -0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206,
- -0.0824
- ],
- [
- -0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718,
- -0.1824
- ],
- ],
+ @parameterized.expand(
[
- 47,
[
- -0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493,
- -0.4089
+ 33,
+ [-0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206, -0.0824],
+ [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824],
],
[
- 0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633,
- -0.1131
+ 47,
+ [-0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493, -0.4089],
+ [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131],
],
- ],
- ])
+ ]
+ )
def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):
model = self.get_sd_vae_model()
image = self.get_sd_image(seed)
generator = self.get_generator(seed)
with paddle.no_grad():
- sample = model(
- image, generator=generator, sample_posterior=True).sample
+ sample = model(image, generator=generator, sample_posterior=True).sample
assert sample.shape == image.shape
output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
- [
- 33, [
- -0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103,
- -0.0999
- ]
- ],
+ @parameterized.expand(
[
- 47, [
- -0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334,
- 0.2247
- ]
- ],
- ])
+ [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]],
+ [47, [-0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334, 0.2247]],
+ ]
+ )
@require_paddle_gpu
def test_stable_diffusion_fp16(self, seed, expected_slice):
model = self.get_sd_vae_model(fp16=True)
image = self.get_sd_image(seed, fp16=True)
generator = self.get_generator(seed)
with paddle.no_grad():
- sample = model(
- image, generator=generator, sample_posterior=True).sample
+ sample = model(image, generator=generator, sample_posterior=True).sample
assert sample.shape == image.shape
output_slice = sample[-1, -2:, :2, -2:].flatten().cast("float32").cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
+ @parameterized.expand(
[
- 33,
[
- -0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055,
- -0.0814
+ 33,
+ [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814],
+ [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824],
],
[
- -0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718,
- -0.1824
+ 47,
+ [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085],
+ [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131],
],
- ],
- [
- 47,
- [
- -0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491,
- -0.4085
- ],
- [
- 0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633,
- -0.1131
- ],
- ],
- ])
- def test_stable_diffusion_mode(self, seed, expected_slice,
- expected_slice_mps):
+ ]
+ )
+ def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps):
model = self.get_sd_vae_model()
image = self.get_sd_image(seed)
with paddle.no_grad():
@@ -275,28 +239,27 @@ def test_stable_diffusion_mode(self, seed, expected_slice,
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
+ @parameterized.expand(
[
- 13,
[
- -0.2051,
- -0.1803,
- -0.2311,
- -0.2114,
- -0.3292,
- -0.3574,
- -0.2953,
- -0.3323,
+ 13,
+ [
+ -0.2051,
+ -0.1803,
+ -0.2311,
+ -0.2114,
+ -0.3292,
+ -0.3574,
+ -0.2953,
+ -0.3323,
+ ],
],
- ],
- [
- 37,
[
- -0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372,
- -0.4925
+ 37,
+ [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372, -0.4925],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_stable_diffusion_decode(self, seed, expected_slice):
model = self.get_sd_vae_model()
@@ -308,28 +271,27 @@ def test_stable_diffusion_decode(self, seed, expected_slice):
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.01)
- @parameterized.expand([
+ @parameterized.expand(
[
- 27,
[
- -0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465,
- -0.2039
+ 27,
+ [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465, -0.2039],
],
- ],
- [
- 16,
[
- -0.1628,
- -0.2134,
- -0.2747,
- -0.2642,
- -0.3774,
- -0.4404,
- -0.3687,
- -0.4277,
+ 16,
+ [
+ -0.1628,
+ -0.2134,
+ -0.2747,
+ -0.2642,
+ -0.3774,
+ -0.4404,
+ -0.3687,
+ -0.4277,
+ ],
],
- ],
- ])
+ ]
+ )
@require_paddle_gpu
def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
model = self.get_sd_vae_model(fp16=True)
@@ -341,7 +303,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
expected_output_slice = paddle.to_tensor(expected_slice)
assert paddle_all_close(output_slice, expected_output_slice, atol=0.005)
- @parameterized.expand([(13, ), (16, ), (27, )])
+ @parameterized.expand([(13,), (16,), (27,)])
@require_paddle_gpu
def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed):
model = self.get_sd_vae_model(fp16=True)
@@ -358,7 +320,7 @@ def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed):
assert paddle_all_close(sample, sample_2, atol=1e-1)
- @parameterized.expand([(13, ), (16, ), (37, )])
+ @parameterized.expand([(13,), (16,), (37,)])
@require_paddle_gpu
def test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed):
model = self.get_sd_vae_model()
@@ -375,36 +337,38 @@ def test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed):
assert paddle_all_close(sample, sample_2, atol=1e-2)
- @parameterized.expand([
+ @parameterized.expand(
[
- 33,
[
- -0.3001,
- 0.0918,
- -2.6984,
- -3.972,
- -3.2099,
- -5.0353,
- 1.7338,
- -0.2065,
- 3.4267,
+ 33,
+ [
+ -0.3001,
+ 0.0918,
+ -2.6984,
+ -3.972,
+ -3.2099,
+ -5.0353,
+ 1.7338,
+ -0.2065,
+ 3.4267,
+ ],
],
- ],
- [
- 47,
[
- -1.503,
- -4.3871,
- -6.0355,
- -9.1157,
- -1.6661,
- -2.7853,
- 2.1607,
- -5.0823,
- 2.5633,
+ 47,
+ [
+ -1.503,
+ -4.3871,
+ -6.0355,
+ -9.1157,
+ -1.6661,
+ -2.7853,
+ 2.1607,
+ -5.0823,
+ 2.5633,
+ ],
],
- ],
- ])
+ ]
+ )
def test_stable_diffusion_encode_sample(self, seed, expected_slice):
model = self.get_sd_vae_model()
image = self.get_sd_image(seed)
@@ -412,11 +376,8 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice):
with paddle.no_grad():
dist = model.encode(image).latent_dist
sample = dist.sample(generator=generator)
- assert list(sample.shape) == [image.shape[0], 4] + [
- (i // 8) for i in image.shape[2:]
- ]
+ assert list(sample.shape) == [image.shape[0], 4] + [(i // 8) for i in image.shape[2:]]
output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
expected_output_slice = paddle.to_tensor(expected_slice)
tolerance = 0.01
- assert paddle_all_close(
- output_slice, expected_output_slice, atol=tolerance)
+ assert paddle_all_close(output_slice, expected_output_slice, atol=tolerance)
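For reference, the encode tests above boil down to one invariant of the Stable Diffusion VAE: a 512x512 RGB batch encodes to a 4-channel latent at 1/8 spatial resolution. A minimal sketch:

import paddle
from ppdiffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
vae.eval()
image = paddle.randn([1, 3, 512, 512])
with paddle.no_grad():
    dist = vae.encode(image).latent_dist
    latents = dist.sample(generator=paddle.Generator().manual_seed(0))
assert list(latents.shape) == [1, 4, 64, 64]  # channels=4, spatial dims divided by 8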
diff --git a/ppdiffusers/tests/models/test_models_vq.py b/ppdiffusers/tests/models/test_models_vq.py
index 9b19455a496b6..af2a6292d9353 100644
--- a/ppdiffusers/tests/models/test_models_vq.py
+++ b/ppdiffusers/tests/models/test_models_vq.py
@@ -60,8 +60,7 @@ def test_training(self):
pass
def test_from_pretrained_hub(self):
- model, loading_info = VQModel.from_pretrained(
- "fusing/vqgan-dummy", output_loading_info=True)
+ model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True)
self.assertIsNotNone(model)
self.assertEqual(len(loading_info["missing_keys"]), 0)
image = model(**self.dummy_input)
@@ -71,26 +70,28 @@ def test_output_pretrained(self):
model = VQModel.from_pretrained("fusing/vqgan-dummy")
model.eval()
paddle.seed(0)
- image = paddle.randn(shape=[
- 1,
- model.config.in_channels,
- model.config.sample_size,
- model.config.sample_size,
- ])
+ image = paddle.randn(
+ shape=[
+ 1,
+ model.config.in_channels,
+ model.config.sample_size,
+ model.config.sample_size,
+ ]
+ )
with paddle.no_grad():
output = model(image).sample
output_slice = output[0, -1, -3:, -3:].flatten().cpu()
- expected_output_slice = paddle.to_tensor([
- -0.027147896587848663,
- -0.41129639744758606,
- -0.17730756103992462,
- -0.5245445370674133,
- -0.2423611730337143,
- -0.3957087993621826,
- -0.16461530327796936,
- -0.06902074813842773,
- -0.01736617460846901,
- ])
- self.assertTrue(
- paddle.allclose(
- output_slice, expected_output_slice, atol=0.01))
+ expected_output_slice = paddle.to_tensor(
+ [
+ -0.027147896587848663,
+ -0.41129639744758606,
+ -0.17730756103992462,
+ -0.5245445370674133,
+ -0.2423611730337143,
+ -0.3957087993621826,
+ -0.16461530327796936,
+ -0.06902074813842773,
+ -0.01736617460846901,
+ ]
+ )
+ self.assertTrue(paddle.allclose(output_slice, expected_output_slice, atol=0.01))
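The VQ test above checks a full forward pass through the dummy VQ-GAN. A hedged sketch of the same reconstruction call, assuming `VQModel` is exported from the package root as in diffusers:

import paddle
from ppdiffusers import VQModel  # assumption: top-level export, mirroring diffusers

vq = VQModel.from_pretrained("fusing/vqgan-dummy")
vq.eval()
paddle.seed(0)
image = paddle.randn([1, vq.config.in_channels, vq.config.sample_size, vq.config.sample_size])
with paddle.no_grad():
    recon = vq(image).sample  # encode, quantize against the codebook, then decode
print(recon.shape)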
diff --git a/ppdiffusers/tests/models/test_unet_2d_blocks.py b/ppdiffusers/tests/models/test_unet_2d_blocks.py
index df1fdae9f4acf..cfb2100ee38ba 100644
--- a/ppdiffusers/tests/models/test_unet_2d_blocks.py
+++ b/ppdiffusers/tests/models/test_unet_2d_blocks.py
@@ -16,13 +16,28 @@
import unittest
from ppdiffusers.models.unet_2d_blocks import (
- AttnDownBlock2D, AttnDownEncoderBlock2D, AttnSkipDownBlock2D,
- AttnSkipUpBlock2D, AttnUpBlock2D, AttnUpDecoderBlock2D,
- CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, DownEncoderBlock2D,
- ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, SimpleCrossAttnDownBlock2D,
- SimpleCrossAttnUpBlock2D, SkipDownBlock2D, SkipUpBlock2D, UNetMidBlock2D,
- UNetMidBlock2DCrossAttn, UNetMidBlock2DSimpleCrossAttn, UpBlock2D,
- UpDecoderBlock2D)
+ AttnDownBlock2D,
+ AttnDownEncoderBlock2D,
+ AttnSkipDownBlock2D,
+ AttnSkipUpBlock2D,
+ AttnUpBlock2D,
+ AttnUpDecoderBlock2D,
+ CrossAttnDownBlock2D,
+ CrossAttnUpBlock2D,
+ DownBlock2D,
+ DownEncoderBlock2D,
+ ResnetDownsampleBlock2D,
+ ResnetUpsampleBlock2D,
+ SimpleCrossAttnDownBlock2D,
+ SimpleCrossAttnUpBlock2D,
+ SkipDownBlock2D,
+ SkipUpBlock2D,
+ UNetMidBlock2D,
+ UNetMidBlock2DCrossAttn,
+ UNetMidBlock2DSimpleCrossAttn,
+ UpBlock2D,
+ UpDecoderBlock2D,
+)
from .test_unet_blocks_common import UNetBlockTesterMixin
@@ -89,8 +104,7 @@ class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
block_type = "down"
def prepare_init_args_and_inputs_for_common(self):
- init_dict, inputs_dict = super(
- ).prepare_init_args_and_inputs_for_common()
+ init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
init_dict["cross_attention_dim"] = 32
return init_dict, inputs_dict
@@ -118,8 +132,7 @@ def dummy_input(self):
return super().get_dummy_input(include_encoder_hidden_states=True)
def prepare_init_args_and_inputs_for_common(self):
- init_dict, inputs_dict = super(
- ).prepare_init_args_and_inputs_for_common()
+ init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
init_dict["cross_attention_dim"] = 32
return init_dict, inputs_dict
@@ -269,8 +282,7 @@ class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
block_type = "mid"
def prepare_init_args_and_inputs_for_common(self):
- init_dict, inputs_dict = super(
- ).prepare_init_args_and_inputs_for_common()
+ init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
init_dict["cross_attention_dim"] = 32
return init_dict, inputs_dict
@@ -289,8 +301,7 @@ def test_output(self):
super().test_output(expected_slice)
-class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin,
- unittest.TestCase):
+class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
block_class = UNetMidBlock2DSimpleCrossAttn
block_type = "mid"
@@ -299,8 +310,7 @@ def dummy_input(self):
return super().get_dummy_input(include_encoder_hidden_states=True)
def prepare_init_args_and_inputs_for_common(self):
- init_dict, inputs_dict = super(
- ).prepare_init_args_and_inputs_for_common()
+ init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
init_dict["cross_attention_dim"] = 32
return init_dict, inputs_dict
@@ -374,8 +384,7 @@ def dummy_input(self):
return super().get_dummy_input(include_res_hidden_states_tuple=True)
def prepare_init_args_and_inputs_for_common(self):
- init_dict, inputs_dict = super(
- ).prepare_init_args_and_inputs_for_common()
+ init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
init_dict["cross_attention_dim"] = 32
return init_dict, inputs_dict
@@ -400,13 +409,10 @@ class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
@property
def dummy_input(self):
- return super().get_dummy_input(
- include_res_hidden_states_tuple=True,
- include_encoder_hidden_states=True)
+ return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True)
def prepare_init_args_and_inputs_for_common(self):
- init_dict, inputs_dict = super(
- ).prepare_init_args_and_inputs_for_common()
+ init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
init_dict["cross_attention_dim"] = 32
return init_dict, inputs_dict
diff --git a/ppdiffusers/tests/models/test_unet_blocks_common.py b/ppdiffusers/tests/models/test_unet_blocks_common.py
index 9f0920c87ef10..4595f43aec64d 100644
--- a/ppdiffusers/tests/models/test_unet_blocks_common.py
+++ b/ppdiffusers/tests/models/test_unet_blocks_common.py
@@ -35,16 +35,15 @@ def output_shape(self):
return 4, 32, 32, 32
elif self.block_type == "up":
return 4, 32, 64, 64
- raise ValueError(
- f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'."
- )
+ raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.")
def get_dummy_input(
- self,
- include_temb=True,
- include_res_hidden_states_tuple=False,
- include_encoder_hidden_states=False,
- include_skip_sample=False, ):
+ self,
+ include_temb=True,
+ include_res_hidden_states_tuple=False,
+ include_encoder_hidden_states=False,
+ include_skip_sample=False,
+ ):
batch_size = 4
num_channels = 32
sizes = 32, 32
@@ -54,28 +53,20 @@ def get_dummy_input(
dummy_input = {"hidden_states": hidden_states}
if include_temb:
temb_channels = 128
- dummy_input["temb"] = randn_tensor(
- (batch_size, temb_channels), generator=generator)
+ dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator)
if include_res_hidden_states_tuple:
generator_1 = paddle.Generator().manual_seed(1)
- dummy_input["res_hidden_states_tuple"] = (randn_tensor(
- shape, generator=generator_1), )
+ dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1),)
if include_encoder_hidden_states:
- dummy_input["encoder_hidden_states"] = floats_tensor(
- (batch_size, 32, 32))
+ dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32))
if include_skip_sample:
- dummy_input["skip_sample"] = randn_tensor(
- (batch_size, 3) + sizes, generator=generator)
+ dummy_input["skip_sample"] = randn_tensor((batch_size, 3) + sizes, generator=generator)
paddle.seed(0)
return dummy_input
def prepare_init_args_and_inputs_for_common(self):
- init_dict = {
- "in_channels": 32,
- "out_channels": 32,
- "temb_channels": 128
- }
+ init_dict = {"in_channels": 32, "out_channels": 32, "temb_channels": 128}
if self.block_type == "up":
init_dict["prev_output_channel"] = 32
if self.block_type == "mid":
@@ -94,8 +85,7 @@ def test_output(self, expected_slice):
self.assertEqual(list(output.shape), list(self.output_shape))
output_slice = output[0, -1, -3:, -3:]
expected_slice = paddle.to_tensor(expected_slice)
- assert paddle_all_close(
- output_slice.flatten(), expected_slice, atol=0.005)
+ assert paddle_all_close(output_slice.flatten(), expected_slice, atol=0.005)
def test_training(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
diff --git a/ppdiffusers/tests/others/test_config.py b/ppdiffusers/tests/others/test_config.py
index 171d2ea28e771..e4637ce2c35a3 100644
--- a/ppdiffusers/tests/others/test_config.py
+++ b/ppdiffusers/tests/others/test_config.py
@@ -16,10 +16,15 @@
import tempfile
import unittest
-from ppdiffusers import (DDIMScheduler, DDPMScheduler,
- DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler, PNDMScheduler, logging)
+from ppdiffusers import (
+ DDIMScheduler,
+ DDPMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ PNDMScheduler,
+ logging,
+)
from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
from ppdiffusers.utils.testing_utils import CaptureLogger
@@ -44,13 +49,7 @@ class SampleObject3(ConfigMixin):
config_name = "config.json"
@register_to_config
- def __init__(self,
- a=2,
- b=5,
- c=(2, 5),
- d="for diffusion",
- e=[1, 3],
- f=[1, 3]):
+ def __init__(self, a=2, b=5, c=(2, 5), d="for diffusion", e=[1, 3], f=[1, 3]):
pass
@@ -99,8 +98,7 @@ def test_save_load(self):
assert config["e"] == [1, 3]
with tempfile.TemporaryDirectory() as tmpdirname:
obj.save_config(tmpdirname)
- new_obj = SampleObject.from_config(
- SampleObject.load_config(tmpdirname))
+ new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname))
new_config = new_obj.config
config = dict(config)
new_config = dict(new_config)
@@ -114,8 +112,8 @@ def test_load_ddim_from_pndm(self):
logger.setLevel(30)
with CaptureLogger(logger) as cap_logger:
ddim = DDIMScheduler.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+ )
assert ddim.__class__ == DDIMScheduler
assert cap_logger.out == ""
@@ -125,8 +123,8 @@ def test_load_euler_from_pndm(self):
logger.setLevel(30)
with CaptureLogger(logger) as cap_logger:
euler = EulerDiscreteScheduler.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+ )
assert euler.__class__ == EulerDiscreteScheduler
assert cap_logger.out == ""
@@ -136,8 +134,8 @@ def test_load_euler_ancestral_from_pndm(self):
logger.setLevel(30)
with CaptureLogger(logger) as cap_logger:
euler = EulerAncestralDiscreteScheduler.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+ )
assert euler.__class__ == EulerAncestralDiscreteScheduler
assert cap_logger.out == ""
@@ -147,8 +145,8 @@ def test_load_pndm(self):
logger.setLevel(30)
with CaptureLogger(logger) as cap_logger:
pndm = PNDMScheduler.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+ )
assert pndm.__class__ == PNDMScheduler
assert cap_logger.out == ""
@@ -161,10 +159,10 @@ def test_overwrite_config_on_load(self):
"hf-internal-testing/tiny-stable-diffusion-torch",
subfolder="scheduler",
prediction_type="sample",
- beta_end=8, )
+ beta_end=8,
+ )
with CaptureLogger(logger) as cap_logger_2:
- ddpm_2 = DDPMScheduler.from_pretrained(
- "google/ddpm-celebahq-256", beta_start=88)
+ ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88)
assert ddpm.__class__ == DDPMScheduler
assert ddpm.config.prediction_type == "sample"
assert ddpm.config.beta_end == 8
@@ -178,7 +176,7 @@ def test_load_dpmsolver(self):
logger.setLevel(30)
with CaptureLogger(logger) as cap_logger:
dpm = DPMSolverMultistepScheduler.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+ )
assert dpm.__class__ == DPMSolverMultistepScheduler
assert cap_logger.out == ""
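The config tests above hinge on two behaviours: any scheduler class can be instantiated from another scheduler's stored config, and keyword overrides passed to `from_pretrained` are recorded on `.config`. A short sketch:

from ppdiffusers import DDIMScheduler, DDPMScheduler

# Build a DDIM scheduler from a checkpoint whose stored scheduler config is PNDM.
ddim = DDIMScheduler.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")

# Override a config field at load time; the override ends up on the resulting config.
ddpm = DDPMScheduler.from_pretrained(
    "hf-internal-testing/tiny-stable-diffusion-torch",
    subfolder="scheduler",
    prediction_type="sample",
)
assert ddpm.config.prediction_type == "sample"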
diff --git a/ppdiffusers/tests/others/test_ema.py b/ppdiffusers/tests/others/test_ema.py
index 1ed2044e555e2..e8bd66abcfbee 100644
--- a/ppdiffusers/tests/others/test_ema.py
+++ b/ppdiffusers/tests/others/test_ema.py
@@ -33,13 +33,13 @@ class EMAModelTests(unittest.TestCase):
generator = paddle.Generator().manual_seed(0)
def get_models(self, decay=0.9999):
- unet = UNet2DConditionModel.from_pretrained(
- self.model_id, subfolder="unet")
+ unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet")
ema_unet = EMAModel(
unet.parameters(),
decay=decay,
model_cls=UNet2DConditionModel,
- model_config=unet.config, )
+ model_config=unet.config,
+ )
return unet, ema_unet
def get_dummy_inputs(self):
@@ -48,21 +48,23 @@ def get_dummy_inputs(self):
self.batch_size,
self.num_in_channels,
self.latent_height,
- self.latent_width, ),
- generator=self.generator, )
- timesteps = paddle.randint(
- 0, 1000, shape=(self.batch_size, ), generator=self.generator)
+ self.latent_width,
+ ),
+ generator=self.generator,
+ )
+ timesteps = paddle.randint(0, 1000, shape=(self.batch_size,), generator=self.generator)
encoder_hidden_states = paddle.randn(
(self.batch_size, self.prompt_length, self.text_encoder_hidden_dim),
- generator=self.generator, )
+ generator=self.generator,
+ )
return noisy_latents, timesteps, encoder_hidden_states
def simulate_backprop(self, unet):
updated_state_dict = {}
for k, param in unet.state_dict().items():
- updated_param = paddle.randn(
- param.shape, dtype=param.dtype) + (param * paddle.randn(
- param.shape, dtype=param.dtype))
+ updated_param = paddle.randn(param.shape, dtype=param.dtype) + (
+ param * paddle.randn(param.shape, dtype=param.dtype)
+ )
updated_state_dict.update({k: updated_param})
unet.load_dict(updated_state_dict)
return unet
@@ -131,8 +133,7 @@ def test_consecutive_shadow_params_updated(self):
ema_unet.step(unet_step_two.parameters())
step_two_shadow_params = ema_unet.shadow_params
- for step_one, step_two in zip(step_one_shadow_params,
- step_two_shadow_params):
+ for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
assert not paddle.allclose(step_one, step_two)
def test_zero_decay(self):
@@ -148,23 +149,19 @@ def test_zero_decay(self):
ema_unet.step(unet_step_two.parameters())
step_two_shadow_params = ema_unet.shadow_params
- for step_one, step_two in zip(step_one_shadow_params,
- step_two_shadow_params):
+ for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
assert paddle.allclose(step_one, step_two)
def test_serialization(self):
unet, ema_unet = self.get_models()
- noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs(
- )
+ noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs()
with tempfile.TemporaryDirectory() as tmpdir:
ema_unet.save_pretrained(tmpdir)
- loaded_unet = UNet2DConditionModel.from_pretrained(
- tmpdir, model_cls=UNet2DConditionModel)
+ loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel)
# Since no EMA step has been performed, the outputs should match.
output = unet(noisy_latents, timesteps, encoder_hidden_states).sample
- output_loaded = loaded_unet(noisy_latents, timesteps,
- encoder_hidden_states).sample
+ output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample
assert paddle.allclose(output, output_loaded, atol=1e-4)
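The EMA tests above cover the typical training-loop usage: wrap the UNet parameters once, call `step` after each optimizer update, and persist the shadow weights. A hedged sketch, assuming `EMAModel` lives in `ppdiffusers.training_utils` as it does in diffusers; the checkpoint id is just an example:

from ppdiffusers import UNet2DConditionModel
from ppdiffusers.training_utils import EMAModel  # assumption: same module path as diffusers

unet = UNet2DConditionModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet")
ema_unet = EMAModel(unet.parameters(), decay=0.9999, model_cls=UNet2DConditionModel, model_config=unet.config)

# ... inside a training loop, after optimizer.step() ...
ema_unet.step(unet.parameters())        # move the shadow parameters toward the current weights
ema_unet.save_pretrained("ema-unet")    # hypothetical output directory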
diff --git a/ppdiffusers/tests/others/test_image_processor.py b/ppdiffusers/tests/others/test_image_processor.py
index 054fe2b955ca9..e0c88c40e56b4 100644
--- a/ppdiffusers/tests/others/test_image_processor.py
+++ b/ppdiffusers/tests/others/test_image_processor.py
@@ -50,10 +50,10 @@ def test_vae_image_processor_pd(self):
for output_type in ["pd", "np", "pil"]:
out = image_processor.postprocess(
image_processor.preprocess(input_pd),
- output_type=output_type, )
+ output_type=output_type,
+ )
out_np = self.to_np(out)
- in_np = (input_np *
- 255).round() if output_type == "pil" else input_np
+ in_np = (input_np * 255).round() if output_type == "pil" else input_np
assert (
np.abs(in_np - out_np).max() < 1e-6
), f"decoded output does not match input for output_type {output_type}"
@@ -63,12 +63,10 @@ def test_vae_image_processor_np(self):
input_np = self.dummy_sample.transpose([0, 2, 3, 1]).cpu().numpy()
for output_type in ["pd", "np", "pil"]:
- out = image_processor.postprocess(
- image_processor.preprocess(input_np), output_type=output_type)
+ out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type)
out_np = self.to_np(out)
- in_np = (input_np *
- 255).round() if output_type == "pil" else input_np
+ in_np = (input_np * 255).round() if output_type == "pil" else input_np
assert (
np.abs(in_np - out_np).max() < 1e-6
), f"decoded output does not match input for output_type {output_type}"
@@ -80,12 +78,10 @@ def test_vae_image_processor_pil(self):
input_pil = image_processor.numpy_to_pil(input_np)
for output_type in ["pd", "np", "pil"]:
- out = image_processor.postprocess(
- image_processor.preprocess(input_pil), output_type=output_type)
+ out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type)
for i, o in zip(input_pil, out):
in_np = np.array(i)
- out_np = (self.to_np(out) if output_type == "pil" else
- (self.to_np(out) * 255).round())
+ out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round()
assert (
np.abs(in_np - out_np).max() < 1e-6
), f"decoded output does not match input for output_type {output_type}"
@@ -98,20 +94,24 @@ def test_preprocess_input_3d(self):
out_pt_4d = image_processor.postprocess(
image_processor.preprocess(input_pd_4d),
- output_type="np", )
+ output_type="np",
+ )
out_pt_3d = image_processor.postprocess(
image_processor.preprocess(input_pd_3d),
- output_type="np", )
+ output_type="np",
+ )
input_np_4d = self.to_np(self.dummy_sample)
input_np_3d = input_np_4d.squeeze(0)
out_np_4d = image_processor.postprocess(
image_processor.preprocess(input_np_4d),
- output_type="np", )
+ output_type="np",
+ )
out_np_3d = image_processor.postprocess(
image_processor.preprocess(input_np_3d),
- output_type="np", )
+ output_type="np",
+ )
assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6
assert np.abs(out_np_4d - out_np_3d).max() < 1e-6
@@ -124,22 +124,26 @@ def test_preprocess_input_list(self):
out_pt_4d = image_processor.postprocess(
image_processor.preprocess(input_pd_4d),
- output_type="np", )
+ output_type="np",
+ )
out_pt_list = image_processor.postprocess(
image_processor.preprocess(input_pd_list),
- output_type="np", )
+ output_type="np",
+ )
input_np_4d = self.to_np(self.dummy_sample)
list(input_np_4d)
out_np_4d = image_processor.postprocess(
image_processor.preprocess(input_pd_4d),
- output_type="np", )
+ output_type="np",
+ )
out_np_list = image_processor.postprocess(
image_processor.preprocess(input_pd_list),
- output_type="np", )
+ output_type="np",
+ )
assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6
assert np.abs(out_np_4d - out_np_list).max() < 1e-6
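The processor tests above all assert the same round-trip property: preprocess maps images to the model's normalized NCHW tensor layout, and postprocess inverts it to within 1e-6. A hedged sketch, assuming the class is `VaeImageProcessor` under `ppdiffusers.image_processor` as in diffusers:

import numpy as np
from ppdiffusers.image_processor import VaeImageProcessor  # assumption: mirrors diffusers' module path

processor = VaeImageProcessor()
image_np = np.random.rand(1, 64, 64, 3).astype("float32")  # NHWC float image in [0, 1]
tensor = processor.preprocess(image_np)                     # NCHW paddle tensor, normalized to [-1, 1]
restored = processor.postprocess(tensor, output_type="np")  # back to NHWC numpy in [0, 1]
assert np.abs(image_np - restored).max() < 1e-6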
diff --git a/ppdiffusers/tests/others/test_training.py b/ppdiffusers/tests/others/test_training.py
index c52c0988951f2..12b72686eaed6 100644
--- a/ppdiffusers/tests/others/test_training.py
+++ b/ppdiffusers/tests/others/test_training.py
@@ -17,8 +17,7 @@
import paddle
-from ppdiffusers import (DDIMScheduler, DDPMScheduler, UNet2DConditionModel,
- UNet2DModel)
+from ppdiffusers import DDIMScheduler, DDPMScheduler, UNet2DConditionModel, UNet2DModel
from ppdiffusers.training_utils import set_seed
from ppdiffusers.utils.import_utils import is_ppxformers_available
from ppdiffusers.utils.testing_utils import slow
@@ -27,10 +26,8 @@
class UNet2DModelTrainingTests(unittest.TestCase):
def get_model_optimizer(self, resolution=32):
set_seed(0)
- model = UNet2DModel(
- sample_size=resolution, in_channels=3, out_channels=3)
- optimizer = paddle.optimizer.SGD(parameters=model.parameters(),
- learning_rate=0.0001)
+ model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3)
+ optimizer = paddle.optimizer.SGD(parameters=model.parameters(), learning_rate=0.0001)
return model, optimizer
@slow
@@ -40,34 +37,27 @@ def test_training_step_equality(self):
beta_start=0.0001,
beta_end=0.02,
beta_schedule="linear",
- clip_sample=True, )
+ clip_sample=True,
+ )
ddim_scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.0001,
beta_end=0.02,
beta_schedule="linear",
- clip_sample=True, )
- assert (ddpm_scheduler.config.num_train_timesteps ==
- ddim_scheduler.config.num_train_timesteps)
+ clip_sample=True,
+ )
+ assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps
set_seed(0)
- clean_images = [
- paddle.randn(shape=(4, 3, 32, 32)).clip(
- min=-1, max=1) for _ in range(4)
- ]
+ clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)]
noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)]
- timesteps = [
- paddle.randint(0, 1000, (4, )).astype(dtype="int64")
- for _ in range(4)
- ]
+ timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)]
model, optimizer = self.get_model_optimizer(resolution=32)
model.train()
for i in range(4):
optimizer.clear_grad()
- ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i],
- noise[i], timesteps[i])
+ ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample
- loss = paddle.nn.functional.mse_loss(
- input=ddpm_noise_pred, label=noise[i])
+ loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i])
loss.backward()
optimizer.step()
del model, optimizer
@@ -75,30 +65,22 @@ def test_training_step_equality(self):
model.train()
for i in range(4):
optimizer.clear_grad()
- ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i],
- noise[i], timesteps[i])
+ ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample
- loss = paddle.nn.functional.mse_loss(
- input=ddim_noise_pred, label=noise[i])
+ loss = paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i])
loss.backward()
optimizer.step()
del model, optimizer
- self.assertTrue(
- paddle.allclose(
- ddpm_noisy_images, ddim_noisy_images, atol=1e-05))
- self.assertTrue(
- paddle.allclose(
- ddpm_noise_pred, ddim_noise_pred, atol=1e-04))
+ self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05))
+ self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04))
# newly added
class UNet2DConditionModelTrainingTests(unittest.TestCase):
def get_model_optimizer(self, resolution=32):
set_seed(0)
- model = UNet2DConditionModel(
- sample_size=resolution, in_channels=3, out_channels=3)
- optimizer = paddle.optimizer.AdamW(
- parameters=model.parameters(), learning_rate=0.0001)
+ model = UNet2DConditionModel(sample_size=resolution, in_channels=3, out_channels=3)
+ optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=0.0001)
return model, optimizer
@slow
@@ -107,37 +89,31 @@ def test_training_step_equality(self):
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
- beta_schedule="scaled_linear", )
+ beta_schedule="scaled_linear",
+ )
ddim_scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
- beta_schedule="scaled_linear", )
- assert (ddpm_scheduler.config.num_train_timesteps ==
- ddim_scheduler.config.num_train_timesteps)
+ beta_schedule="scaled_linear",
+ )
+ assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps
set_seed(0)
- clean_images = [
- paddle.randn(shape=(4, 3, 32, 32)).clip(
- min=-1, max=1) for _ in range(4)
- ]
+ clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)]
noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)]
text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)]
- timesteps = [
- paddle.randint(0, 1000, (4, )).astype(dtype="int64")
- for _ in range(4)
- ]
+ timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)]
model, optimizer = self.get_model_optimizer(resolution=32)
model.train()
for i in range(4):
optimizer.clear_grad()
- ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i],
- noise[i], timesteps[i])
+ ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
ddpm_noise_pred = model(
ddpm_noisy_images,
timesteps[i],
- encoder_hidden_states=text_embeddings[i], ).sample
- loss = paddle.nn.functional.mse_loss(
- input=ddpm_noise_pred, label=noise[i])
+ encoder_hidden_states=text_embeddings[i],
+ ).sample
+ loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i])
loss.backward()
optimizer.step()
del model, optimizer
@@ -145,23 +121,18 @@ def test_training_step_equality(self):
model.train()
for i in range(4):
optimizer.clear_grad()
- ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i],
- noise[i], timesteps[i])
+ ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
ddim_noise_pred = model(
ddim_noisy_images,
timesteps[i],
- encoder_hidden_states=text_embeddings[i], ).sample
- loss = paddle.nn.functional.mse_loss(
- input=ddim_noise_pred, label=noise[i])
+ encoder_hidden_states=text_embeddings[i],
+ ).sample
+ loss = paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i])
loss.backward()
optimizer.step()
del model, optimizer
- self.assertTrue(
- paddle.allclose(
- ddpm_noisy_images, ddim_noisy_images, atol=1e-05))
- self.assertTrue(
- paddle.allclose(
- ddpm_noise_pred, ddim_noise_pred, atol=1e-04))
+ self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05))
+ self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04))
@unittest.skipIf(
not is_ppxformers_available(),
@@ -173,17 +144,12 @@ def test_recompute_xformers_training(self):
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
- beta_schedule="scaled_linear", )
+ beta_schedule="scaled_linear",
+ )
set_seed(0)
- clean_images = [
- paddle.randn(shape=(4, 3, 32, 32)).clip(
- min=-1, max=1) for _ in range(4)
- ]
+ clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)]
noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)]
- timesteps = [
- paddle.randint(0, 1000, (4, )).astype(dtype="int64")
- for _ in range(4)
- ]
+ timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)]
text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)]
model, optimizer = self.get_model_optimizer(resolution=32)
model.enable_gradient_checkpointing()
@@ -191,13 +157,12 @@ def test_recompute_xformers_training(self):
model.train()
for i in range(4):
optimizer.clear_grad()
- ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i],
- noise[i], timesteps[i])
+ ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
ddpm_noise_pred = model(
ddpm_noisy_images,
timesteps[i],
- encoder_hidden_states=text_embeddings[i], ).sample
- loss = paddle.nn.functional.mse_loss(
- input=ddpm_noise_pred, label=noise[i])
+ encoder_hidden_states=text_embeddings[i],
+ ).sample
+ loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i])
loss.backward()
optimizer.step()
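The equality asserted throughout these training tests holds because `DDPMScheduler.add_noise` and `DDIMScheduler.add_noise` implement the same closed-form forward process, x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, so identical beta schedules produce identical noisy samples. A condensed sketch of that check:

import paddle
from ppdiffusers import DDIMScheduler, DDPMScheduler

kwargs = dict(num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02, beta_schedule="linear")
ddpm, ddim = DDPMScheduler(**kwargs), DDIMScheduler(**kwargs)
x0 = paddle.randn(shape=(4, 3, 32, 32))
eps = paddle.randn(shape=(4, 3, 32, 32))
t = paddle.randint(0, 1000, (4,)).astype(dtype="int64")
# Same betas => same alphas_cumprod => identical noised samples from either scheduler.
assert paddle.allclose(ddpm.add_noise(x0, eps, t), ddim.add_noise(x0, eps, t), atol=1e-5)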
diff --git a/ppdiffusers/tests/others/test_utils.py b/ppdiffusers/tests/others/test_utils.py
index 870e791a6f54b..ae27388bf5f60 100644
--- a/ppdiffusers/tests/others/test_utils.py
+++ b/ppdiffusers/tests/others/test_utils.py
@@ -20,34 +20,27 @@
class DeprecateTester(unittest.TestCase):
- higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] +
- __version__.split(".")[1:])
+ higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:])
lower_version = "0.0.1"
def test_deprecate_function_arg(self):
kwargs = {"deprecated_arg": 4}
with self.assertWarns(FutureWarning) as warning:
- output = deprecate(
- "deprecated_arg",
- self.higher_version,
- "message",
- take_from=kwargs)
+ output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs)
assert output == 4
assert (
- str(warning.warning) ==
- f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warning)
+ == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message"
)
def test_deprecate_function_arg_tuple(self):
kwargs = {"deprecated_arg": 4}
with self.assertWarns(FutureWarning) as warning:
- output = deprecate(
- ("deprecated_arg", self.higher_version, "message"),
- take_from=kwargs)
+ output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs)
assert output == 4
assert (
- str(warning.warning) ==
- f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warning)
+ == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message"
)
def test_deprecate_function_args(self):
@@ -56,49 +49,48 @@ def test_deprecate_function_args(self):
output_1, output_2 = deprecate(
("deprecated_arg_1", self.higher_version, "Hey"),
("deprecated_arg_2", self.higher_version, "Hey"),
- take_from=kwargs, )
+ take_from=kwargs,
+ )
assert output_1 == 4
assert output_2 == 8
assert (
- str(warning.warnings[0].message) ==
- f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey"
+ str(warning.warnings[0].message)
+ == f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey"
)
assert (
- str(warning.warnings[1].message) ==
- f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey"
+ str(warning.warnings[1].message)
+ == f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey"
)
def test_deprecate_function_incorrect_arg(self):
kwargs = {"deprecated_arg": 4}
with self.assertRaises(TypeError) as error:
- deprecate(
- ("wrong_arg", self.higher_version, "message"), take_from=kwargs)
- assert "test_deprecate_function_incorrect_arg in" in str(
- error.exception)
+ deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs)
+ assert "test_deprecate_function_incorrect_arg in" in str(error.exception)
assert "line" in str(error.exception)
- assert "got an unexpected keyword argument `deprecated_arg`" in str(
- error.exception)
+ assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception)
def test_deprecate_arg_no_kwarg(self):
with self.assertWarns(FutureWarning) as warning:
deprecate(("deprecated_arg", self.higher_version, "message"))
assert (
- str(warning.warning) ==
- f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warning)
+ == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message"
)
def test_deprecate_args_no_kwarg(self):
with self.assertWarns(FutureWarning) as warning:
deprecate(
("deprecated_arg_1", self.higher_version, "Hey"),
- ("deprecated_arg_2", self.higher_version, "Hey"), )
+ ("deprecated_arg_2", self.higher_version, "Hey"),
+ )
assert (
- str(warning.warnings[0].message) ==
- f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey"
+ str(warning.warnings[0].message)
+ == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey"
)
assert (
- str(warning.warnings[1].message) ==
- f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey"
+ str(warning.warnings[1].message)
+ == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey"
)
def test_deprecate_class_obj(self):
@@ -106,12 +98,11 @@ class Args:
arg = 5
with self.assertWarns(FutureWarning) as warning:
- arg = deprecate(
- ("arg", self.higher_version, "message"), take_from=Args())
+ arg = deprecate(("arg", self.higher_version, "message"), take_from=Args())
assert arg == 5
assert (
- str(warning.warning) ==
- f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warning)
+ == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
)
def test_deprecate_class_objs(self):
@@ -124,45 +115,45 @@ class Args:
("arg", self.higher_version, "message"),
("foo", self.higher_version, "message"),
("does not exist", self.higher_version, "message"),
- take_from=Args(), )
+ take_from=Args(),
+ )
assert arg_1 == 5
assert arg_2 == 7
assert (
- str(warning.warning) ==
- f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warning)
+ == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
)
assert (
- str(warning.warnings[0].message) ==
- f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warnings[0].message)
+ == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
)
assert (
- str(warning.warnings[1].message) ==
- f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message"
+ str(warning.warnings[1].message)
+ == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message"
)
def test_deprecate_incorrect_version(self):
kwargs = {"deprecated_arg": 4}
with self.assertRaises(ValueError) as error:
- deprecate(
- ("wrong_arg", self.lower_version, "message"), take_from=kwargs)
+ deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs)
assert (
- str(error.exception) ==
- f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}"
+ str(error.exception)
+ == f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}"
)
def test_deprecate_incorrect_no_standard_warn(self):
with self.assertWarns(FutureWarning) as warning:
deprecate(
- ("deprecated_arg", self.higher_version,
- "This message is better!!!"),
- standard_warn=False, )
+ ("deprecated_arg", self.higher_version, "This message is better!!!"),
+ standard_warn=False,
+ )
assert str(warning.warning) == "This message is better!!!"
def test_deprecate_stacklevel(self):
with self.assertWarns(FutureWarning) as warning:
deprecate(
- ("deprecated_arg", self.higher_version,
- "This message is better!!!"),
- standard_warn=False, )
+ ("deprecated_arg", self.higher_version, "This message is better!!!"),
+ standard_warn=False,
+ )
assert str(warning.warning) == "This message is better!!!"
assert "test_utils.py" in warning.filename
diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py
index f0804e24b9b35..e49767c5a033b 100644
--- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -18,14 +18,20 @@
import numpy as np
import paddle
-from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel,
- XLMRobertaTokenizer)
+from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer
import ppdiffusers # noqa F401
-from ppdiffusers import (AltDiffusionPipeline, AutoencoderKL, DDIMScheduler,
- PNDMScheduler, UNet2DConditionModel)
+from ppdiffusers import (
+ AltDiffusionPipeline,
+ AutoencoderKL,
+ DDIMScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
- RobertaSeriesConfig, RobertaSeriesModelWithTransformation)
+ RobertaSeriesConfig,
+ RobertaSeriesModelWithTransformation,
+)
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -48,13 +54,15 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -62,7 +70,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -74,11 +83,12 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=5002, )
+ vocab_size=5002,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
tokenizer = XLMRobertaTokenizer.from_pretrained(
- "hf-internal-testing/tiny-xlm-roberta",
- model_max_length=77) # must set model_max_length 77 here
+ "hf-internal-testing/tiny-xlm-roberta", model_max_length=77
+ ) # must set model_max_length 77 here
components = {
"unet": unet,
"scheduler": scheduler,
@@ -111,9 +121,9 @@ def test_alt_diffusion_ddim(self):
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
- vocab_size=5002, )
- text_encoder = RobertaSeriesModelWithTransformation(
- text_encoder_config).eval()
+ vocab_size=5002,
+ )
+ text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval()
components["text_encoder"] = text_encoder
alt_pipe = AltDiffusionPipeline(**components)
alt_pipe.set_progress_bar_config(disable=None)
@@ -123,17 +133,19 @@ def test_alt_diffusion_ddim(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.32336113,
- 0.2371237,
- 0.34009337,
- 0.22972241,
- 0.23742735,
- 0.4925817,
- 0.22020563,
- 0.20505491,
- 0.43374813,
- ])
+ expected_slice = np.array(
+ [
+ 0.32336113,
+ 0.2371237,
+ 0.34009337,
+ 0.22972241,
+ 0.23742735,
+ 0.4925817,
+ 0.22020563,
+ 0.20505491,
+ 0.43374813,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_alt_diffusion_pndm(self):
@@ -147,9 +159,9 @@ def test_alt_diffusion_pndm(self):
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
- vocab_size=5002, )
- text_encoder = RobertaSeriesModelWithTransformation(
- text_encoder_config).eval()
+ vocab_size=5002,
+ )
+ text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval()
components["text_encoder"] = text_encoder
alt_pipe = AltDiffusionPipeline(**components)
alt_pipe.set_progress_bar_config(disable=None)
@@ -158,17 +170,19 @@ def test_alt_diffusion_pndm(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.24095133,
- 0.26875997,
- 0.34291863,
- 0.2529385,
- 0.2736602,
- 0.49928105,
- 0.23973131,
- 0.21133915,
- 0.41810605,
- ])
+ expected_slice = np.array(
+ [
+ 0.24095133,
+ 0.26875997,
+ 0.34291863,
+ 0.2529385,
+ 0.2736602,
+ 0.49928105,
+ 0.23973131,
+ 0.21133915,
+ 0.41810605,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
@@ -181,8 +195,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_alt_diffusion(self):
- alt_pipe = AltDiffusionPipeline.from_pretrained(
- "BAAI/AltDiffusion", safety_checker=None)
+ alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None)
alt_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -191,48 +204,47 @@ def test_alt_diffusion(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=20,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.8718514442443848,
- 0.8715569972991943,
- 0.8748429417610168,
- 0.8708409070968628,
- 0.8782679438591003,
- 0.8931069374084473,
- 0.883078932762146,
- 0.881088376045227,
- 0.8617547154426575,
- ])
+ expected_slice = np.array(
+ [
+ 0.8718514442443848,
+ 0.8715569972991943,
+ 0.8748429417610168,
+ 0.8708409070968628,
+ 0.8782679438591003,
+ 0.8931069374084473,
+ 0.883078932762146,
+ 0.881088376045227,
+ 0.8617547154426575,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_alt_diffusion_fast_ddim(self):
- scheduler = DDIMScheduler.from_pretrained(
- "BAAI/AltDiffusion", subfolder="scheduler")
- alt_pipe = AltDiffusionPipeline.from_pretrained(
- "BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None)
+ scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler")
+ alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None)
alt_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
- output = alt_pipe(
- [prompt],
- generator=generator,
- num_inference_steps=2,
- output_type="numpy")
+ output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.9265012741088867,
- 0.9305188059806824,
- 0.8999797105789185,
- 0.9346827268600464,
- 0.9264709949493408,
- 0.9447494745254517,
- 0.9428927898406982,
- 0.9417785406112671,
- 0.9157286882400513,
- ])
+ expected_slice = np.array(
+ [
+ 0.9265012741088867,
+ 0.9305188059806824,
+ 0.8999797105789185,
+ 0.9346827268600464,
+ 0.9264709949493408,
+ 0.9447494745254517,
+ 0.9428927898406982,
+ 0.9417785406112671,
+ 0.9157286882400513,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
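All of the pipeline tests in this patch share the same fingerprinting trick: rather than storing full reference images, they compare the flattened bottom-right 3x3 patch of the last channel of the first sample against a hard-coded `expected_slice`. The check, written out once as a small helper:

import numpy as np

def assert_slice_close(image, expected_slice, tol):
    # image is NHWC; fingerprint the bottom-right 3x3 patch of the last channel.
    image_slice = image[0, -3:, -3:, -1]
    assert np.abs(image_slice.flatten() - expected_slice).max() < tol

assert_slice_close(np.zeros((1, 64, 64, 3)), np.zeros(9), 0.05)  # trivial example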
diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
index ca070f3ff45ee..1422ec516f01d 100644
--- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
+++ b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
@@ -22,11 +22,17 @@
from paddlenlp.transformers import XLMRobertaTokenizer
import ppdiffusers # noqa F401
-from ppdiffusers import (AltDiffusionImg2ImgPipeline, AutoencoderKL,
- PNDMScheduler, UNet2DConditionModel)
+from ppdiffusers import (
+ AltDiffusionImg2ImgPipeline,
+ AutoencoderKL,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.image_processor import VaeImageProcessor
from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import (
- RobertaSeriesConfig, RobertaSeriesModelWithTransformation)
+ RobertaSeriesConfig,
+ RobertaSeriesModelWithTransformation,
+)
from ppdiffusers.utils import floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -42,8 +48,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = 32, 32
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
@property
@@ -57,7 +62,8 @@ def dummy_cond_unet(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
return model
@property
@@ -69,7 +75,8 @@ def dummy_vae(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
return model
@property
@@ -83,7 +90,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=5006, )
+ vocab_size=5006,
+ )
return RobertaSeriesModelWithTransformation(config)
@property
@@ -106,8 +114,7 @@ def test_stable_diffusion_img2img_default_case(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = XLMRobertaTokenizer.from_pretrained(
- "hf-internal-testing/tiny-xlm-roberta")
+ tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
tokenizer.model_max_length = 77
init_image = self.dummy_image
alt_pipe = AltDiffusionImg2ImgPipeline(
@@ -117,9 +124,9 @@ def test_stable_diffusion_img2img_default_case(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
- alt_pipe.image_processor = VaeImageProcessor(
- vae_scale_factor=alt_pipe.vae_scale_factor)
+ feature_extractor=self.dummy_extractor,
+ )
+ alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor)
alt_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -129,7 +136,8 @@ def test_stable_diffusion_img2img_default_case(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- image=init_image, )
+ image=init_image,
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = alt_pipe(
@@ -139,24 +147,26 @@ def test_stable_diffusion_img2img_default_case(self):
num_inference_steps=2,
output_type="np",
image=init_image,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.48931587,
- 0.40102208,
- 0.49653798,
- 0.4203022,
- 0.34621224,
- 0.50789315,
- 0.41116416,
- 0.4933398,
- 0.5465742,
- ])
+ expected_slice = np.array(
+ [
+ 0.48931587,
+ 0.40102208,
+ 0.49653798,
+ 0.4203022,
+ 0.34621224,
+ 0.50789315,
+ 0.41116416,
+ 0.4933398,
+ 0.5465742,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.005
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.005
def test_stable_diffusion_img2img_fp16(self):
"""Test that stable diffusion img2img works with fp16"""
@@ -164,8 +174,7 @@ def test_stable_diffusion_img2img_fp16(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = XLMRobertaTokenizer.from_pretrained(
- "hf-internal-testing/tiny-xlm-roberta")
+ tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
tokenizer.model_max_length = 77
init_image = self.dummy_image
unet = unet.to(dtype=paddle.float16)
@@ -178,9 +187,9 @@ def test_stable_diffusion_img2img_fp16(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
- alt_pipe.image_processor = VaeImageProcessor(
- vae_scale_factor=alt_pipe.vae_scale_factor)
+ feature_extractor=self.dummy_extractor,
+ )
+ alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor)
alt_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -189,7 +198,8 @@ def test_stable_diffusion_img2img_fp16(self):
generator=generator,
num_inference_steps=2,
output_type="np",
- image=init_image, ).images
+ image=init_image,
+ ).images
assert image.shape == (1, 32, 32, 3)
def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
@@ -198,8 +208,7 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
)
init_image = init_image.resize((760, 504))
model_id = "BAAI/AltDiffusion"
- pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
- model_id, safety_checker=None)
+ pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
prompt = "A fantasy landscape, trending on artstation"
@@ -210,21 +219,24 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
strength=0.75,
guidance_scale=7.5,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
image_slice = image[255:258, 383:386, -1]
assert image.shape == (504, 760, 3)
- expected_slice = np.array([
- 0.3251649,
- 0.3340174,
- 0.3418343,
- 0.32628638,
- 0.33462793,
- 0.3300547,
- 0.31628466,
- 0.3470268,
- 0.34273332,
- ])
+ expected_slice = np.array(
+ [
+ 0.3251649,
+ 0.3340174,
+ 0.3418343,
+ 0.32628638,
+ 0.33462793,
+ 0.3300547,
+ 0.31628466,
+ 0.3470268,
+ 0.34273332,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005
@@ -245,8 +257,7 @@ def test_stable_diffusion_img2img_pipeline_default(self):
# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
# )
model_id = "BAAI/AltDiffusion"
- pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
- model_id, safety_checker=None)
+ pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
prompt = "A fantasy landscape, trending on artstation"
@@ -257,19 +268,22 @@ def test_stable_diffusion_img2img_pipeline_default(self):
strength=0.75,
guidance_scale=7.5,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
assert image.shape == (1, 512, 768, 3)
image_slice = image[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.09987255930900574,
- 0.09875822067260742,
- 0.12803134322166443,
- 0.10067081451416016,
- 0.1142435073852539,
- 0.11815103888511658,
- 0.14216548204421997,
- 0.16465380787849426,
- 0.15393462777137756,
- ])
+ expected_slice = np.array(
+ [
+ 0.09987255930900574,
+ 0.09875822067260742,
+ 0.12803134322166443,
+ 0.10067081451416016,
+ 0.1142435073852539,
+ 0.11815103888511658,
+ 0.14216548204421997,
+ 0.16465380787849426,
+ 0.15393462777137756,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py
index a8426c0ee78a1..e65d01ffc9eb8 100644
--- a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py
+++ b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py
@@ -19,9 +19,16 @@
import numpy as np
import paddle
-from ppdiffusers import (AudioDiffusionPipeline, AutoencoderKL, DDIMScheduler,
- DDPMScheduler, DiffusionPipeline, Mel,
- UNet2DConditionModel, UNet2DModel)
+from ppdiffusers import (
+ AudioDiffusionPipeline,
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ DiffusionPipeline,
+ Mel,
+ UNet2DConditionModel,
+ UNet2DModel,
+)
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -42,7 +49,8 @@ def dummy_unet(self):
layers_per_block=2,
block_out_channels=(128, 128),
down_block_types=("AttnDownBlock2D", "DownBlock2D"),
- up_block_types=("UpBlock2D", "AttnUpBlock2D"), )
+ up_block_types=("UpBlock2D", "AttnUpBlock2D"),
+ )
return model
@property
@@ -56,7 +64,8 @@ def dummy_unet_condition(self):
block_out_channels=(128, 128),
down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
- cross_attention_dim=10, )
+ cross_attention_dim=10,
+ )
return model
@property
@@ -70,7 +79,8 @@ def dummy_vqvae_and_unet(self):
layers_per_block=2,
block_out_channels=(128, 128),
down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
- up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), )
+ up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
+ )
unet = UNet2DModel(
sample_size=(64, 32),
in_channels=1,
@@ -78,14 +88,14 @@ def dummy_vqvae_and_unet(self):
layers_per_block=2,
block_out_channels=(128, 128),
down_block_types=("AttnDownBlock2D", "DownBlock2D"),
- up_block_types=("UpBlock2D", "AttnUpBlock2D"), )
+ up_block_types=("UpBlock2D", "AttnUpBlock2D"),
+ )
return vqvae, unet
def test_audio_diffusion(self):
mel = Mel()
scheduler = DDPMScheduler()
- pipe = AudioDiffusionPipeline(
- vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler)
+ pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(42)
output = pipe(generator=generator, steps=4)
@@ -96,55 +106,55 @@ def test_audio_diffusion(self):
image_from_tuple = output[0][0]
assert audio.shape == (
1,
- (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length, )
- assert (image.height == self.dummy_unet.config.sample_size[0] and
- image.width == self.dummy_unet.config.sample_size[1])
+ (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length,
+ )
+ assert (
+ image.height == self.dummy_unet.config.sample_size[0]
+ and image.width == self.dummy_unet.config.sample_size[1]
+ )
image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
- image_from_tuple_slice = np.frombuffer(
- image_from_tuple.tobytes(), dtype="uint8")[:10]
+ image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10]
expected_slice = np.array([0, 252, 0, 160, 144, 1, 0, 211, 99, 3])
assert np.abs(image_slice.flatten() - expected_slice).max() == 0
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) <= 5
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() <= 5
scheduler = DDIMScheduler()
dummy_vqvae_and_unet = self.dummy_vqvae_and_unet
pipe = AudioDiffusionPipeline(
vqvae=self.dummy_vqvae_and_unet[0],
unet=dummy_vqvae_and_unet[1],
mel=mel,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
pipe.set_progress_bar_config(disable=None)
np.random.seed(0)
raw_audio = np.random.uniform(
-1,
1,
- ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) *
- mel.hop_length, ), )
+ ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * mel.hop_length,),
+ )
generator = paddle.Generator().manual_seed(42)
- output = pipe(
- raw_audio=raw_audio, generator=generator, start_step=5, steps=10)
+ output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10)
image = output.images[0]
assert (
image.height == self.dummy_vqvae_and_unet[0].config.sample_size[0]
- and
- image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1])
+ and image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1]
+ )
image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
- expected_slice = np.array(
- [128, 100, 153, 95, 92, 77, 130, 121, 81, 166])
+ expected_slice = np.array([128, 100, 153, 95, 92, 77, 130, 121, 81, 166])
assert np.abs(image_slice.flatten() - expected_slice).max() <= 5
dummy_unet_condition = self.dummy_unet_condition
pipe = AudioDiffusionPipeline(
vqvae=self.dummy_vqvae_and_unet[0],
unet=dummy_unet_condition,
mel=mel,
- scheduler=scheduler, )
+ scheduler=scheduler,
+ )
np.random.seed(0)
encoding = paddle.rand(shape=(1, 1, 10))
output = pipe(generator=generator, encoding=encoding)
image = output.images[0]
image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
- expected_slice = np.array(
- [139, 103, 88, 105, 100, 120, 116, 99, 106, 89])
+ expected_slice = np.array([139, 103, 88, 105, 100, 120, 116, 99, 106, 89])
assert np.abs(image_slice.flatten() - expected_slice).max() <= 5
@@ -157,8 +167,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_audio_diffusion(self):
- pipe = DiffusionPipeline.from_pretrained(
- "teticio/audio-diffusion-ddim-256")
+ pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(42)
output = pipe(generator=generator)
@@ -166,10 +175,9 @@ def test_audio_diffusion(self):
image = output.images[0]
assert audio.shape == (
1,
- (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length, )
- assert (image.height == pipe.unet.config.sample_size[0] and
- image.width == pipe.unet.config.sample_size[1])
+ (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length,
+ )
+ assert image.height == pipe.unet.config.sample_size[0] and image.width == pipe.unet.config.sample_size[1]
image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
- expected_slice = np.array(
- [151, 167, 154, 144, 122, 134, 121, 105, 70, 26])
+ expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26])
assert np.abs(image_slice.flatten() - expected_slice).max() <= 5
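The audio length asserted in these tests follows from the spectrogram geometry: consecutive STFT frames advance by `hop_length` samples, so a spectrogram with `sample_size[1]` frames reconstructs to `(sample_size[1] - 1) * hop_length` samples. Worked through with illustrative numbers (the hop length here is hypothetical, not necessarily `Mel()`'s default):

sample_size = (64, 32)  # (frequency bins, time frames), as in the dummy UNet above
hop_length = 512        # hypothetical hop length
num_samples = (sample_size[1] - 1) * hop_length
print(num_samples)  # 15872 -> expected audio shape (1, 15872)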
diff --git a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py
index 82c9242a44d2d..c9d67aaf82a83 100644
--- a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py
+++ b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py
@@ -18,13 +18,22 @@
import numpy as np
import paddle
import paddle.nn.functional as F
-from paddlenlp.transformers import (ClapTextConfig, ClapTextModelWithProjection,
- RobertaTokenizer, SpeechT5HifiGan,
- SpeechT5HifiGanConfig)
-
-from ppdiffusers import (AudioLDMPipeline, AutoencoderKL, DDIMScheduler,
- LMSDiscreteScheduler, PNDMScheduler,
- UNet2DConditionModel)
+from paddlenlp.transformers import (
+ ClapTextConfig,
+ ClapTextModelWithProjection,
+ RobertaTokenizer,
+ SpeechT5HifiGan,
+ SpeechT5HifiGanConfig,
+)
+
+from ppdiffusers import (
+ AudioLDMPipeline,
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.training_utils import enable_full_determinism
from ppdiffusers.utils import require_paddle_gpu, slow
@@ -39,16 +48,18 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
params = TEXT_TO_AUDIO_PARAMS
batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
test_xformers_attention = False
- required_optional_params = frozenset([
- "num_inference_steps",
- "num_waveforms_per_prompt",
- "generator",
- "latents",
- "output_type",
- "return_dict",
- "callback",
- "callback_steps",
- ])
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "num_waveforms_per_prompt",
+ "generator",
+ "latents",
+ "output_type",
+ "return_dict",
+ "callback",
+ "callback_steps",
+ ]
+ )
def get_dummy_components(self):
paddle.seed(0)
@@ -63,13 +74,15 @@ def get_dummy_components(self):
cross_attention_dim=(32, 64),
class_embed_type="simple_projection",
projection_class_embeddings_input_dim=32,
- class_embeddings_concat=True, )
+ class_embeddings_concat=True,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -77,7 +90,8 @@ def get_dummy_components(self):
out_channels=1,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = ClapTextConfig(
bos_token_id=0,
@@ -89,11 +103,11 @@ def get_dummy_components(self):
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32, )
+ projection_dim=32,
+ )
text_encoder = ClapTextModelWithProjection(text_encoder_config)
text_encoder.eval()
- tokenizer = RobertaTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-roberta", model_max_length=77)
+ tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
vocoder_config = SpeechT5HifiGanConfig(
model_in_dim=8,
@@ -103,7 +117,8 @@ def get_dummy_components(self):
upsample_kernel_sizes=[4, 4],
resblock_kernel_sizes=[3, 7],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
- normalize_before=False, )
+ normalize_before=False,
+ )
vocoder = SpeechT5HifiGan(vocoder_config)
vocoder.eval()
@@ -139,18 +154,20 @@ def test_audioldm_ddim(self):
assert len(audio) == 256
audio_slice = audio[:10]
- expected_slice = np.array([
- -0.0050,
- 0.0050,
- -0.0060,
- 0.0033,
- -0.0026,
- 0.0033,
- -0.0027,
- 0.0033,
- -0.0028,
- 0.0033,
- ])
+ expected_slice = np.array(
+ [
+ -0.0050,
+ 0.0050,
+ -0.0060,
+ 0.0033,
+ -0.0026,
+ 0.0033,
+ -0.0027,
+ 0.0033,
+ -0.0028,
+ 0.0033,
+ ]
+ )
assert np.abs(audio_slice - expected_slice).max() < 1e-2
@@ -175,10 +192,13 @@ def test_audioldm_prompt_embeds(self):
max_length=audioldm_pipe.tokenizer.model_max_length,
return_attention_mask=True,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_inputs = text_inputs["input_ids"].cast("int32")
- prompt_embeds = audioldm_pipe.text_encoder(text_inputs, )
+ prompt_embeds = audioldm_pipe.text_encoder(
+ text_inputs,
+ )
prompt_embeds = prompt_embeds.text_embeds
# additional L_2 normalization over each hidden-state
prompt_embeds = F.normalize(prompt_embeds, axis=-1)
@@ -216,10 +236,13 @@ def test_audioldm_negative_prompt_embeds(self):
max_length=audioldm_pipe.tokenizer.model_max_length,
truncation=True,
return_attention_mask=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_inputs = text_inputs["input_ids"].cast("int32")
- text_embeds = audioldm_pipe.text_encoder(text_inputs, )
+ text_embeds = audioldm_pipe.text_encoder(
+ text_inputs,
+ )
text_embeds = text_embeds.text_embeds
# additional L_2 normalization over each hidden-state
text_embeds = F.normalize(text_embeds, axis=-1)
@@ -249,18 +272,20 @@ def test_audioldm_negative_prompt(self):
assert len(audio) == 256
audio_slice = audio[:10]
- expected_slice = np.array([
- -0.0051,
- 0.0050,
- -0.0060,
- 0.0034,
- -0.0026,
- 0.0033,
- -0.0027,
- 0.0033,
- -0.0028,
- 0.0032,
- ])
+ expected_slice = np.array(
+ [
+ -0.0051,
+ 0.0050,
+ -0.0060,
+ 0.0034,
+ -0.0026,
+ 0.0033,
+ -0.0027,
+ 0.0033,
+ -0.0028,
+ 0.0032,
+ ]
+ )
assert np.abs(audio_slice - expected_slice).max() < 1e-2
@@ -278,8 +303,7 @@ def test_audioldm_num_waveforms_per_prompt(self):
# test num_waveforms_per_prompt=1 (default) for batch of prompts
batch_size = 2
- audios = audioldm_pipe(
- [prompt] * batch_size, num_inference_steps=2).audios
+ audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
assert audios.shape == (batch_size, 256)
@@ -288,7 +312,8 @@ def test_audioldm_num_waveforms_per_prompt(self):
audios = audioldm_pipe(
prompt,
num_inference_steps=2,
- num_waveforms_per_prompt=num_waveforms_per_prompt, ).audios
+ num_waveforms_per_prompt=num_waveforms_per_prompt,
+ ).audios
assert audios.shape == (num_waveforms_per_prompt, 256)
@@ -297,7 +322,8 @@ def test_audioldm_num_waveforms_per_prompt(self):
audios = audioldm_pipe(
[prompt] * batch_size,
num_inference_steps=2,
- num_waveforms_per_prompt=num_waveforms_per_prompt, ).audios
+ num_waveforms_per_prompt=num_waveforms_per_prompt,
+ ).audios
assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
@@ -339,12 +365,10 @@ def test_audioldm_vocoder_model_in_dim(self):
assert audio_shape == (1, 256)
def test_attention_slicing_forward_pass(self):
- self._test_attention_slicing_forward_pass(
- test_mean_pixel_difference=False)
+ self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(
- test_mean_pixel_difference=False)
+ self._test_inference_batch_single_identical(test_mean_pixel_difference=False)
@slow
@@ -380,25 +404,26 @@ def test_audioldm(self):
assert len(audio) == 81920
audio_slice = audio[77230:77240]
- expected_slice = np.array([
- -0.4884,
- -0.4607,
- 0.0023,
- 0.5007,
- 0.5896,
- 0.5151,
- 0.3813,
- -0.0208,
- -0.3687,
- -0.4315,
- ])
+ expected_slice = np.array(
+ [
+ -0.4884,
+ -0.4607,
+ 0.0023,
+ 0.5007,
+ 0.5896,
+ 0.5151,
+ 0.3813,
+ -0.0208,
+ -0.3687,
+ -0.4315,
+ ]
+ )
max_diff = np.abs(expected_slice - audio_slice).max()
assert max_diff < 1e-2
def test_audioldm_lms(self):
audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
- audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(
- audioldm_pipe.scheduler.config)
+ audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
@@ -408,17 +433,19 @@ def test_audioldm_lms(self):
assert len(audio) == 81920
audio_slice = audio[27780:27790]
- expected_slice = np.array([
- -0.2131,
- -0.0873,
- -0.0124,
- -0.0189,
- 0.0569,
- 0.1373,
- 0.1883,
- 0.2886,
- 0.3297,
- 0.2212,
- ])
+ expected_slice = np.array(
+ [
+ -0.2131,
+ -0.0873,
+ -0.0124,
+ -0.0189,
+ 0.0569,
+ 0.1373,
+ 0.1883,
+ 0.2886,
+ 0.3297,
+ 0.2212,
+ ]
+ )
max_diff = np.abs(expected_slice - audio_slice).max()
assert max_diff < 3e-2
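The prompt-embedding checks above follow the same path the pipeline takes internally: tokenize the prompt, run the CLAP text projection, and L2-normalize the result. A condensed sketch of those steps, assuming `pipe` is an already-loaded `AudioLDMPipeline` (padding to the model max length is assumed; only truncation and the tensor type are visible in the test):

import paddle.nn.functional as F

def encode_prompt(pipe, prompt):
    inputs = pipe.tokenizer(
        prompt,
        padding="max_length",  # assumed, to match model_max_length
        max_length=pipe.tokenizer.model_max_length,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pd",
    )
    embeds = pipe.text_encoder(inputs["input_ids"].cast("int32")).text_embeds
    return F.normalize(embeds, axis=-1)  # extra L2 normalization over each hidden state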
diff --git a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index 9b76eed8898ad..b8477a5e775df 100644
--- a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -23,8 +23,10 @@
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS,
- UNCONDITIONAL_AUDIO_GENERATION_PARAMS)
+from ..pipeline_params import (
+ UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS,
+ UNCONDITIONAL_AUDIO_GENERATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
@@ -55,9 +57,9 @@ def get_dummy_components(self):
use_timestep_embedding=False,
time_embedding_type="fourier",
mid_block_type="UNetMidBlock1D",
- down_block_types=("DownBlock1DNoSkip", "DownBlock1D",
- "AttnDownBlock1D"),
- up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), )
+ down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+ up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+ )
scheduler = IPNDMScheduler()
components = {"unet": unet, "scheduler": scheduler}
return components
@@ -65,11 +67,7 @@ def get_dummy_components(self):
def get_dummy_inputs(self, seed=0):
generator = paddle.Generator().manual_seed(seed)
- inputs = {
- "batch_size": 1,
- "generator": generator,
- "num_inference_steps": 4
- }
+ inputs = {"batch_size": 1, "generator": generator, "num_inference_steps": 4}
return inputs
def test_dance_diffusion(self):
@@ -81,8 +79,7 @@ def test_dance_diffusion(self):
audio = output.audios
audio_slice = audio[0, -3:, -3:]
assert audio.shape == (1, 2, components["unet"].sample_size)
- expected_slice = np.array(
- [1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0])
+ expected_slice = np.array([1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0])
assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.01
@@ -98,42 +95,39 @@ def test_dance_diffusion(self):
pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- generator=generator,
- num_inference_steps=100,
- audio_length_in_s=4.096)
+ output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
audio = output.audios
audio_slice = audio[0, -3:, -3:]
assert audio.shape == (1, 2, pipe.unet.sample_size)
- expected_slice = np.array([
- -0.15758808,
- -0.15257765,
- -0.12701476,
- -0.26994032,
- -0.27616554,
- -0.24865153,
- ])
+ expected_slice = np.array(
+ [
+ -0.15758808,
+ -0.15257765,
+ -0.12701476,
+ -0.26994032,
+ -0.27616554,
+ -0.24865153,
+ ]
+ )
assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.01
def test_dance_diffusion_fp16(self):
- pipe = DanceDiffusionPipeline.from_pretrained(
- "harmonai/maestro-150k", paddle_dtype=paddle.float16)
+ pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- generator=generator,
- num_inference_steps=100,
- audio_length_in_s=4.096)
+ output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
audio = output.audios
audio_slice = audio[0, -3:, -3:]
assert audio.shape == (1, 2, pipe.unet.sample_size)
# scheduler use fp32
- expected_slice = np.array([
- -0.15350387,
- -0.14624646,
- -0.12091318,
- -0.25969276,
- -0.26154587,
- -0.23359495,
- ])
+ expected_slice = np.array(
+ [
+ -0.15350387,
+ -0.14624646,
+ -0.12091318,
+ -0.25969276,
+ -0.26154587,
+ -0.23359495,
+ ]
+ )
assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.05
diff --git a/ppdiffusers/tests/pipelines/ddim/test_ddim.py b/ppdiffusers/tests/pipelines/ddim/test_ddim.py
index c2fb14bc1020a..92f66001a03f4 100644
--- a/ppdiffusers/tests/pipelines/ddim/test_ddim.py
+++ b/ppdiffusers/tests/pipelines/ddim/test_ddim.py
@@ -21,8 +21,10 @@
from ppdiffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow
-from ..pipeline_params import (UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS,
- UNCONDITIONAL_IMAGE_GENERATION_PARAMS)
+from ..pipeline_params import (
+ UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS,
+ UNCONDITIONAL_IMAGE_GENERATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
@@ -47,7 +49,8 @@ def get_dummy_components(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
scheduler = DDIMScheduler()
components = {"unet": unet, "scheduler": scheduler}
return components
@@ -71,17 +74,19 @@ def test_inference(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
self.assertEqual(image.shape, (1, 32, 32, 3))
- expected_slice = np.array([
- 0.0,
- 0.00152004,
- 0.0,
- 0.0,
- 0.00860906,
- 0.00182715,
- 0.00189051,
- 1.0,
- 0.668702,
- ])
+ expected_slice = np.array(
+ [
+ 0.0,
+ 0.00152004,
+ 0.0,
+ 0.0,
+ 0.00860906,
+ 0.00182715,
+ 0.00189051,
+ 1.0,
+ 0.668702,
+ ]
+ )
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 0.001)
@@ -99,10 +104,7 @@ def test_inference_cifar10(self):
image = ddim(generator=generator, eta=0.0, output_type="numpy").images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446,
- 0.2388
- ])
+ expected_slice = np.array([0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, 0.2388])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_inference_ema_bedroom(self):
@@ -115,15 +117,17 @@ def test_inference_ema_bedroom(self):
image = ddim(generator=generator, output_type="numpy").images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 256, 256, 3)
- expected_slice = np.array([
- 0.19830778,
- 0.18826014,
- 0.18584034,
- 0.1927332,
- 0.18754855,
- 0.17855307,
- 0.18288234,
- 0.16375086,
- 0.1497818,
- ])
+ expected_slice = np.array(
+ [
+ 0.19830778,
+ 0.18826014,
+ 0.18584034,
+ 0.1927332,
+ 0.18754855,
+ 0.17855307,
+ 0.18288234,
+ 0.16375086,
+ 0.1497818,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
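`eta=0.0` in the CIFAR-10 test above makes DDIM sampling fully deterministic (larger eta re-introduces DDPM-style noise), which is why a fixed seed can be compared against a hard-coded slice. A self-contained sketch with a tiny randomly initialized UNet, mirroring the dummy components used earlier (the block sizes here are illustrative, not the fixture's exact values):

import paddle
from ppdiffusers import DDIMPipeline, DDIMScheduler, UNet2DModel

unet = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D", "UpBlock2D"),
)
pipe = DDIMPipeline(unet=unet, scheduler=DDIMScheduler())
generator = paddle.Generator().manual_seed(0)
image = pipe(generator=generator, eta=0.0, num_inference_steps=2, output_type="numpy").images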
diff --git a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py
index 988129c546625..f2d25b2e39403 100644
--- a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py
+++ b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py
@@ -33,7 +33,8 @@ def dummy_uncond_unet(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
def test_fast_inference(self):
@@ -42,33 +43,33 @@ def test_fast_inference(self):
ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
ddpm.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = ddpm(
- generator=generator, num_inference_steps=2,
- output_type="numpy").images
+ image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = ddpm(
generator=generator,
num_inference_steps=2,
output_type="numpy",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.0,
- 0.0,
- 0.0,
- 0.0,
- 0.007474243640899658,
- 0.0,
- 0.007990598678588867,
- 0.9972629547119141,
- 0.6665917634963989,
- ])
+ expected_slice = np.array(
+ [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.007474243640899658,
+ 0.0,
+ 0.007990598678588867,
+ 0.9972629547119141,
+ 0.6665917634963989,
+ ]
+ )
print(image_slice.flatten().tolist())
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_inference_predict_sample(self):
unet = self.dummy_uncond_unet
@@ -76,18 +77,14 @@ def test_inference_predict_sample(self):
ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
ddpm.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = ddpm(
- generator=generator, num_inference_steps=2,
- output_type="numpy").images
+ image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
- image_eps = ddpm(
- generator=generator, num_inference_steps=2, output_type="numpy")[0]
+ image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0]
image_slice = image[0, -3:, -3:, -1]
image_eps_slice = image_eps[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
tolerance = 0.01
- assert (np.abs(image_slice.flatten() - image_eps_slice.flatten()).max()
- < tolerance)
+ assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
@slow
@@ -103,8 +100,5 @@ def test_inference_cifar10(self):
image = ddpm(generator=generator, output_type="numpy").images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604,
- 0.2020
- ])
+ expected_slice = np.array([0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, 0.2020])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
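The tuple checks above exercise the generic output contract: every pipeline returns a dataclass-like output object by default, while `return_dict=False` returns a plain tuple whose first element holds the images (or audios). The equivalence being asserted, as a small helper that should work for any loaded pipeline:

import numpy as np
import paddle

def dict_and_tuple_outputs_match(pipe, tol=0.01, **kwargs):
    gen = paddle.Generator().manual_seed(0)
    images = pipe(generator=gen, output_type="numpy", **kwargs).images
    gen = paddle.Generator().manual_seed(0)
    images_tuple = pipe(generator=gen, output_type="numpy", return_dict=False, **kwargs)[0]
    return np.abs(images - images_tuple).max() < tol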
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py
index 9f3a881a35c78..acb9a8a602116 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py
@@ -30,13 +30,11 @@
class IFPipelineTesterMixin:
def _get_dummy_components(self):
paddle.seed(0)
- text_encoder = T5EncoderModel.from_pretrained(
- "hf-internal-testing/tiny-random-t5")
+ text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
text_encoder.eval()
paddle.seed(0)
- tokenizer = AutoTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-t5")
+ tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
paddle.seed(0)
unet = UNet2DConditionModel(
@@ -48,9 +46,7 @@ def _get_dummy_components(self):
"SimpleCrossAttnDownBlock2D",
],
mid_block_type="UNetMidBlock2DSimpleCrossAttn",
- up_block_types=[
- "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"
- ],
+ up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"],
in_channels=3,
out_channels=6,
cross_attention_dim=32,
@@ -60,9 +56,9 @@ def _get_dummy_components(self):
addition_embed_type_num_heads=2,
cross_attention_norm="group_norm",
resnet_time_scale_shift="scale_shift",
- act_fn="gelu", )
- unet.set_attn_processor(
- AttnAddedKVProcessor()) # For reproducibility tests
+ act_fn="gelu",
+ )
+ unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests
paddle.seed(0)
scheduler = DDPMScheduler(
@@ -74,7 +70,8 @@ def _get_dummy_components(self):
dynamic_thresholding_ratio=0.95,
sample_max_value=1.0,
prediction_type="epsilon",
- variance_type="learned_range", )
+ variance_type="learned_range",
+ )
paddle.seed(0)
watermarker = IFWatermarker()
@@ -91,13 +88,11 @@ def _get_dummy_components(self):
def _get_superresolution_dummy_components(self):
paddle.seed(0)
- text_encoder = T5EncoderModel.from_pretrained(
- "hf-internal-testing/tiny-random-t5")
+ text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
text_encoder.eval()
paddle.seed(0)
- tokenizer = AutoTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-t5")
+ tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
paddle.seed(0)
unet = UNet2DConditionModel(
@@ -109,9 +104,7 @@ def _get_superresolution_dummy_components(self):
"SimpleCrossAttnDownBlock2D",
],
mid_block_type="UNetMidBlock2DSimpleCrossAttn",
- up_block_types=[
- "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"
- ],
+ up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"],
in_channels=6,
out_channels=6,
cross_attention_dim=32,
@@ -125,9 +118,9 @@ def _get_superresolution_dummy_components(self):
class_embed_type="timestep",
mid_block_scale_factor=1.414,
time_embedding_act_fn="gelu",
- time_embedding_dim=32, )
- unet.set_attn_processor(
- AttnAddedKVProcessor()) # For reproducibility tests
+ time_embedding_dim=32,
+ )
+ unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests
paddle.seed(0)
scheduler = DDPMScheduler(
@@ -139,14 +132,16 @@ def _get_superresolution_dummy_components(self):
dynamic_thresholding_ratio=0.95,
sample_max_value=1.0,
prediction_type="epsilon",
- variance_type="learned_range", )
+ variance_type="learned_range",
+ )
paddle.seed(0)
image_noising_scheduler = DDPMScheduler(
num_train_timesteps=1000,
beta_schedule="squaredcos_cap_v2",
beta_start=0.0001,
- beta_end=0.02, )
+ beta_end=0.02,
+ )
paddle.seed(0)
watermarker = IFWatermarker()
@@ -226,8 +221,7 @@ def _test_save_load_optional_components(self):
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
pipe_loaded.set_progress_bar_config(disable=None)
- pipe_loaded.unet.set_attn_processor(
- AttnAddedKVProcessor()) # For reproducibility tests
+ pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests
for optional_component in pipe._optional_components:
self.assertTrue(
@@ -278,8 +272,7 @@ def _test_save_load_local(self):
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
pipe_loaded.set_progress_bar_config(disable=None)
- pipe_loaded.unet.set_attn_processor(
- AttnAddedKVProcessor()) # For reproducibility tests
+ pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests
inputs = self.get_dummy_inputs()
output_loaded = pipe_loaded(**inputs)[0]
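
The IFPipelineTesterMixin edited above only builds small seeded dummy components; the concrete fast-test classes in the next files combine it with PipelineTesterMixin and unittest.TestCase and delegate to it. A stripped-down sketch of that mixin arrangement; every class and component name here is a placeholder, not an actual ppdiffusers class.

import unittest

class DummyComponentsMixin:
    # Stands in for IFPipelineTesterMixin: builds the (tiny) components in one
    # place so every concrete pipeline test class can reuse them.
    def _get_dummy_components(self):
        return {"unet": "tiny-unet", "scheduler": "ddpm"}

class ExamplePipelineFastTests(DummyComponentsMixin, unittest.TestCase):
    def get_dummy_components(self):
        # Concrete test classes simply forward to the shared mixin helper.
        return self._get_dummy_components()

    def test_components_present(self):
        components = self.get_dummy_components()
        self.assertIn("unet", components)

if __name__ == "__main__":
    unittest.main()
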
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py
index f5daacd7abdcb..4192ea593d45d 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py
@@ -19,26 +19,31 @@
import paddle
from ppdiffusers import (
- IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline,
- IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline)
+ IFImg2ImgPipeline,
+ IFImg2ImgSuperResolutionPipeline,
+ IFInpaintingPipeline,
+ IFInpaintingSuperResolutionPipeline,
+ IFPipeline,
+ IFSuperResolutionPipeline,
+)
from ppdiffusers.models.attention_processor import AttnAddedKVProcessor
-from ppdiffusers.utils.testing_utils import (floats_tensor, load_numpy,
- require_paddle_gpu, slow)
+from ppdiffusers.utils.testing_utils import (
+ floats_tensor,
+ load_numpy,
+ require_paddle_gpu,
+ slow,
+)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import (PipelineTesterMixin,
- assert_mean_pixel_difference)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
from . import IFPipelineTesterMixin
-class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin,
- unittest.TestCase):
+class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
pipeline_class = IFPipeline
params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
return self._get_dummy_components()
@@ -69,11 +74,12 @@ def test_save_load_local(self):
self._test_save_load_local()
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(expected_max_diff=1e-2, )
+ self._test_inference_batch_single_identical(
+ expected_max_diff=1e-2,
+ )
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
@slow
@@ -88,24 +94,21 @@ def tearDown(self):
def test_all(self):
# if
- pipe_1 = IFPipeline.from_pretrained(
- "DeepFloyd/IF-I-XL-v1.0",
- variant="fp16",
- paddle_dtype=paddle.float16)
+ pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
pipe_2 = IFSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0",
variant="fp16",
paddle_dtype=paddle.float16,
text_encoder=None,
- tokenizer=None, )
+ tokenizer=None,
+ )
# pre compute text embeddings and remove T5 to save memory
pipe_1.text_encoder
- prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt(
- "anime turtle")
+ prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle")
del pipe_1.tokenizer
del pipe_1.text_encoder
@@ -136,8 +139,7 @@ def test_all(self):
pipe_1.unet.set_attn_processor(AttnAddedKVProcessor())
pipe_2.unet.set_attn_processor(AttnAddedKVProcessor())
- self._test_if_img2img(pipe_1, pipe_2, prompt_embeds,
- negative_prompt_embeds)
+ self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds)
pipe_1.remove_all_hooks()
pipe_2.remove_all_hooks()
@@ -153,8 +155,7 @@ def test_all(self):
pipe_1.unet.set_attn_processor(AttnAddedKVProcessor())
pipe_2.unet.set_attn_processor(AttnAddedKVProcessor())
- self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds,
- negative_prompt_embeds)
+ self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds)
def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds):
# pipeline 1
@@ -165,7 +166,8 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds):
negative_prompt_embeds=negative_prompt_embeds,
num_inference_steps=2,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
@@ -191,7 +193,8 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds):
image=image,
generator=generator,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
@@ -205,8 +208,7 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds):
)
assert_mean_pixel_difference(image, expected_image)
- def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds,
- negative_prompt_embeds):
+ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds):
# pipeline 1
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0))
@@ -219,7 +221,8 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds,
image=image,
num_inference_steps=2,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
@@ -247,7 +250,8 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds,
original_image=original_image,
generator=generator,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
@@ -261,8 +265,7 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds,
)
assert_mean_pixel_difference(image, expected_image)
- def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds,
- negative_prompt_embeds):
+ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds):
# pipeline 1
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0))
@@ -276,7 +279,8 @@ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds,
mask_image=mask_image,
num_inference_steps=2,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
@@ -306,7 +310,8 @@ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds,
original_image=original_image,
generator=generator,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py
index 3fce4eab7164b..bab44fc4a5cbf 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py
@@ -20,20 +20,19 @@
from ppdiffusers import IFImg2ImgPipeline
from ppdiffusers.utils import floats_tensor
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
from . import IFPipelineTesterMixin
-class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin,
- unittest.TestCase):
+class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
pipeline_class = IFImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
return self._get_dummy_components()
@@ -58,8 +57,7 @@ def test_save_load_optional_components(self):
self._test_save_load_optional_components()
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
@@ -75,4 +73,6 @@ def test_save_load_local(self):
self._test_save_load_local()
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(expected_max_diff=1e-2, )
+ self._test_inference_batch_single_identical(
+ expected_max_diff=1e-2,
+ )
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
index effd8aec47da6..0d977c5d6f2ee 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
@@ -20,21 +20,19 @@
from ppdiffusers import IFImg2ImgSuperResolutionPipeline
from ppdiffusers.utils import floats_tensor
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
from . import IFPipelineTesterMixin
-class IFImg2ImgSuperResolutionPipelineFastTests(
- PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
pipeline_class = IFImg2ImgSuperResolutionPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"}
- batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union(
- {"original_image"})
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"})
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
return self._get_superresolution_dummy_components()
@@ -58,8 +56,7 @@ def get_dummy_inputs(self, seed=0):
return inputs
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
def test_save_load_optional_components(self):
self._test_save_load_optional_components()
@@ -75,4 +72,6 @@ def test_save_load_local(self):
self._test_save_load_local()
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(expected_max_diff=1e-2, )
+ self._test_inference_batch_single_identical(
+ expected_max_diff=1e-2,
+ )
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py
index 803ebffdb1ad5..e46b7c5ebea69 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py
@@ -20,20 +20,19 @@
from ppdiffusers import IFInpaintingPipeline
from ppdiffusers.utils import floats_tensor
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_INPAINTING_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
from . import IFPipelineTesterMixin
-class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin,
- unittest.TestCase):
+class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
pipeline_class = IFInpaintingPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
return self._get_dummy_components()
@@ -57,8 +56,7 @@ def get_dummy_inputs(self, seed=0):
return inputs
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
def test_save_load_optional_components(self):
self._test_save_load_optional_components()
@@ -74,4 +72,6 @@ def test_save_load_local(self):
self._test_save_load_local()
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(expected_max_diff=1e-2, )
+ self._test_inference_batch_single_identical(
+ expected_max_diff=1e-2,
+ )
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
index 0f24c066122e2..d50852284146e 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
@@ -20,21 +20,19 @@
from ppdiffusers import IFInpaintingSuperResolutionPipeline
from ppdiffusers.utils import floats_tensor
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_INPAINTING_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
from . import IFPipelineTesterMixin
-class IFInpaintingSuperResolutionPipelineFastTests(
- PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
pipeline_class = IFInpaintingSuperResolutionPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
- batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union(
- {"original_image"})
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"})
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
return self._get_superresolution_dummy_components()
@@ -60,8 +58,7 @@ def get_dummy_inputs(self, seed=0):
return inputs
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
def test_save_load_optional_components(self):
self._test_save_load_optional_components()
@@ -77,4 +74,6 @@ def test_save_load_local(self):
self._test_save_load_local()
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(expected_max_diff=1e-2, )
+ self._test_inference_batch_single_identical(
+ expected_max_diff=1e-2,
+ )
diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py
index ae1810b58f991..79a7319b80757 100644
--- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py
+++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py
@@ -20,20 +20,19 @@
from ppdiffusers import IFSuperResolutionPipeline
from ppdiffusers.utils import floats_tensor
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
from . import IFPipelineTesterMixin
-class IFSuperResolutionPipelineFastTests(
- PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
+class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase):
pipeline_class = IFSuperResolutionPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
return self._get_superresolution_dummy_components()
@@ -55,8 +54,7 @@ def get_dummy_inputs(self, seed=0):
return inputs
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
def test_save_load_optional_components(self):
self._test_save_load_optional_components()
@@ -72,4 +70,6 @@ def test_save_load_local(self):
self._test_save_load_local()
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(expected_max_diff=1e-2, )
+ self._test_inference_batch_single_identical(
+ expected_max_diff=1e-2,
+ )
diff --git a/ppdiffusers/tests/pipelines/dit/test_dit.py b/ppdiffusers/tests/pipelines/dit/test_dit.py
index ffbe5d6d4dc33..c9d17607fcbd0 100644
--- a/ppdiffusers/tests/pipelines/dit/test_dit.py
+++ b/ppdiffusers/tests/pipelines/dit/test_dit.py
@@ -19,13 +19,20 @@
import numpy as np
import paddle
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiTPipeline,
- DPMSolverMultistepScheduler, Transformer2DModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DiTPipeline,
+ DPMSolverMultistepScheduler,
+ Transformer2DModel,
+)
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
- CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS)
+from ..pipeline_params import (
+ CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
+ CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
@@ -55,7 +62,8 @@ def get_dummy_components(self):
activation_fn="gelu-approximate",
num_embeds_ada_norm=1000,
norm_type="ada_norm_zero",
- norm_elementwise_affine=False, )
+ norm_elementwise_affine=False,
+ )
vae = AutoencoderKL()
scheduler = DDIMScheduler()
components = {
@@ -85,20 +93,15 @@ def test_inference(self):
image_slice = image[0, -3:, -3:, -1]
self.assertEqual(image.shape, (1, 16, 16, 3))
print(image_slice.flatten())
- expected_slice = np.array([
- 0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0,
- 0.14398015
- ])
+ expected_slice = np.array([0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0, 0.14398015])
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 0.001)
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(
- relax_max_difference=True, expected_max_diff=1e-3)
+ self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3)
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-3)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
@require_paddle_gpu
@@ -116,35 +119,35 @@ def test_dit_256(self):
words = ["vase", "umbrella", "white shark", "white wolf"]
ids = pipe.get_label_ids(words)
- images = pipe(
- ids, generator=generator, num_inference_steps=40,
- output_type="np").images
- expected_slices = np.array([
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0],
+ images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images
+ expected_slices = np.array(
[
- 0.434637188911438,
- 0.4323567748069763,
- 0.4406988322734833,
- 0.442973256111145,
- 0.4462621212005615,
- 0.45129328966140747,
- 0.41893237829208374,
- 0.42390328645706177,
- 0.3906112015247345,
- ],
- [
- 0.9986965656280518,
- 0.9948190450668335,
- 0.9841029644012451,
- 0.9911775588989258,
- 0.9871039390563965,
- 0.9874314069747925,
- 0.9822297096252441,
- 0.9997426271438599,
- 1.0,
- ],
- ])
+ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+ [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0],
+ [
+ 0.434637188911438,
+ 0.4323567748069763,
+ 0.4406988322734833,
+ 0.442973256111145,
+ 0.4462621212005615,
+ 0.45129328966140747,
+ 0.41893237829208374,
+ 0.42390328645706177,
+ 0.3906112015247345,
+ ],
+ [
+ 0.9986965656280518,
+ 0.9948190450668335,
+ 0.9841029644012451,
+ 0.9911775588989258,
+ 0.9871039390563965,
+ 0.9874314069747925,
+ 0.9822297096252441,
+ 0.9997426271438599,
+ 1.0,
+ ],
+ ]
+ )
for word, image, expected_slice in zip(words, images, expected_slices):
# expected_image = load_numpy(
@@ -152,37 +155,34 @@ def test_dit_256(self):
# )
assert image.shape == (256, 256, 3)
image_slice = image[-3:, -3:, -1]
- assert np.abs((image_slice.flatten() - expected_slice).max(
- )) < 0.001
+ assert np.abs((image_slice.flatten() - expected_slice).max()) < 0.001
def test_dit_512_fp16(self):
- pipe = DiTPipeline.from_pretrained(
- "facebook/DiT-XL-2-512", paddle_dtype=paddle.float16)
- pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- pipe.scheduler.config)
+ pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", paddle_dtype=paddle.float16)
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("gpu")
words = ["vase", "umbrella"]
ids = pipe.get_label_ids(words)
generator = paddle.Generator().manual_seed(0)
- images = pipe(
- ids, generator=generator, num_inference_steps=25,
- output_type="np").images
+ images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images
- expected_slices = np.array([
- [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625],
+ expected_slices = np.array(
[
- 0.0,
- 0.0,
- 0.01708984375,
- 0.024658203125,
- 0.0830078125,
- 0.134521484375,
- 0.175537109375,
- 0.33740234375,
- 0.207763671875,
- ],
- ])
+ [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625],
+ [
+ 0.0,
+ 0.0,
+ 0.01708984375,
+ 0.024658203125,
+ 0.0830078125,
+ 0.134521484375,
+ 0.175537109375,
+ 0.33740234375,
+ 0.207763671875,
+ ],
+ ]
+ )
for word, image, expected_slice in zip(words, images, expected_slices):
# expected_image = load_numpy(
diff --git a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py
index aff5775323867..da80059ddfdc4 100644
--- a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py
+++ b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py
@@ -33,7 +33,8 @@ def dummy_uncond_unet(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
def test_inference(self):
@@ -42,22 +43,20 @@ def test_inference(self):
pipe = KarrasVePipeline(unet=unet, scheduler=scheduler)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = pipe(
- num_inference_steps=2, generator=generator,
- output_type="numpy").images
+ image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = pipe(
num_inference_steps=2,
generator=generator,
output_type="numpy",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
@slow
@@ -70,20 +69,20 @@ def test_inference(self):
pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = pipe(
- num_inference_steps=20, generator=generator,
- output_type="numpy").images
+ image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 256, 256, 3)
- expected_slice = np.array([
- 0.7528239,
- 0.7529462,
- 0.76014197,
- 0.75482357,
- 0.75692874,
- 0.7577723,
- 0.760527,
- 0.758951,
- 0.7599246,
- ])
+ expected_slice = np.array(
+ [
+ 0.7528239,
+ 0.7529462,
+ 0.76014197,
+ 0.75482357,
+ 0.75692874,
+ 0.7577723,
+ 0.760527,
+ 0.758951,
+ 0.7599246,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py
index 93583e8814480..3bdb01281a103 100644
--- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py
+++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -20,10 +20,18 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline,
- UNet2DConditionModel)
-from ppdiffusers.utils.testing_utils import (load_numpy, nightly,
- require_paddle_gpu, slow)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LDMTextToImagePipeline,
+ UNet2DConditionModel,
+)
+from ppdiffusers.utils.testing_utils import (
+ load_numpy,
+ nightly,
+ require_paddle_gpu,
+ slow,
+)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -55,13 +63,15 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=(32, 64),
@@ -69,7 +79,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -80,10 +91,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -113,17 +124,19 @@ def test_inference_text2img(self):
image = pipe(**inputs).images
assert image.shape == (1, 64, 64, 3)
image_slice = image[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.28524342,
- 0.23806289,
- 0.38151595,
- 0.21939021,
- 0.26112252,
- 0.5172909,
- 0.25647423,
- 0.25049314,
- 0.47979864,
- ])
+ expected_slice = np.array(
+ [
+ 0.28524342,
+ 0.23806289,
+ 0.38151595,
+ 0.21939021,
+ 0.26112252,
+ 0.5172909,
+ 0.25647423,
+ 0.25049314,
+ 0.47979864,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
@@ -150,24 +163,25 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_ldm_default_ddim(self):
- pipe = LDMTextToImagePipeline.from_pretrained(
- "CompVis/ldm-text2im-large-256")
+ pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
- expected_slice = np.array([
- 0.51825,
- 0.5285,
- 0.52543,
- 0.54258,
- 0.52304,
- 0.52569,
- 0.54363,
- 0.55276,
- 0.56878,
- ])
+ expected_slice = np.array(
+ [
+ 0.51825,
+ 0.5285,
+ 0.52543,
+ 0.54258,
+ 0.52304,
+ 0.52569,
+ 0.54363,
+ 0.55276,
+ 0.56878,
+ ]
+ )
max_diff = np.abs(expected_slice - image_slice).max()
assert max_diff < 0.02
@@ -195,8 +209,7 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_ldm_default_ddim(self):
- pipe = LDMTextToImagePipeline.from_pretrained(
- "CompVis/ldm-text2im-large-256")
+ pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images[0]
diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
index 32472986acf44..aea2e7538f903 100644
--- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
+++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -19,10 +19,8 @@
import numpy as np
import paddle
-from ppdiffusers import (DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel,
- VQModel)
-from ppdiffusers.utils import (PIL_INTERPOLATION, floats_tensor, load_image,
- slow)
+from ppdiffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
+from ppdiffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle
@@ -32,8 +30,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = 32, 32
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
@property
@@ -46,7 +43,8 @@ def dummy_uncond_unet(self):
in_channels=6,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
@property
@@ -58,15 +56,15 @@ def dummy_vq_model(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=3, )
+ latent_channels=3,
+ )
return model
def test_inference_superresolution(self):
unet = self.dummy_uncond_unet
scheduler = DDIMScheduler()
vqvae = self.dummy_vq_model
- ldm = LDMSuperResolutionPipeline(
- unet=unet, vqvae=vqvae, scheduler=scheduler)
+ ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
ldm.set_progress_bar_config(disable=None)
init_image = self.dummy_image
generator = paddle.Generator().manual_seed(0)
@@ -74,20 +72,23 @@ def test_inference_superresolution(self):
image=init_image,
generator=generator,
num_inference_steps=2,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.12982202,
- 0.8338444,
- 0.46506804,
- 0.5459576,
- 0.6662215,
- 0.38444045,
- 0.72195464,
- 0.5719301,
- 0.36579454,
- ])
+ expected_slice = np.array(
+ [
+ 0.12982202,
+ 0.8338444,
+ 0.46506804,
+ 0.5459576,
+ 0.6662215,
+ 0.38444045,
+ 0.72195464,
+ 0.5719301,
+ 0.36579454,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_inference_superresolution_fp16(self):
@@ -96,12 +97,10 @@ def test_inference_superresolution_fp16(self):
vqvae = self.dummy_vq_model
unet = unet.to(dtype=paddle.float16)
vqvae = vqvae.to(dtype=paddle.float16)
- ldm = LDMSuperResolutionPipeline(
- unet=unet, vqvae=vqvae, scheduler=scheduler)
+ ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
ldm.set_progress_bar_config(disable=None)
init_image = self.dummy_image
- image = ldm(init_image, num_inference_steps=2,
- output_type="numpy").images
+ image = ldm(init_image, num_inference_steps=2, output_type="numpy").images
assert image.shape == (1, 64, 64, 3)
@@ -112,21 +111,17 @@ def test_inference_superresolution(self):
init_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool.png"
)
- init_image = init_image.resize(
- (64, 64), resample=PIL_INTERPOLATION["lanczos"])
- ldm = LDMSuperResolutionPipeline.from_pretrained(
- "duongna/ldm-super-resolution")
+ init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])
+ ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution")
ldm.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
image = ldm(
image=init_image,
generator=generator,
num_inference_steps=20,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 256, 256, 3)
- expected_slice = np.array([
- 0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257,
- 0.6907
- ])
+ expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257, 0.6907])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
index 5ad34d0481b67..89319ee92bcb2 100644
--- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
+++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
@@ -34,7 +34,8 @@ def dummy_uncond_unet(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
@property
@@ -46,7 +47,8 @@ def dummy_vq_model(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=3, )
+ latent_channels=3,
+ )
return model
@property
@@ -61,7 +63,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModel(config).eval()
def test_inference_uncond(self):
@@ -71,33 +74,33 @@ def test_inference_uncond(self):
ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler)
ldm.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = ldm(generator=generator,
- num_inference_steps=2,
- output_type="numpy").images
+ image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = ldm(
generator=generator,
num_inference_steps=2,
output_type="numpy",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.827049,
- 1.0,
- 0.6244688,
- 0.7729403,
- 1.0,
- 0.73071766,
- 0.6108738,
- 0.9107263,
- 0.7249622,
- ])
+ expected_slice = np.array(
+ [
+ 0.827049,
+ 1.0,
+ 0.6244688,
+ 0.7729403,
+ 1.0,
+ 0.73071766,
+ 0.6108738,
+ 0.9107263,
+ 0.7249622,
+ ]
+ )
tolerance = 0.01
assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
- assert (np.abs(image_from_tuple_slice.flatten() - expected_slice).max()
- < tolerance)
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
@slow
@@ -107,21 +110,21 @@ def test_inference_uncond(self):
ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
ldm.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = ldm(generator=generator,
- num_inference_steps=5,
- output_type="numpy").images
+ image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 256, 256, 3)
- expected_slice = np.array([
- 0.59802866,
- 0.61698544,
- 0.62753576,
- 0.6128236,
- 0.60961217,
- 0.617262,
- 0.6060791,
- 0.60261935,
- 0.6129079,
- ])
+ expected_slice = np.array(
+ [
+ 0.59802866,
+ 0.61698544,
+ 0.62753576,
+ 0.6128236,
+ 0.60961217,
+ 0.617262,
+ 0.6060791,
+ 0.60261935,
+ 0.6129079,
+ ]
+ )
tolerance = 0.01
assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
diff --git a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py
index 35c1718941567..00025bde5002d 100644
--- a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -22,14 +22,20 @@
from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig
from PIL import Image
-from ppdiffusers import (AutoencoderKL, PaintByExamplePipeline, PNDMScheduler,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ PaintByExamplePipeline,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
from ppdiffusers.utils import floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
- IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS)
+from ..pipeline_params import (
+ IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
@@ -48,7 +54,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -57,7 +64,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
config = CLIPVisionConfig(
hidden_size=32,
@@ -67,7 +75,8 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
image_size=32,
- patch_size=4, )
+ patch_size=4,
+ )
image_encoder = PaintByExampleImageEncoder(config, proj_size=32)
feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
components = {
@@ -93,13 +102,9 @@ def test_save_load_float16(self):
def get_dummy_inputs(self, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
image = image.cpu().transpose(perm=[0, 2, 3, 1])[0]
- init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (64, 64))
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (64, 64)))
- example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (32, 32))
+ init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
+ example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
generator = paddle.Generator().manual_seed(seed)
inputs = {
@@ -122,17 +127,19 @@ def test_paint_by_example_inpaint(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.82595694,
- 0.51862055,
- 0.5474039,
- 0.2411496,
- 0.20220888,
- 0.3430622,
- 0.3558151,
- 0.06606945,
- 0.4550809,
- ])
+ expected_slice = np.array(
+ [
+ 0.82595694,
+ 0.51862055,
+ 0.5474039,
+ 0.2411496,
+ 0.20220888,
+ 0.3430622,
+ 0.3558151,
+ 0.06606945,
+ 0.4550809,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_paint_by_example_image_tensor(self):
@@ -172,8 +179,7 @@ def test_paint_by_example(self):
example_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/panda.jpg"
)
- pipe = PaintByExamplePipeline.from_pretrained(
- "Fantasy-Studio/Paint-by-Example")
+ pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(seed=321)
output = pipe(
@@ -183,12 +189,10 @@ def test_paint_by_example(self):
generator=generator,
guidance_scale=5.0,
num_inference_steps=50,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529,
- 0.5374
- ])
+ expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529, 0.5374])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02
diff --git a/ppdiffusers/tests/pipelines/pipeline_params.py b/ppdiffusers/tests/pipelines/pipeline_params.py
index 33b041a173248..9f835e6e783cc 100644
--- a/ppdiffusers/tests/pipelines/pipeline_params.py
+++ b/ppdiffusers/tests/pipelines/pipeline_params.py
@@ -22,80 +22,89 @@
# I.e. a text to image pipeline with non-configurable height and width arguments
# should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`.
-TEXT_TO_IMAGE_PARAMS = frozenset([
- "prompt",
- "height",
- "width",
- "guidance_scale",
- "negative_prompt",
- "prompt_embeds",
- "negative_prompt_embeds",
- "cross_attention_kwargs",
-])
+TEXT_TO_IMAGE_PARAMS = frozenset(
+ [
+ "prompt",
+ "height",
+ "width",
+ "guidance_scale",
+ "negative_prompt",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ "cross_attention_kwargs",
+ ]
+)
TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
-IMAGE_VARIATION_PARAMS = frozenset([
- "image",
- "height",
- "width",
- "guidance_scale",
-])
+IMAGE_VARIATION_PARAMS = frozenset(
+ [
+ "image",
+ "height",
+ "width",
+ "guidance_scale",
+ ]
+)
IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"])
-TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset([
- "prompt",
- "image",
- "height",
- "width",
- "guidance_scale",
- "negative_prompt",
- "prompt_embeds",
- "negative_prompt_embeds",
-])
-
-TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(
- ["prompt", "image", "negative_prompt"])
-
-TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset([
- # Text guided image variation with an image mask
- "prompt",
- "image",
- "mask_image",
- "height",
- "width",
- "guidance_scale",
- "negative_prompt",
- "prompt_embeds",
- "negative_prompt_embeds",
-])
-
-TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(
- ["prompt", "image", "mask_image", "negative_prompt"])
-
-IMAGE_INPAINTING_PARAMS = frozenset([
- # image variation with an image mask
- "image",
- "mask_image",
- "height",
- "width",
- "guidance_scale",
-])
+TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset(
+ [
+ "prompt",
+ "image",
+ "height",
+ "width",
+ "guidance_scale",
+ "negative_prompt",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ ]
+)
+
+TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"])
+
+TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
+ [
+ # Text guided image variation with an image mask
+ "prompt",
+ "image",
+ "mask_image",
+ "height",
+ "width",
+ "guidance_scale",
+ "negative_prompt",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ ]
+)
+
+TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"])
+
+IMAGE_INPAINTING_PARAMS = frozenset(
+ [
+ # image variation with an image mask
+ "image",
+ "mask_image",
+ "height",
+ "width",
+ "guidance_scale",
+ ]
+)
IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"])
-IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset([
- "example_image",
- "image",
- "mask_image",
- "height",
- "width",
- "guidance_scale",
-])
+IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
+ [
+ "example_image",
+ "image",
+ "mask_image",
+ "height",
+ "width",
+ "guidance_scale",
+ ]
+)
-IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(
- ["example_image", "image", "mask_image"])
+IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"])
CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"])
@@ -109,15 +118,17 @@
UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([])
-TEXT_TO_AUDIO_PARAMS = frozenset([
- "prompt",
- "audio_length_in_s",
- "guidance_scale",
- "negative_prompt",
- "prompt_embeds",
- "negative_prompt_embeds",
- "cross_attention_kwargs",
-])
+TEXT_TO_AUDIO_PARAMS = frozenset(
+ [
+ "prompt",
+ "audio_length_in_s",
+ "guidance_scale",
+ "negative_prompt",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ "cross_attention_kwargs",
+ ]
+)
TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"])
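
The reformatted frozensets above are consumed by the pipeline test classes through set subtraction, as the "I.e. ..." comment in pipeline_params.py describes: a pipeline whose height and width are not configurable drops those keys from its params attribute. A self-contained sketch under that assumption; TEXT_TO_IMAGE_PARAMS is copied verbatim from the file, while the test class is hypothetical.

TEXT_TO_IMAGE_PARAMS = frozenset(
    [
        "prompt",
        "height",
        "width",
        "guidance_scale",
        "negative_prompt",
        "prompt_embeds",
        "negative_prompt_embeds",
        "cross_attention_kwargs",
    ]
)

class FixedSizePipelineFastTests:
    # A pipeline with non-configurable height/width subtracts those keys,
    # exactly as the comment in pipeline_params.py prescribes.
    params = TEXT_TO_IMAGE_PARAMS - {"height", "width"}

assert "height" not in FixedSizePipelineFastTests.params
assert "prompt" in FixedSizePipelineFastTests.params
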
diff --git a/ppdiffusers/tests/pipelines/pndm/test_pndm.py b/ppdiffusers/tests/pipelines/pndm/test_pndm.py
index 2255f43742f71..bfa6285a45d5f 100644
--- a/ppdiffusers/tests/pipelines/pndm/test_pndm.py
+++ b/ppdiffusers/tests/pipelines/pndm/test_pndm.py
@@ -33,7 +33,8 @@ def dummy_uncond_unet(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
def test_inference(self):
@@ -42,22 +43,20 @@ def test_inference(self):
pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
pndm.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = pndm(
- generator=generator, num_inference_steps=20,
- output_type="numpy").images
+ image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = pndm(
generator=generator,
num_inference_steps=20,
output_type="numpy",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
@slow
@@ -73,15 +72,17 @@ def test_inference_cifar10(self):
image = pndm(generator=generator, output_type="numpy").images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.15949559211730957,
- 0.17172572016716003,
- 0.17315810918807983,
- 0.1836635172367096,
- 0.1823960244655609,
- 0.1799020767211914,
- 0.21776044368743896,
- 0.22992581129074097,
- 0.21678516268730164,
- ])
+ expected_slice = np.array(
+ [
+ 0.15949559211730957,
+ 0.17172572016716003,
+ 0.17315810918807983,
+ 0.1836635172367096,
+ 0.1823960244655609,
+ 0.1799020767211914,
+ 0.21776044368743896,
+ 0.22992581129074097,
+ 0.21678516268730164,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/repaint/test_repaint.py b/ppdiffusers/tests/pipelines/repaint/test_repaint.py
index 3bce3769af1be..9d27e3b1c5061 100644
--- a/ppdiffusers/tests/pipelines/repaint/test_repaint.py
+++ b/ppdiffusers/tests/pipelines/repaint/test_repaint.py
@@ -20,11 +20,14 @@
import paddle
from ppdiffusers import RePaintPipeline, RePaintScheduler, UNet2DModel
-from ppdiffusers.utils.testing_utils import (load_image, load_numpy, nightly,
- require_paddle_gpu)
+from ppdiffusers.utils.testing_utils import (
+ load_image,
+ load_numpy,
+ nightly,
+ require_paddle_gpu,
+)
-from ..pipeline_params import (IMAGE_INPAINTING_BATCH_PARAMS,
- IMAGE_INPAINTING_PARAMS)
+from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -49,7 +52,8 @@ def get_dummy_components(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
scheduler = RePaintScheduler()
components = {"unet": unet, "scheduler": scheduler}
return components
@@ -80,17 +84,19 @@ def test_repaint(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.08341709,
- 0.54262626,
- 0.549711,
- 0.00903523,
- 0.0,
- 1.0,
- 0.05136755,
- 0.5604646,
- 0.6273578,
- ])
+ expected_slice = np.array(
+ [
+ 0.08341709,
+ 0.54262626,
+ 0.549711,
+ 0.00903523,
+ 0.0,
+ 1.0,
+ 0.05136755,
+ 0.5604646,
+ 0.6273578,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
# RePaint can hardly be made deterministic since the scheduler is currently always
@@ -133,7 +139,8 @@ def test_celebahq(self):
jump_length=10,
jump_n_sample=10,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
assert image.shape == (256, 256, 3)
assert np.abs(expected_image - image).mean() < 0.01
diff --git a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py
index f3b799000aa41..97af9d23e974c 100644
--- a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py
+++ b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py
@@ -33,7 +33,8 @@ def dummy_uncond_unet(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
def test_inference(self):
@@ -42,22 +43,20 @@ def test_inference(self):
sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler)
sde_ve.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = sde_ve(
- num_inference_steps=2, output_type="numpy",
- generator=generator).images
+ image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sde_ve(
num_inference_steps=2,
output_type="numpy",
generator=generator,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
@slow
@@ -70,9 +69,7 @@ def test_inference(self):
sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
sde_ve.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = sde_ve(
- num_inference_steps=10, output_type="numpy",
- generator=generator).images
+ image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0])
diff --git a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
index 6188cab488e6a..cf7e0a7ba17a7 100644
--- a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
+++ b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
@@ -22,10 +22,16 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler, UNet2DConditionModel)
-from ppdiffusers.pipelines.semantic_stable_diffusion import \
- SemanticStableDiffusionPipeline as StableDiffusionPipeline
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
+from ppdiffusers.pipelines.semantic_stable_diffusion import (
+ SemanticStableDiffusionPipeline as StableDiffusionPipeline,
+)
from ppdiffusers.utils import floats_tensor, nightly
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -41,8 +47,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = 32, 32
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
@property
@@ -56,7 +61,8 @@ def dummy_cond_unet(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
return model
@property
@@ -68,7 +74,8 @@ def dummy_vae(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
return model
@property
@@ -83,7 +90,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModel(config).eval()
@property
@@ -108,11 +116,11 @@ def test_semantic_diffusion_ddim(self):
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -120,7 +128,8 @@ def test_semantic_diffusion_ddim(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -129,7 +138,8 @@ def test_semantic_diffusion_ddim(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -138,29 +148,31 @@ def test_semantic_diffusion_ddim(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.28401083,
- 0.23724163,
- 0.38141036,
- 0.2201719,
- 0.26111937,
- 0.5176592,
- 0.25668317,
- 0.25036532,
- 0.47986418,
- ])
+ expected_slice = np.array(
+ [
+ 0.28401083,
+ 0.23724163,
+ 0.38141036,
+ 0.2201719,
+ 0.26111937,
+ 0.5176592,
+ 0.25668317,
+ 0.25036532,
+ 0.47986418,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_semantic_diffusion_no_safety_checker(self):
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-lms-pipe",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+ )
assert isinstance(pipe, StableDiffusionPipeline)
assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
assert pipe.safety_checker is None
@@ -168,8 +180,7 @@ def test_semantic_diffusion_no_safety_checker(self):
assert image is not None
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe = StableDiffusionPipeline.from_pretrained(
- tmpdirname, from_diffusers=False)
+ pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False)
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
@@ -179,8 +190,7 @@ def test_semantic_diffusion_pndm(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -188,7 +198,8 @@ def test_semantic_diffusion_pndm(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -197,7 +208,8 @@ def test_semantic_diffusion_pndm(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -206,24 +218,26 @@ def test_semantic_diffusion_pndm(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.18612236,
- 0.24176982,
- 0.36099488,
- 0.21807766,
- 0.27262795,
- 0.51991826,
- 0.22258872,
- 0.22143877,
- 0.4452843,
- ])
+ expected_slice = np.array(
+ [
+ 0.18612236,
+ 0.24176982,
+ 0.36099488,
+ 0.21807766,
+ 0.27262795,
+ 0.51991826,
+ 0.22258872,
+ 0.22143877,
+ 0.4452843,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.02
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.02
def test_semantic_diffusion_fp16(self):
"""Test that stable diffusion works with fp16"""
@@ -231,8 +245,7 @@ def test_semantic_diffusion_fp16(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
unet = unet.to(dtype=paddle.float16)
vae = vae.to(dtype=paddle.float16)
bert = bert.to(dtype=paddle.float16)
@@ -243,11 +256,11 @@ def test_semantic_diffusion_fp16(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
- image = sd_pipe(
- [prompt], num_inference_steps=2, output_type="np").images
+ image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
assert image.shape == (1, 64, 64, 3)
@@ -260,8 +273,7 @@ def tearDown(self):
# paddle.device.cuda.empty_cache()
def test_positive_guidance(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.set_progress_bar_config(disable=None)
prompt = "a photo of a cat"
edit = {
@@ -283,7 +295,8 @@ def test_positive_guidance(self):
num_inference_steps=50,
output_type="np",
width=512,
- height=512, )
+ height=512,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -308,7 +321,8 @@ def test_positive_guidance(self):
output_type="np",
width=512,
height=512,
- **edit, )
+ **edit,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -326,8 +340,7 @@ def test_positive_guidance(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_negative_guidance(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.set_progress_bar_config(disable=None)
prompt = "an image of a crowded boulevard, realistic, 4k"
edit = {
@@ -349,7 +362,8 @@ def test_negative_guidance(self):
num_inference_steps=50,
output_type="np",
width=512,
- height=512, )
+ height=512,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -374,7 +388,8 @@ def test_negative_guidance(self):
output_type="np",
width=512,
height=512,
- **edit, )
+ **edit,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -392,13 +407,11 @@ def test_negative_guidance(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_multi_cond_guidance(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.set_progress_bar_config(disable=None)
prompt = "a castle next to a river"
edit = {
- "editing_prompt":
- ["boat on a river, boat", "monet, impression, sunrise"],
+ "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
"reverse_editing_direction": False,
"edit_warmup_steps": [15, 18],
"edit_guidance_scale": 6,
@@ -416,7 +429,8 @@ def test_multi_cond_guidance(self):
num_inference_steps=50,
output_type="np",
width=512,
- height=512, )
+ height=512,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -441,7 +455,8 @@ def test_multi_cond_guidance(self):
output_type="np",
width=512,
height=512,
- **edit, )
+ **edit,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -459,8 +474,7 @@ def test_multi_cond_guidance(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02
def test_guidance_fp16(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16)
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
prompt = "a photo of a cat"
edit = {
@@ -482,7 +496,8 @@ def test_guidance_fp16(self):
num_inference_steps=50,
output_type="np",
width=512,
- height=512, )
+ height=512,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -507,7 +522,8 @@ def test_guidance_fp16(self):
output_type="np",
width=512,
height=512,
- **edit, )
+ **edit,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
diff --git a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
index 9355b00dcdff0..465b997e0c007 100644
--- a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
+++ b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
@@ -18,15 +18,19 @@
import numpy as np
import paddle
-from ppdiffusers import (DDPMScheduler, MidiProcessor,
- SpectrogramDiffusionPipeline)
+from ppdiffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline
from ppdiffusers.pipelines.spectrogram_diffusion import (
- SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder)
+ SpectrogramContEncoder,
+ SpectrogramNotesEncoder,
+ T5FilmDecoder,
+)
from ppdiffusers.training_utils import enable_full_determinism
from ppdiffusers.utils import require_paddle_gpu, slow
-from ..pipeline_params import (TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS,
- TOKENS_TO_AUDIO_GENERATION_PARAMS)
+from ..pipeline_params import (
+ TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS,
+ TOKENS_TO_AUDIO_GENERATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism(42)
@@ -38,8 +42,7 @@
# is not compatible with python 3.8 which we run in the CI.
# https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:98
# @unittest.skip("The note-seq package currently throws an error on import")
-class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = SpectrogramDiffusionPipeline
required_optional_params = PipelineTesterMixin.required_optional_params - {
"callback",
@@ -65,7 +68,8 @@ def get_dummy_components(self):
num_heads=1,
d_kv=4,
d_ff=2048,
- feed_forward_proj="gated-gelu", )
+ feed_forward_proj="gated-gelu",
+ )
notes_encoder.eval()
paddle.seed(0)
continuous_encoder = SpectrogramContEncoder(
@@ -77,7 +81,8 @@ def get_dummy_components(self):
num_heads=1,
d_kv=4,
d_ff=2048,
- feed_forward_proj="gated-gelu", )
+ feed_forward_proj="gated-gelu",
+ )
continuous_encoder.eval()
paddle.seed(0)
@@ -90,7 +95,8 @@ def get_dummy_components(self):
num_heads=1,
d_kv=4,
d_ff=2048,
- dropout_rate=0.1, )
+ dropout_rate=0.1,
+ )
decoder.eval()
scheduler = DDPMScheduler()
@@ -108,23 +114,26 @@ def get_dummy_inputs(self, seed=0):
generator = paddle.Generator().manual_seed(seed)
inputs = {
- "input_tokens": [[
- 1134,
- 90,
- 1135,
- 1133,
- 1080,
- 112,
- 1132,
- 1080,
- 1133,
- 1079,
- 133,
- 1132,
- 1079,
- 1133,
- 1,
- ] + [0] * 2033],
+ "input_tokens": [
+ [
+ 1134,
+ 90,
+ 1135,
+ 1133,
+ 1080,
+ 112,
+ 1132,
+ 1080,
+ 1133,
+ 1079,
+ 133,
+ 1132,
+ 1079,
+ 1133,
+ 1,
+ ]
+ + [0] * 2033
+ ],
"generator": generator,
"num_inference_steps": 4,
"output_type": "mel",
@@ -144,17 +153,19 @@ def test_spectrogram_diffusion(self):
mel_slice = mel[0, -3:, -3:]
assert mel_slice.shape == (3, 3)
- expected_slice = np.array([
- -11.46511,
- 4.0,
- -8.506372,
- -11.512925,
- -11.512925,
- -10.417862,
- -8.077912,
- 3.7985802,
- 4.0,
- ])
+ expected_slice = np.array(
+ [
+ -11.46511,
+ 4.0,
+ -8.506372,
+ -11.512925,
+ -11.512925,
+ -10.417862,
+ -8.077912,
+ 3.7985802,
+ 4.0,
+ ]
+ )
assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2
def test_save_load_local(self):
@@ -191,8 +202,7 @@ def tearDown(self):
def test_callback(self):
# TODO - test that pipeline can decode tokens in a callback
# so that music can be played live
- pipe = SpectrogramDiffusionPipeline.from_pretrained(
- "google/music-spectrogram-diffusion")
+ pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
melgan = pipe.melgan
pipe.melgan = None
@@ -215,12 +225,12 @@ def callback(step, mel_output):
num_inference_steps=5,
generator=generator,
callback=callback,
- output_type="mel", )
+ output_type="mel",
+ )
def test_spectrogram_fast(self):
- pipe = SpectrogramDiffusionPipeline.from_pretrained(
- "google/music-spectrogram-diffusion")
+ pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
pipe.set_progress_bar_config(disable=None)
processor = MidiProcessor()
@@ -237,8 +247,7 @@ def test_spectrogram_fast(self):
def test_spectrogram(self):
- pipe = SpectrogramDiffusionPipeline.from_pretrained(
- "google/music-spectrogram-diffusion")
+ pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
pipe.set_progress_bar_config(disable=None)
processor = MidiProcessor()
@@ -249,8 +258,7 @@ def test_spectrogram(self):
input_tokens = input_tokens[:4]
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- input_tokens, num_inference_steps=100, generator=generator)
+ output = pipe(input_tokens, num_inference_steps=100, generator=generator)
audio = output.audios[0]
assert abs(np.abs(audio).sum() - 14418.089) < 5e-2
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index f5beae09ac46f..50c27ff574be4 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -21,13 +21,19 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ CycleDiffusionPipeline,
+ DDIMScheduler,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
@@ -39,11 +45,8 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
"width",
"negative_prompt_embeds",
}
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
- batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union(
- {"source_prompt"})
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+ batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"})
def get_dummy_components(self):
paddle.seed(0)
@@ -55,14 +58,16 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
num_train_timesteps=1000,
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -70,7 +75,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -81,10 +87,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -123,17 +129,19 @@ def test_stable_diffusion_cycle(self):
images = output.images
image_slice = images[0, -3:, -3:, -1]
assert images.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.04812625,
- 0.77983606,
- 0.71009433,
- 0.15924984,
- 0.9788434,
- 0.49732354,
- 0.362224,
- 0.6481595,
- 0.4530744,
- ])
+ expected_slice = np.array(
+ [
+ 0.04812625,
+ 0.77983606,
+ 0.71009433,
+ 0.15924984,
+ 0.9788434,
+ 0.49732354,
+ 0.362224,
+ 0.6481595,
+ 0.4530744,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_cycle_fp16(self):
@@ -148,17 +156,19 @@ def test_stable_diffusion_cycle_fp16(self):
images = output.images
image_slice = images[0, -3:, -3:, -1]
assert images.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.05053711,
- 0.78125,
- 0.7114258,
- 0.15991211,
- 0.9785156,
- 0.49804688,
- 0.36279297,
- 0.6484375,
- 0.45361328,
- ])
+ expected_slice = np.array(
+ [
+ 0.05053711,
+ 0.78125,
+ 0.7114258,
+ 0.15991211,
+ 0.9785156,
+ 0.49804688,
+ 0.36279297,
+ 0.6484375,
+ 0.45361328,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
@unittest.skip("non-deterministic pipeline")
@@ -178,18 +188,17 @@ def test_cycle_diffusion_pipeline_fp16(self):
init_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png"
)
- expected_image = np.array([[0.14477539, 0.20483398, 0.14135742],
- [0.10009766, 0.17602539, 0.11083984]])
+ expected_image = np.array([[0.14477539, 0.20483398, 0.14135742], [0.10009766, 0.17602539, 0.11083984]])
init_image = init_image.resize((512, 512))
model_id = "CompVis/stable-diffusion-v1-4"
- scheduler = DDIMScheduler.from_pretrained(
- model_id, subfolder="scheduler")
+ scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(
model_id,
scheduler=scheduler,
safety_checker=None,
paddle_dtype=paddle.float16,
- revision="fp16", )
+ revision="fp16",
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
source_prompt = "A black colored car"
@@ -205,7 +214,8 @@ def test_cycle_diffusion_pipeline_fp16(self):
guidance_scale=3,
source_guidance_scale=1,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
assert np.abs(image[0][0][:2] - expected_image).max() < 0.5
@@ -213,14 +223,11 @@ def test_cycle_diffusion_pipeline(self):
init_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png"
)
- expected_image = np.array([[0.16294342, 0.20514232, 0.14554858],
- [0.11476257, 0.16831946, 0.11495486]])
+ expected_image = np.array([[0.16294342, 0.20514232, 0.14554858], [0.11476257, 0.16831946, 0.11495486]])
init_image = init_image.resize((512, 512))
model_id = "CompVis/stable-diffusion-v1-4"
- scheduler = DDIMScheduler.from_pretrained(
- model_id, subfolder="scheduler")
- pipe = CycleDiffusionPipeline.from_pretrained(
- model_id, scheduler=scheduler, safety_checker=None)
+ scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
+ pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
source_prompt = "A black colored car"
@@ -236,6 +243,7 @@ def test_cycle_diffusion_pipeline(self):
guidance_scale=3,
source_guidance_scale=1,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
assert np.abs(image[0][0][:2] - expected_image).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 184bd9f7b4927..042ad47fa00eb 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -22,10 +22,17 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from ppdiffusers import (
- AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline,
- UNet2DConditionModel, logging)
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+ logging,
+)
from ppdiffusers.utils import load_numpy, nightly, slow
from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu
@@ -49,13 +56,15 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -63,7 +72,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -74,10 +84,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -110,17 +120,19 @@ def test_stable_diffusion_ddim(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.28519553,
- 0.23807192,
- 0.38150552,
- 0.21930423,
- 0.26092762,
- 0.51721215,
- 0.25639117,
- 0.25039536,
- 0.47978917,
- ])
+ expected_slice = np.array(
+ [
+ 0.28519553,
+ 0.23807192,
+ 0.38150552,
+ 0.21930423,
+ 0.26092762,
+ 0.51721215,
+ 0.25639117,
+ 0.25039536,
+ 0.47978917,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_lora(self):
@@ -159,14 +171,14 @@ def test_stable_diffusion_prompt_embeds(self):
padding="max_length",
max_length=sd_pipe.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_inputs = text_inputs["input_ids"]
prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]
inputs["prompt_embeds"] = prompt_embeds
output = sd_pipe(**inputs)
image_slice_2 = output.images[0, -3:, -3:, -1]
- assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max(
- ) < 0.0001
+ assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001
def test_stable_diffusion_negative_prompt_embeds(self):
components = self.get_dummy_components()
@@ -187,14 +199,14 @@ def test_stable_diffusion_negative_prompt_embeds(self):
padding="max_length",
max_length=sd_pipe.tokenizer.model_max_length,
truncation=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_inputs = text_inputs["input_ids"]
embeds.append(sd_pipe.text_encoder(text_inputs)[0])
inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
output = sd_pipe(**inputs)
image_slice_2 = output.images[0, -3:, -3:, -1]
- assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max(
- ) < 0.0001
+ assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001
def test_stable_diffusion_ddim_factor_8(self):
components = self.get_dummy_components()
@@ -205,17 +217,19 @@ def test_stable_diffusion_ddim_factor_8(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 136, 136, 3)
- expected_slice = np.array([
- 0.39545745,
- 0.94682777,
- 0.6828775,
- 0.42496994,
- 0.49475053,
- 0.48353004,
- 0.27300328,
- 0.30724254,
- 0.50566095,
- ])
+ expected_slice = np.array(
+ [
+ 0.39545745,
+ 0.94682777,
+ 0.6828775,
+ 0.42496994,
+ 0.49475053,
+ 0.48353004,
+ 0.27300328,
+ 0.30724254,
+ 0.50566095,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
@@ -229,23 +243,25 @@ def test_stable_diffusion_pndm(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.18620703,
- 0.24143961,
- 0.3609084,
- 0.21810293,
- 0.27230006,
- 0.51992655,
- 0.22248471,
- 0.2213102,
- 0.44538254,
- ])
+ expected_slice = np.array(
+ [
+ 0.18620703,
+ 0.24143961,
+ 0.3609084,
+ 0.21810293,
+ 0.27230006,
+ 0.51992655,
+ 0.22248471,
+ 0.2213102,
+ 0.44538254,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_no_safety_checker(self):
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-lms-pipe",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+ )
assert isinstance(pipe, StableDiffusionPipeline)
assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
assert pipe.safety_checker is None
@@ -253,8 +269,7 @@ def test_stable_diffusion_no_safety_checker(self):
assert image is not None
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe = StableDiffusionPipeline.from_pretrained(
- tmpdirname, from_diffusers=False)
+ pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False)
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
@@ -262,80 +277,82 @@ def test_stable_diffusion_no_safety_checker(self):
def test_stable_diffusion_k_lms(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionPipeline(**components)
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
output = sd_pipe(**inputs)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.29910105,
- 0.22905633,
- 0.37701294,
- 0.21332851,
- 0.26000416,
- 0.52840894,
- 0.25865072,
- 0.25947532,
- 0.47509664,
- ])
+ expected_slice = np.array(
+ [
+ 0.29910105,
+ 0.22905633,
+ 0.37701294,
+ 0.21332851,
+ 0.26000416,
+ 0.52840894,
+ 0.25865072,
+ 0.25947532,
+ 0.47509664,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_k_euler_ancestral(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionPipeline(**components)
- sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
output = sd_pipe(**inputs)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.29917336,
- 0.22854236,
- 0.37669897,
- 0.2137424,
- 0.25940597,
- 0.528258,
- 0.25919583,
- 0.2594489,
- 0.47522712,
- ])
+ expected_slice = np.array(
+ [
+ 0.29917336,
+ 0.22854236,
+ 0.37669897,
+ 0.2137424,
+ 0.25940597,
+ 0.528258,
+ 0.25919583,
+ 0.2594489,
+ 0.47522712,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_k_euler(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionPipeline(**components)
- sd_pipe.scheduler = EulerDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
output = sd_pipe(**inputs)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.29910135,
- 0.22905621,
- 0.3770129,
- 0.21332836,
- 0.26000386,
- 0.52840906,
- 0.2586509,
- 0.2594754,
- 0.47509673,
- ])
+ expected_slice = np.array(
+ [
+ 0.29910135,
+ 0.22905621,
+ 0.3770129,
+ 0.21332836,
+ 0.26000386,
+ 0.52840906,
+ 0.2586509,
+ 0.2594754,
+ 0.47509673,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_vae_slicing(self):
components = self.get_dummy_components()
- components["scheduler"] = LMSDiscreteScheduler.from_config(components[
- "scheduler"].config)
+ components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
image_count = 4
@@ -346,9 +363,7 @@ def test_stable_diffusion_vae_slicing(self):
inputs = self.get_dummy_inputs()
inputs["prompt"] = [inputs["prompt"]] * image_count
output_2 = sd_pipe(**inputs)
- assert (
- np.abs(output_2.images.flatten() - output_1.images.flatten()).max()
- < 0.003)
+ assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 0.003
def test_stable_diffusion_vae_tiling(self):
components = self.get_dummy_components()
@@ -367,7 +382,8 @@ def test_stable_diffusion_vae_tiling(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
# make sure tiled vae decode yields the same result
sd_pipe.enable_vae_tiling()
@@ -377,11 +393,10 @@ def test_stable_diffusion_vae_tiling(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
- assert (
- np.abs(output_2.images.flatten() - output_1.images.flatten()).max()
- < 5e-1)
+ assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1
def test_stable_diffusion_negative_prompt(self):
components = self.get_dummy_components()
@@ -394,17 +409,19 @@ def test_stable_diffusion_negative_prompt(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.16709289,
- 0.26912582,
- 0.35834038,
- 0.23045751,
- 0.30960953,
- 0.5324909,
- 0.20372942,
- 0.2368694,
- 0.43633103,
- ])
+ expected_slice = np.array(
+ [
+ 0.16709289,
+ 0.26912582,
+ 0.35834038,
+ 0.23045751,
+ 0.30960953,
+ 0.5324909,
+ 0.20372942,
+ 0.2368694,
+ 0.43633103,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_num_images_per_prompt(self):
@@ -416,59 +433,59 @@ def test_stable_diffusion_num_images_per_prompt(self):
images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
assert images.shape == (1, 64, 64, 3)
batch_size = 2
- images = sd_pipe(
- [prompt] * batch_size, num_inference_steps=2,
- output_type="np").images
+ images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images
assert images.shape == (batch_size, 64, 64, 3)
num_images_per_prompt = 2
images = sd_pipe(
prompt,
num_inference_steps=2,
output_type="np",
- num_images_per_prompt=num_images_per_prompt, ).images
+ num_images_per_prompt=num_images_per_prompt,
+ ).images
assert images.shape == (num_images_per_prompt, 64, 64, 3)
batch_size = 2
images = sd_pipe(
[prompt] * batch_size,
num_inference_steps=2,
output_type="np",
- num_images_per_prompt=num_images_per_prompt, ).images
+ num_images_per_prompt=num_images_per_prompt,
+ ).images
assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3)
def test_stable_diffusion_long_prompt(self):
components = self.get_dummy_components()
- components["scheduler"] = LMSDiscreteScheduler.from_config(components[
- "scheduler"].config)
+ components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
do_classifier_free_guidance = True
negative_prompt = None
num_images_per_prompt = 1
- logger = logging.get_logger(
- "ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+ logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
prompt = 25 * "@"
with CaptureLogger(logger) as cap_logger_3:
text_embeddings_3 = sd_pipe._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt, )
+ negative_prompt,
+ )
prompt = 100 * "@"
with CaptureLogger(logger) as cap_logger:
text_embeddings = sd_pipe._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt, )
+ negative_prompt,
+ )
negative_prompt = "Hello"
with CaptureLogger(logger) as cap_logger_2:
text_embeddings_2 = sd_pipe._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt, )
- assert (text_embeddings_3.shape == text_embeddings_2.shape ==
- text_embeddings.shape)
+ negative_prompt,
+ )
+ assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
assert text_embeddings.shape[1] == 77
assert cap_logger.out == cap_logger_2.out
assert cap_logger.out.count("@") == 25
@@ -476,20 +493,14 @@ def test_stable_diffusion_long_prompt(self):
def test_stable_diffusion_height_width_opt(self):
components = self.get_dummy_components()
- components["scheduler"] = LMSDiscreteScheduler.from_config(components[
- "scheduler"].config)
+ components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "hey"
output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
image_shape = output.images[0].shape[:2]
assert image_shape == (64, 64)
- output = sd_pipe(
- prompt,
- num_inference_steps=1,
- height=96,
- width=96,
- output_type="np")
+ output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
image_shape = output.images[0].shape[:2]
assert image_shape == (96, 96)
config = dict(sd_pipe.unet.config)
@@ -523,113 +534,116 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_stable_diffusion_1_1_pndm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-1")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.43625,
- 0.43554,
- 0.3667,
- 0.4066,
- 0.39703,
- 0.38658,
- 0.43936,
- 0.43557,
- 0.40592,
- ])
+ expected_slice = np.array(
+ [
+ 0.43625,
+ 0.43554,
+ 0.3667,
+ 0.4066,
+ 0.39703,
+ 0.38658,
+ 0.43936,
+ 0.43557,
+ 0.40592,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_1_4_pndm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.574,
- 0.47841,
- 0.31625,
- 0.63583,
- 0.58306,
- 0.55056,
- 0.50825,
- 0.56306,
- 0.55748,
- ])
+ expected_slice = np.array(
+ [
+ 0.574,
+ 0.47841,
+ 0.31625,
+ 0.63583,
+ 0.58306,
+ 0.55056,
+ 0.50825,
+ 0.56306,
+ 0.55748,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_ddim(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.38019,
- 0.28647,
- 0.27321,
- 0.40377,
- 0.3829,
- 0.35446,
- 0.39218,
- 0.38165,
- 0.42239,
- ])
+ expected_slice = np.array(
+ [
+ 0.38019,
+ 0.28647,
+ 0.27321,
+ 0.40377,
+ 0.3829,
+ 0.35446,
+ 0.39218,
+ 0.38165,
+ 0.42239,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_lms(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.10542,
- 0.0962,
- 0.07332,
- 0.09015,
- 0.09382,
- 0.07597,
- 0.08496,
- 0.07806,
- 0.06455,
- ])
+ expected_slice = np.array(
+ [
+ 0.10542,
+ 0.0962,
+ 0.07332,
+ 0.09015,
+ 0.09382,
+ 0.07597,
+ 0.08496,
+ 0.07806,
+ 0.06455,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_dpm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.03503,
- 0.03494,
- 0.01087,
- 0.03128,
- 0.02552,
- 0.00803,
- 0.00742,
- 0.00372,
- 0.0,
- ])
+ expected_slice = np.array(
+ [
+ 0.03503,
+ 0.03494,
+ 0.01087,
+ 0.03128,
+ 0.02552,
+ 0.00803,
+ 0.00742,
+ 0.00372,
+ 0.0,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
# def test_stable_diffusion_attention_slicing(self):
@@ -670,8 +684,7 @@ def test_stable_diffusion_dpm(self):
# assert np.abs(image_sliced - image).max() < 0.01
def test_stable_diffusion_fp16_vs_autocast(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16)
+ pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(dtype="float16")
image_fp16 = pipe(**inputs).images
@@ -684,8 +697,7 @@ def test_stable_diffusion_fp16_vs_autocast(self):
def test_stable_diffusion_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -693,40 +705,41 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.5693,
- -0.3018,
- -0.9746,
- 0.0518,
- -0.877,
- 0.7559,
- -1.7402,
- 0.1022,
- 1.1582,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ -0.5693,
+ -0.3018,
+ -0.9746,
+ 0.0518,
+ -0.877,
+ 0.7559,
+ -1.7402,
+ 0.1022,
+ 1.1582,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.1958,
- -0.2993,
- -1.0166,
- -0.5005,
- -0.481,
- 0.6162,
- -0.9492,
- 0.6621,
- 1.4492,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ -0.1958,
+ -0.2993,
+ -1.0166,
+ -0.5005,
+ -0.481,
+ 0.6162,
+ -0.9492,
+ 0.6621,
+ 1.4492,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
- pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16)
+ pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(dtype="float16")
@@ -758,8 +771,7 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_stable_diffusion_1_4_pndm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
@@ -770,22 +782,22 @@ def test_stable_diffusion_1_4_pndm(self):
assert max_diff < 0.001
def test_stable_diffusion_1_5_pndm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = np.array([
- [0.7839468, 0.6564859, 0.48896512],
- [0.78088367, 0.6400461, 0.447728],
- [0.81458974, 0.67865074, 0.51496047],
- ])
+ expected_image = np.array(
+ [
+ [0.7839468, 0.6564859, 0.48896512],
+ [0.78088367, 0.6400461, 0.447728],
+ [0.81458974, 0.67865074, 0.51496047],
+ ]
+ )
max_diff = np.abs(expected_image - image[0][0:3]).max()
assert max_diff < 0.001
def test_stable_diffusion_ddim(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
@@ -797,10 +809,8 @@ def test_stable_diffusion_ddim(self):
assert max_diff < 0.001
def test_stable_diffusion_lms(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
@@ -811,34 +821,34 @@ def test_stable_diffusion_lms(self):
assert max_diff < 0.001
def test_stable_diffusion_euler(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
- sd_pipe.scheduler = EulerDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+ sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = np.array([
- [0.7907467, 0.69895816, 0.5911293],
- [0.7878128, 0.6815276, 0.55695873],
- [0.79491043, 0.69076216, 0.58900857],
- ])
+ expected_image = np.array(
+ [
+ [0.7907467, 0.69895816, 0.5911293],
+ [0.7878128, 0.6815276, 0.55695873],
+ [0.79491043, 0.69076216, 0.58900857],
+ ]
+ )
max_diff = np.abs(expected_image - image[0][0:3]).max()
assert max_diff < 0.001
def test_stable_diffusion_dpm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
inputs["num_inference_steps"] = 25
image = sd_pipe(**inputs).images[0]
- expected_image = np.array([
- [0.8398815, 0.7510048, 0.6475117],
- [0.8548264, 0.75703114, 0.63529825],
- [0.8559129, 0.75676, 0.6597851],
- ])
+ expected_image = np.array(
+ [
+ [0.8398815, 0.7510048, 0.6475117],
+ [0.8548264, 0.75703114, 0.63529825],
+ [0.8559129, 0.75676, 0.6597851],
+ ]
+ )
max_diff = np.abs(expected_image - image[0][0:3]).max()
assert max_diff < 0.001
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
index 5c19060a6d83a..4a6a51ef4cefb 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
@@ -20,19 +20,24 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, PNDMScheduler,
- StableDiffusionAdapterPipeline, T2IAdapter,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ PNDMScheduler,
+ StableDiffusionAdapterPipeline,
+ T2IAdapter,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, load_numpy, slow
from ppdiffusers.utils.import_utils import is_ppxformers_available
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionAdapterPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
@@ -47,7 +52,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.Generator().manual_seed(seed=0)
vae = AutoencoderKL(
@@ -56,7 +62,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
vae_scale_factor = 2
paddle.Generator().manual_seed(seed=0)
text_encoder_config = CLIPTextConfig(
@@ -68,10 +75,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config)
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
paddle.Generator().manual_seed(seed=0)
adapter = T2IAdapter(
block_out_channels=[32, 64],
@@ -80,7 +87,8 @@ def get_dummy_components(self):
kernel_size=1,
res_block_skip=True,
use_conv=False,
- input_scale_factor=vae_scale_factor, )
+ input_scale_factor=vae_scale_factor,
+ )
components = {
"adapter": adapter,
"unet": unet,
@@ -114,30 +122,30 @@ def test_stable_diffusion_adapter_default_case(self):
image = sd_pipe(**inputs).images
image_slice = image[(0), -3:, -3:, (-1)]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.9088084,
- 0.6012194,
- 0.43046606,
- 0.7228667,
- 0.46428588,
- 0.30164504,
- 0.508494,
- 0.6241546,
- 0.55453974,
- ])
+ expected_slice = np.array(
+ [
+ 0.9088084,
+ 0.6012194,
+ 0.43046606,
+ 0.7228667,
+ 0.46428588,
+ 0.30164504,
+ 0.508494,
+ 0.6241546,
+ 0.55453974,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005
def test_attention_slicing_forward_pass(self):
- return self._test_attention_slicing_forward_pass(
- expected_max_diff=0.002)
+ return self._test_attention_slicing_forward_pass(expected_max_diff=0.002)
@unittest.skipIf(
not is_ppxformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=0.002)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=0.002)
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=0.002)
@@ -153,16 +161,12 @@ def tearDown(self):
def get_inputs(self, revision="segmentation", dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
image_urls = {
- "segmentation":
- "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png",
- "keypose":
- "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png",
- "depth":
- "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png",
+ "segmentation": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png",
+ "keypose": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png",
+ "depth": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png",
}
prompt_by_rev = {
- "segmentation":
- "A black Honda motorcycle parked in front of a garage",
+ "segmentation": "A black Honda motorcycle parked in front of a garage",
"keypose": "An astronaut on the moon",
"depth": "An office room with nice view",
}
@@ -180,9 +184,8 @@ def get_inputs(self, revision="segmentation", dtype="float32", seed=0):
def test_stable_diffusion_segmentation_adapter(self):
adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-seg")
pipe = StableDiffusionAdapterPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- adapter=adapter,
- safety_checker=None)
+ "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(revision="segmentation")
@@ -196,9 +199,8 @@ def test_stable_diffusion_segmentation_adapter(self):
def test_stable_diffusion_keypose_adapter(self):
adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-keypose")
pipe = StableDiffusionAdapterPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- adapter=adapter,
- safety_checker=None)
+ "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(revision="keypose")
@@ -212,9 +214,8 @@ def test_stable_diffusion_keypose_adapter(self):
def test_stable_diffusion_depth_adapter(self):
adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-depth")
pipe = StableDiffusionAdapterPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- adapter=adapter,
- safety_checker=None)
+ "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(revision="depth")
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
index ddebfd6234a13..8b85c7bd484db 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
@@ -19,9 +19,13 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler,
- StableDiffusionControlNetPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ StableDiffusionControlNetPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import load_image, load_numpy, randn_tensor, slow
from ppdiffusers.utils.import_utils import is_ppxformers_available
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -30,8 +34,7 @@
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionControlNetPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
@@ -46,7 +49,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
paddle.seed(0)
controlnet = ControlNetModel(
block_out_channels=(32, 64),
@@ -54,14 +58,16 @@ def get_dummy_components(self):
in_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
cross_attention_dim=32,
- conditioning_embedding_out_channels=(16, 32), )
+ conditioning_embedding_out_channels=(16, 32),
+ )
paddle.seed(0)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -69,7 +75,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -80,10 +87,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
@@ -106,8 +113,10 @@ def get_dummy_inputs(self, seed=0):
1,
3,
32 * controlnet_embedder_scale_factor,
- 32 * controlnet_embedder_scale_factor, ),
- generator=generator, )
+ 32 * controlnet_embedder_scale_factor,
+ ),
+ generator=generator,
+ )
inputs = {
"prompt": "A painting of a squirrel eating a burger",
@@ -128,8 +137,7 @@ def test_attention_slicing_forward_pass(self):
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- expected_max_diff=1e-2)
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-2)
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
@@ -144,13 +152,11 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_canny(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-canny")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -172,13 +178,11 @@ def test_canny(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_depth(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-depth")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -200,13 +204,11 @@ def test_depth(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_hed(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-hed")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -228,13 +230,11 @@ def test_hed(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_mlsd(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-mlsd")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -256,13 +256,11 @@ def test_mlsd(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_normal(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-normal")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -284,13 +282,11 @@ def test_normal(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_openpose(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-openpose")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -312,13 +308,11 @@ def test_openpose(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_scribble(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-scribble")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(5)
@@ -340,13 +334,11 @@ def test_scribble(self):
assert np.abs(expected_image - image).max() < 5e-3
def test_seg(self):
- controlnet = ControlNetModel.from_pretrained(
- "lllyasviel/sd-controlnet-seg")
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- safety_checker=None,
- controlnet=controlnet)
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(5)
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index 8739a78286b5f..a73cfcdbf1291 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -19,24 +19,28 @@
import numpy as np
import paddle
-from paddlenlp.transformers import (CLIPImageProcessor, CLIPVisionConfig,
- CLIPVisionModelWithProjection)
+from paddlenlp.transformers import (
+ CLIPImageProcessor,
+ CLIPVisionConfig,
+ CLIPVisionModelWithProjection,
+)
from PIL import Image
-from ppdiffusers import (AutoencoderKL, DPMSolverMultistepScheduler,
- PNDMScheduler, StableDiffusionImageVariationPipeline,
- UNet2DConditionModel)
-from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly,
- slow)
+from ppdiffusers import (
+ AutoencoderKL,
+ DPMSolverMultistepScheduler,
+ PNDMScheduler,
+ StableDiffusionImageVariationPipeline,
+ UNet2DConditionModel,
+)
+from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (IMAGE_VARIATION_BATCH_PARAMS,
- IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionImageVariationPipeline
params = IMAGE_VARIATION_PARAMS
batch_params = IMAGE_VARIATION_BATCH_PARAMS
@@ -51,7 +55,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -60,7 +65,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
image_encoder_config = CLIPVisionConfig(
hidden_size=32,
@@ -70,7 +76,8 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
image_size=32,
- patch_size=4, )
+ patch_size=4,
+ )
image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
components = {
@@ -106,17 +113,19 @@ def test_stable_diffusion_img_variation_default_case(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.22073305,
- 0.22751817,
- 0.32176197,
- 0.26315716,
- 0.25681925,
- 0.41432184,
- 0.2454437,
- 0.10104704,
- 0.32165903,
- ])
+ expected_slice = np.array(
+ [
+ 0.22073305,
+ 0.22751817,
+ 0.32176197,
+ 0.26315716,
+ 0.25681925,
+ 0.41432184,
+ 0.2454437,
+ 0.10104704,
+ 0.32165903,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
@@ -130,17 +139,19 @@ def test_stable_diffusion_img_variation_multiple_images(self):
image = output.images
image_slice = image[-1, -3:, -3:, -1]
assert image.shape == (2, 64, 64, 3)
- expected_slice = np.array([
- 0.61040395,
- 0.7414253,
- 0.5950623,
- 0.5843509,
- 0.25609648,
- 0.28481025,
- 0.61782926,
- 0.3014974,
- 0.35131538,
- ])
+ expected_slice = np.array(
+ [
+ 0.61040395,
+ 0.7414253,
+ 0.5950623,
+ 0.5843509,
+ 0.25609648,
+ 0.28481025,
+ 0.61782926,
+ 0.3014974,
+ 0.35131538,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
@@ -154,9 +165,7 @@ def tearDown(self):
def get_inputs(self, dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png"
- )
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png")
latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
latents = paddle.to_tensor(latents).cast(dtype)
inputs = {
@@ -171,30 +180,32 @@ def get_inputs(self, dtype="float32", seed=0):
def test_stable_diffusion_img_variation_pipeline_default(self):
sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
- "fusing/sd-image-variations-diffusers", safety_checker=None)
+ "fusing/sd-image-variations-diffusers", safety_checker=None
+ )
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.5717014670372009,
- 0.47024625539779663,
- 0.47462183237075806,
- 0.6388776898384094,
- 0.5250844359397888,
- 0.500831663608551,
- 0.638043999671936,
- 0.5769134163856506,
- 0.5223015546798706,
- ])
+ expected_slice = np.array(
+ [
+ 0.5717014670372009,
+ 0.47024625539779663,
+ 0.47462183237075806,
+ 0.6388776898384094,
+ 0.5250844359397888,
+ 0.500831663608551,
+ 0.638043999671936,
+ 0.5769134163856506,
+ 0.5223015546798706,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_img_variation_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -202,42 +213,45 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.1621,
- 0.2837,
- -0.7979,
- -0.1221,
- -1.3057,
- 0.7681,
- -2.1191,
- 0.0464,
- 1.6309,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ -0.1621,
+ 0.2837,
+ -0.7979,
+ -0.1221,
+ -1.3057,
+ 0.7681,
+ -2.1191,
+ 0.0464,
+ 1.6309,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.6299,
- 1.75,
- 1.1992,
- -2.1582,
- -1.8994,
- 0.7334,
- -0.709,
- 1.0137,
- 1.5273,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.6299,
+ 1.75,
+ 1.1992,
+ -2.1582,
+ -1.8994,
+ 0.7334,
+ -0.709,
+ 1.0137,
+ 1.5273,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
pipe = StableDiffusionImageVariationPipeline.from_pretrained(
"fusing/sd-image-variations-diffusers",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(dtype="float16")
@@ -256,9 +270,7 @@ def tearDown(self):
def get_inputs(self, dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png"
- )
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png")
latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
latents = paddle.to_tensor(latents).cast(dtype)
inputs = {
@@ -272,28 +284,21 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_img_variation_pndm(self):
- sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
- "fusing/sd-image-variations-diffusers")
+ sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_img_variation_dpm(self):
- sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
- "fusing/sd-image-variations-diffusers")
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
inputs["num_inference_steps"] = 25
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 86f394a233323..101468b9a4534 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -21,27 +21,30 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- DPMSolverMultistepScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionImg2ImgPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionImg2ImgPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.image_processor import VaeImageProcessor
-from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly,
- slow)
+from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
def get_dummy_components(self):
@@ -54,7 +57,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -63,7 +67,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -74,10 +79,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -101,8 +106,7 @@ def get_dummy_inputs(self, seed=0, input_image_type="pd", output_type="np"):
input_image = image.numpy().transpose(0, 2, 3, 1)
input_image = VaeImageProcessor.numpy_to_pil(input_image)
else:
- raise ValueError(
- f"unsupported input_image_type {input_image_type}.")
+ raise ValueError(f"unsupported input_image_type {input_image_type}.")
if output_type not in ["pd", "np", "pil"]:
raise ValueError(f"unsupported output_type {output_type}")
@@ -125,17 +129,19 @@ def test_stable_diffusion_img2img_default_case(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.50082374,
- 0.49329656,
- 0.4963757,
- 0.46307105,
- 0.44599247,
- 0.4877512,
- 0.560709,
- 0.56884044,
- 0.5738671,
- ])
+ expected_slice = np.array(
+ [
+ 0.50082374,
+ 0.49329656,
+ 0.4963757,
+ 0.46307105,
+ 0.44599247,
+ 0.4877512,
+ 0.560709,
+ 0.56884044,
+ 0.5738671,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_img2img_negative_prompt(self):
@@ -149,17 +155,19 @@ def test_stable_diffusion_img2img_negative_prompt(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.48659712,
- 0.4004616,
- 0.4762491,
- 0.49117112,
- 0.5414775,
- 0.58218545,
- 0.5550886,
- 0.52305603,
- 0.61624044,
- ])
+ expected_slice = np.array(
+ [
+ 0.48659712,
+ 0.4004616,
+ 0.4762491,
+ 0.49117112,
+ 0.5414775,
+ 0.58218545,
+ 0.5550886,
+ 0.52305603,
+ 0.61624044,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_img2img_multiple_init_images(self):
@@ -173,40 +181,45 @@ def test_stable_diffusion_img2img_multiple_init_images(self):
image = sd_pipe(**inputs).images
image_slice = image[-1, -3:, -3:, -1]
assert image.shape == (2, 32, 32, 3)
- expected_slice = np.array([
- 0.49016288,
- 0.23989454,
- 0.4229045,
- 0.56873804,
- 0.467226,
- 0.5793949,
- 0.6967555,
- 0.7027658,
- 0.5809763,
- ])
+ expected_slice = np.array(
+ [
+ 0.49016288,
+ 0.23989454,
+ 0.4229045,
+ 0.56873804,
+ 0.467226,
+ 0.5793949,
+ 0.6967555,
+ 0.7027658,
+ 0.5809763,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_img2img_k_lms(self):
components = self.get_dummy_components()
components["scheduler"] = LMSDiscreteScheduler(
- beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+ )
sd_pipe = StableDiffusionImg2ImgPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.29999942,
- 0.5206376,
- 0.37915814,
- 0.4033721,
- 0.7630579,
- 0.4642547,
- 0.5823178,
- 0.6936951,
- 0.48969278,
- ])
+ expected_slice = np.array(
+ [
+ 0.29999942,
+ 0.5206376,
+ 0.37915814,
+ 0.4033721,
+ 0.7630579,
+ 0.4642547,
+ 0.5823178,
+ 0.6936951,
+ 0.48969278,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_pt_np_pil_outputs_equivalent(self):
@@ -218,10 +231,8 @@ def test_pt_np_pil_outputs_equivalent(self):
output_np = sd_pipe(**self.get_dummy_inputs(output_type="np"))[0]
output_pil = sd_pipe(**self.get_dummy_inputs(output_type="pil"))[0]
- assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max(
- ) <= 1e-4
- assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max(
- ) <= 1e-4
+ assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4
+ assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4
def test_image_types_consistent(self):
components = self.get_dummy_components()
@@ -245,9 +256,7 @@ def tearDown(self):
def get_inputs(self, dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png"
- )
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png")
inputs = {
"prompt": "a fantasy landscape, concept art, high resolution",
"image": init_image,
@@ -286,25 +295,26 @@ def get_inputs(self, dtype="float32", seed=0):
# assert mean_diff < 5e-2
def test_stable_diffusion_img2img_default(self):
- pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 768, 3)
- expected_slice = np.array([
- 0.27150,
- 0.14849,
- 0.15605,
- 0.26740,
- 0.16954,
- 0.18204,
- 0.31470,
- 0.26311,
- 0.24525,
- ])
+ expected_slice = np.array(
+ [
+ 0.27150,
+ 0.14849,
+ 0.15605,
+ 0.26740,
+ 0.16954,
+ 0.18204,
+ 0.31470,
+ 0.26311,
+ 0.24525,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.001
# def test_img2img_safety_checker_works(self):
@@ -322,8 +332,7 @@ def test_stable_diffusion_img2img_default(self):
# assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros
def test_stable_diffusion_img2img_k_lms(self):
- pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -331,22 +340,23 @@ def test_stable_diffusion_img2img_k_lms(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 768, 3)
- expected_slice = np.array([
- 0.04890,
- 0.04862,
- 0.06422,
- 0.04655,
- 0.05108,
- 0.05307,
- 0.05926,
- 0.08759,
- 0.06852,
- ])
+ expected_slice = np.array(
+ [
+ 0.04890,
+ 0.04862,
+ 0.06422,
+ 0.04655,
+ 0.05108,
+ 0.05307,
+ 0.05926,
+ 0.08759,
+ 0.06852,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.001
def test_stable_diffusion_img2img_ddim(self):
- pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -354,24 +364,25 @@ def test_stable_diffusion_img2img_ddim(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 768, 3)
- expected_slice = np.array([
- 0.06069,
- 0.05703,
- 0.08054,
- 0.05797,
- 0.06286,
- 0.06234,
- 0.08438,
- 0.11151,
- 0.08068,
- ])
+ expected_slice = np.array(
+ [
+ 0.06069,
+ 0.05703,
+ 0.08054,
+ 0.05797,
+ 0.06286,
+ 0.06234,
+ 0.08438,
+ 0.11151,
+ 0.08068,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.001
def test_stable_diffusion_img2img_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -379,42 +390,45 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 96)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.7650054097175598,
- 0.10256098955869675,
- 0.4976114332675934,
- 3.388350009918213,
- 3.7242040634155273,
- 4.272988796234131,
- 2.4656283855438232,
- 3.483647108078003,
- 1.765011191368103,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.7650054097175598,
+ 0.10256098955869675,
+ 0.4976114332675934,
+ 3.388350009918213,
+ 3.7242040634155273,
+ 4.272988796234131,
+ 2.4656283855438232,
+ 3.483647108078003,
+ 1.765011191368103,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 96)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.7580092549324036,
- 0.10288780182600021,
- 0.4941849708557129,
- 3.3663346767425537,
- 3.7071609497070312,
- 4.25173807144165,
- 2.4461638927459717,
- 3.451681137084961,
- 1.761878490447998,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.7580092549324036,
+ 0.10288780182600021,
+ 0.4941849708557129,
+ 3.3663346767425537,
+ 3.7071609497070312,
+ 4.25173807144165,
+ 2.4461638927459717,
+ 3.451681137084961,
+ 1.761878490447998,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(dtype="float16")
@@ -423,13 +437,10 @@ def callback_fn(step: int, timestep: int,
assert number_of_steps == 2
def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg"
- )
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg")
init_image = init_image.resize((760, 504))
model_id = "CompVis/stable-diffusion-v1-4"
- pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- model_id, safety_checker=None)
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
prompt = "A fantasy landscape, trending on artstation"
@@ -440,21 +451,24 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
strength=0.75,
guidance_scale=7.5,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
image_slice = image[255:258, 383:386, -1]
assert image.shape == (504, 760, 3)
- expected_slice = np.array([
- 0.71240354,
- 0.71053374,
- 0.69922864,
- 0.7139934,
- 0.7106118,
- 0.69451976,
- 0.71982634,
- 0.71717453,
- 0.70306426,
- ])
+ expected_slice = np.array(
+ [
+ 0.71240354,
+ 0.71053374,
+ 0.69922864,
+ 0.7139934,
+ 0.7106118,
+ 0.69451976,
+ 0.71982634,
+ 0.71717453,
+ 0.70306426,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005
@@ -468,9 +482,7 @@ def tearDown(self):
def get_inputs(self, dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png"
- )
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png")
inputs = {
"prompt": "a fantasy landscape, concept art, high resolution",
"image": init_image,
@@ -483,59 +495,45 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_img2img_pndm(self):
- sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_img2img_ddim(self):
- sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_img2img_lms(self):
- sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_img2img_dpm(self):
- sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
inputs["num_inference_steps"] = 30
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 70688fa0182a1..0a815f465532b 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -22,22 +22,28 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from PIL import Image
-from ppdiffusers import (AutoencoderKL, DPMSolverMultistepScheduler,
- LMSDiscreteScheduler, PNDMScheduler,
- StableDiffusionInpaintPipeline, UNet2DConditionModel)
-from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import \
- prepare_mask_and_masked_image
-from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly,
- slow)
+from ppdiffusers import (
+ AutoencoderKL,
+ DPMSolverMultistepScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionInpaintPipeline,
+ UNet2DConditionModel,
+)
+from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
+ prepare_mask_and_masked_image,
+)
+from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_INPAINTING_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
@@ -52,7 +58,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -61,7 +68,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -72,10 +80,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -90,11 +98,8 @@ def get_dummy_components(self):
def get_dummy_inputs(self, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
image = image.cpu().transpose(perm=[0, 2, 3, 1])[0]
- init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (64, 64))
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (64, 64)))
+ init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
generator = paddle.Generator().manual_seed(seed)
inputs = {
@@ -116,17 +121,19 @@ def test_stable_diffusion_inpaint(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.55786943,
- 0.628228,
- 0.49147403,
- 0.3191774,
- 0.39249492,
- 0.46521175,
- 0.29909956,
- 0.21160087,
- 0.42932406,
- ])
+ expected_slice = np.array(
+ [
+ 0.55786943,
+ 0.628228,
+ 0.49147403,
+ 0.3191774,
+ 0.39249492,
+ 0.46521175,
+ 0.29909956,
+ 0.21160087,
+ 0.42932406,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_inpaint_image_tensor(self):
@@ -138,11 +145,11 @@ def test_stable_diffusion_inpaint_image_tensor(self):
out_pil = output.images
inputs = self.get_dummy_inputs()
inputs["image"] = (
- paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1)
- .transpose(perm=[2, 0, 1]).unsqueeze(axis=0))
+ paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1).transpose(perm=[2, 0, 1]).unsqueeze(axis=0)
+ )
inputs["mask_image"] = (
- paddle.to_tensor(np.array(inputs["mask_image"]) / 255)
- .transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0))
+ paddle.to_tensor(np.array(inputs["mask_image"]) / 255).transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0)
+ )
output = sd_pipe(**inputs)
out_tensor = output.images
assert out_pil.shape == (1, 64, 64, 3)
@@ -166,13 +173,10 @@ def tearDown(self):
def get_inputs(self, dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png")
- mask_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png")
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png")
+ mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png")
inputs = {
- "prompt":
- "Face of a yellow cat, high resolution, sitting on a park bench",
+ "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
"image": init_image,
"mask_image": mask_image,
"generator": generator,
@@ -184,53 +188,60 @@ def get_inputs(self, dtype="float32", seed=0):
def test_stable_diffusion_inpaint_ddim(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting", safety_checker=None)
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.05978,
- 0.10983,
- 0.10514,
- 0.07922,
- 0.08483,
- 0.08587,
- 0.05302,
- 0.03218,
- 0.01636,
- ])
+ expected_slice = np.array(
+ [
+ 0.05978,
+ 0.10983,
+ 0.10514,
+ 0.07922,
+ 0.08483,
+ 0.08587,
+ 0.05302,
+ 0.03218,
+ 0.01636,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.0001
def test_stable_diffusion_inpaint_fp16(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"runwayml/stable-diffusion-inpainting",
paddle_dtype=paddle.float16,
- safety_checker=None, )
+ safety_checker=None,
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(dtype="float16")
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.9921875,
- 0.9477539,
- 0.90234375,
- 0.96484375,
- 0.9189453,
- 0.875,
- 0.9316406,
- 0.9013672,
- 0.875,
- ])
+ expected_slice = np.array(
+ [
+ 0.9921875,
+ 0.9477539,
+ 0.90234375,
+ 0.96484375,
+ 0.9189453,
+ 0.875,
+ 0.9316406,
+ 0.9013672,
+ 0.875,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.05
def test_stable_diffusion_inpaint_pndm(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting", safety_checker=None)
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -238,22 +249,25 @@ def test_stable_diffusion_inpaint_pndm(self):
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.06892,
- 0.06994,
- 0.07905,
- 0.05366,
- 0.04709,
- 0.04890,
- 0.04107,
- 0.05083,
- 0.04180,
- ])
+ expected_slice = np.array(
+ [
+ 0.06892,
+ 0.06994,
+ 0.07905,
+ 0.05366,
+ 0.04709,
+ 0.04890,
+ 0.04107,
+ 0.05083,
+ 0.04180,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.0001
def test_stable_diffusion_inpaint_k_lms(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting", safety_checker=None)
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -261,17 +275,19 @@ def test_stable_diffusion_inpaint_k_lms(self):
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.23513,
- 0.22413,
- 0.29442,
- 0.24243,
- 0.26214,
- 0.30329,
- 0.26431,
- 0.25025,
- 0.25197,
- ])
+ expected_slice = np.array(
+ [
+ 0.23513,
+ 0.22413,
+ 0.29442,
+ 0.24243,
+ 0.26214,
+ 0.30329,
+ 0.26431,
+ 0.25025,
+ 0.25197,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.0001
@@ -285,13 +301,10 @@ def tearDown(self):
def get_inputs(self, dtype="float32", seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png")
- mask_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png")
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png")
+ mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png")
inputs = {
- "prompt":
- "Face of a yellow cat, high resolution, sitting on a park bench",
+ "prompt": "Face of a yellow cat, high resolution, sitting on a park bench",
"image": init_image,
"mask_image": mask_image,
"generator": generator,
@@ -302,52 +315,40 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_inpaint_ddim(self):
- sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting")
+ sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_inpaint_pndm(self):
- sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting")
+ sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_inpaint_lms(self):
- sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting")
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = load_numpy(
- "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy"
- )
+ expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy")
max_diff = np.abs(expected_image - image).max()
assert max_diff < 0.001
def test_inpaint_dpm(self):
- sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting")
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
@@ -360,8 +361,7 @@ def test_inpaint_dpm(self):
assert max_diff < 0.001
-class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(
- unittest.TestCase):
+class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
def test_pil_inputs(self):
im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
im = Image.fromarray(im)
@@ -389,8 +389,7 @@ def test_np_inputs(self):
mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
- t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil,
- mask_pil)
+ t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil)
self.assertTrue((t_mask_np == t_mask_pil).all())
self.assertTrue((t_masked_np == t_masked_pil).all())
@@ -401,7 +400,8 @@ def test_paddle_3D_2D_inputs(self):
mask_np = mask_tensor.numpy()
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -413,7 +413,8 @@ def test_paddle_3D_3D_inputs(self):
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
@@ -424,7 +425,8 @@ def test_paddle_4D_2D_inputs(self):
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
@@ -435,19 +437,20 @@ def test_paddle_4D_3D_inputs(self):
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_paddle_4D_4D_inputs(self):
im_tensor = paddle.randint(0, 255, (1, 3, 32, 32)).cast("uint8")
- mask_tensor = paddle.randint(0, 255,
- (1, 1, 32, 32)).cast("uint8") > 127.5
+ mask_tensor = paddle.randint(0, 255, (1, 1, 32, 32)).cast("uint8") > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0][0]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
self.assertTrue((t_mask_tensor == t_mask_np.cast("float64")).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
@@ -458,11 +461,9 @@ def test_paddle_batch_4D_3D(self):
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy() for mask in mask_tensor]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
- nps = [
- prepare_mask_and_masked_image(i, m)
- for i, m in zip(im_nps, mask_nps)
- ]
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
+ nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
t_mask_np = paddle.concat(x=[n[0] for n in nps])
t_masked_np = paddle.concat(x=[n[1] for n in nps])
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -475,11 +476,9 @@ def test_paddle_batch_4D_4D(self):
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy() for mask in mask_tensor]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor.cast("int64"))
- nps = [
- prepare_mask_and_masked_image(i, m)
- for i, m in zip(im_nps, mask_nps)
- ]
+ im_tensor / 127.5 - 1, mask_tensor.cast("int64")
+ )
+ nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
t_mask_np = paddle.concat(x=[n[0] for n in nps])
t_masked_np = paddle.concat(x=[n[1] for n in nps])
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -487,44 +486,28 @@ def test_paddle_batch_4D_4D(self):
def test_shape_mismatch(self):
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(
- paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64]))
+ prepare_mask_and_masked_image(paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64]))
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(
- paddle.randn(shape=[2, 3, 32, 32]),
- paddle.randn(shape=[4, 64, 64]))
+ prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 64, 64]))
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(
- paddle.randn(shape=[2, 3, 32, 32]),
- paddle.randn(shape=[4, 1, 64, 64]))
+ prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 1, 64, 64]))
def test_type_mismatch(self):
with self.assertRaises(TypeError):
- prepare_mask_and_masked_image(
- paddle.rand(shape=[3, 32, 32]),
- paddle.rand(shape=[3, 32, 32]).numpy())
+ prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.rand(shape=[3, 32, 32]).numpy())
with self.assertRaises(TypeError):
- prepare_mask_and_masked_image(
- paddle.rand(shape=[3, 32, 32]).numpy(),
- paddle.rand(shape=[3, 32, 32]))
+ prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]).numpy(), paddle.rand(shape=[3, 32, 32]))
def test_channels_first(self):
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(
- paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32]))
+ prepare_mask_and_masked_image(paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32]))
def test_tensor_range(self):
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(
- paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32]))
+ prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32]))
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(
- paddle.ones(shape=[3, 32, 32]) * -2,
- paddle.rand(shape=[32, 32]))
+ prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * -2, paddle.rand(shape=[32, 32]))
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(
- paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2)
+ prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2)
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(
- paddle.rand(shape=[3, 32, 32]),
- paddle.ones(shape=[32, 32]) * -1)
+ prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * -1)
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 6866f1a367654..aef0082255467 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -22,13 +22,23 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from PIL import Image
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- DPMSolverMultistepScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionInpaintPipelineLegacy,
- UNet2DConditionModel, UNet2DModel, VQModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionInpaintPipelineLegacy,
+ UNet2DConditionModel,
+ UNet2DModel,
+ VQModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, nightly, slow
-from ppdiffusers.utils.testing_utils import (load_numpy, preprocess_image,
- require_paddle_gpu)
+from ppdiffusers.utils.testing_utils import (
+ load_numpy,
+ preprocess_image,
+ require_paddle_gpu,
+)
class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase):
@@ -42,8 +52,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = 32, 32
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
@property
@@ -56,7 +65,8 @@ def dummy_uncond_unet(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
@property
@@ -70,7 +80,8 @@ def dummy_cond_unet(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
return model
@property
@@ -84,7 +95,8 @@ def dummy_cond_unet_inpaint(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
return model
@property
@@ -96,7 +108,8 @@ def dummy_vq_model(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=3, )
+ latent_channels=3,
+ )
return model
@property
@@ -108,7 +121,8 @@ def dummy_vae(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
return model
@property
@@ -123,7 +137,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModel(config).eval()
@property
@@ -146,13 +161,10 @@ def test_stable_diffusion_inpaint_legacy(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0]
init_image = Image.fromarray(np.uint8(image)).convert("RGB")
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (32, 32)))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
sd_pipe = StableDiffusionInpaintPipelineLegacy(
unet=unet,
scheduler=scheduler,
@@ -160,7 +172,8 @@ def test_stable_diffusion_inpaint_legacy(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -171,7 +184,8 @@ def test_stable_diffusion_inpaint_legacy(self):
num_inference_steps=2,
output_type="np",
image=init_image,
- mask_image=mask_image, )
+ mask_image=mask_image,
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -182,32 +196,33 @@ def test_stable_diffusion_inpaint_legacy(self):
output_type="np",
image=init_image,
mask_image=mask_image,
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.01514593,
- 0.46352747,
- 0.34991893,
- 0.29177475,
- 0.5415823,
- 0.56992227,
- 0.39533705,
- 0.67953515,
- 0.5445507,
- ])
+ expected_slice = np.array(
+ [
+ 0.01514593,
+ 0.46352747,
+ 0.34991893,
+ 0.29177475,
+ 0.5415823,
+ 0.56992227,
+ 0.39533705,
+ 0.67953515,
+ 0.5445507,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_inpaint_legacy_batched(self):
unet = self.dummy_cond_unet
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.permute(0, 2, 3, 1)[0]
init_image = Image.fromarray(np.uint8(image)).convert("RGB")
@@ -222,7 +237,8 @@ def test_stable_diffusion_inpaint_legacy_batched(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
@@ -234,35 +250,40 @@ def test_stable_diffusion_inpaint_legacy_batched(self):
num_inference_steps=2,
output_type="np",
image=init_images_tens,
- mask_image=init_masks_tens, ).images
+ mask_image=init_masks_tens,
+ ).images
assert images.shape == (2, 32, 32, 3)
image_slice_0 = images[0, -3:, -3:, -1].flatten()
image_slice_1 = images[1, -3:, -3:, -1].flatten()
- expected_slice_0 = np.array([
- 0.50299895,
- 0.6465979,
- 0.3489662,
- 0.28862774,
- 0.59657216,
- 0.41669005,
- 0.19621253,
- 0.27549136,
- 0.39040852,
- ])
- expected_slice_1 = np.array([
- 0.70079666,
- 0.5616544,
- 0.5304112,
- 0.38820785,
- 0.3118701,
- 0.47477302,
- 0.37215403,
- 0.3785481,
- 0.50153226,
- ])
+ expected_slice_0 = np.array(
+ [
+ 0.50299895,
+ 0.6465979,
+ 0.3489662,
+ 0.28862774,
+ 0.59657216,
+ 0.41669005,
+ 0.19621253,
+ 0.27549136,
+ 0.39040852,
+ ]
+ )
+ expected_slice_1 = np.array(
+ [
+ 0.70079666,
+ 0.5616544,
+ 0.5304112,
+ 0.38820785,
+ 0.3118701,
+ 0.47477302,
+ 0.37215403,
+ 0.3785481,
+ 0.50153226,
+ ]
+ )
assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2
assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-2
@@ -272,13 +293,10 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0]
init_image = Image.fromarray(np.uint8(image)).convert("RGB")
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (32, 32)))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
sd_pipe = StableDiffusionInpaintPipelineLegacy(
unet=unet,
scheduler=scheduler,
@@ -286,7 +304,8 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
negative_prompt = "french fries"
@@ -299,21 +318,24 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
num_inference_steps=2,
output_type="np",
image=init_image,
- mask_image=mask_image, )
+ mask_image=mask_image,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.0,
- 0.43941003,
- 0.32130337,
- 0.31442684,
- 0.566114,
- 0.56392324,
- 0.3946159,
- 0.6844422,
- 0.5345681,
- ])
+ expected_slice = np.array(
+ [
+ 0.0,
+ 0.43941003,
+ 0.32130337,
+ 0.31442684,
+ 0.566114,
+ 0.56392324,
+ 0.3946159,
+ 0.6844422,
+ 0.5345681,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
@@ -321,13 +343,10 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0]
init_image = Image.fromarray(np.uint8(image)).convert("RGB")
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (32, 32)))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
sd_pipe = StableDiffusionInpaintPipelineLegacy(
unet=unet,
scheduler=scheduler,
@@ -335,7 +354,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
images = sd_pipe(
@@ -343,7 +363,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
num_inference_steps=2,
output_type="np",
image=init_image,
- mask_image=mask_image, ).images
+ mask_image=mask_image,
+ ).images
assert images.shape == (1, 32, 32, 3)
batch_size = 2
images = sd_pipe(
@@ -351,7 +372,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
num_inference_steps=2,
output_type="np",
image=init_image,
- mask_image=mask_image, ).images
+ mask_image=mask_image,
+ ).images
assert images.shape == (batch_size, 32, 32, 3)
num_images_per_prompt = 2
images = sd_pipe(
@@ -360,7 +382,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
output_type="np",
image=init_image,
mask_image=mask_image,
- num_images_per_prompt=num_images_per_prompt, ).images
+ num_images_per_prompt=num_images_per_prompt,
+ ).images
assert images.shape == (num_images_per_prompt, 32, 32, 3)
batch_size = 2
images = sd_pipe(
@@ -369,7 +392,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
output_type="np",
image=init_image,
mask_image=mask_image,
- num_images_per_prompt=num_images_per_prompt, ).images
+ num_images_per_prompt=num_images_per_prompt,
+ ).images
assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
@@ -383,10 +407,8 @@ def tearDown(self):
def get_inputs(self, seed=0):
generator = paddle.Generator().manual_seed(seed)
- init_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png")
- mask_image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png")
+ init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png")
+ mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png")
inputs = {
"prompt": "A red cat sitting on a park bench",
"image": init_image,
@@ -401,29 +423,33 @@ def get_inputs(self, seed=0):
def test_stable_diffusion_inpaint_legacy_pndm(self):
pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ "CompVis/stable-diffusion-v1-4", safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.27226633,
- 0.29068208,
- 0.3450312,
- 0.21444553,
- 0.26328486,
- 0.34392387,
- 0.18026042,
- 0.24961185,
- 0.3214044,
- ])
+ expected_slice = np.array(
+ [
+ 0.27226633,
+ 0.29068208,
+ 0.3450312,
+ 0.21444553,
+ 0.26328486,
+ 0.34392387,
+ 0.18026042,
+ 0.24961185,
+ 0.3214044,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.0001
def test_stable_diffusion_inpaint_legacy_batched(self):
pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ "CompVis/stable-diffusion-v1-4", safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -443,35 +469,40 @@ def test_stable_diffusion_inpaint_legacy_batched(self):
image_slice_0 = image[0, 253:256, 253:256, -1].flatten()
image_slice_1 = image[1, 253:256, 253:256, -1].flatten()
- expected_slice_0 = np.array([
- 0.27526367,
- 0.29158682,
- 0.35184938,
- 0.21504477,
- 0.26708275,
- 0.35169,
- 0.18185198,
- 0.2572803,
- 0.32425082,
- ])
- expected_slice_1 = np.array([
- 0.0,
- 0.18929192,
- 0.7068148,
- 0.07977328,
- 0.13444492,
- 0.5016247,
- 0.49761847,
- 0.2830933,
- 0.36412603,
- ])
+ expected_slice_0 = np.array(
+ [
+ 0.27526367,
+ 0.29158682,
+ 0.35184938,
+ 0.21504477,
+ 0.26708275,
+ 0.35169,
+ 0.18185198,
+ 0.2572803,
+ 0.32425082,
+ ]
+ )
+ expected_slice_1 = np.array(
+ [
+ 0.0,
+ 0.18929192,
+ 0.7068148,
+ 0.07977328,
+ 0.13444492,
+ 0.5016247,
+ 0.49761847,
+ 0.2830933,
+ 0.36412603,
+ ]
+ )
assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4
assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4
def test_stable_diffusion_inpaint_legacy_k_lms(self):
pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "CompVis/stable-diffusion-v1-4", safety_checker=None)
+ "CompVis/stable-diffusion-v1-4", safety_checker=None
+ )
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -479,24 +510,25 @@ def test_stable_diffusion_inpaint_legacy_k_lms(self):
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.29036117,
- 0.28907132,
- 0.32839334,
- 0.26510137,
- 0.2820784,
- 0.31148806,
- 0.29358387,
- 0.29515788,
- 0.28257304,
- ])
+ expected_slice = np.array(
+ [
+ 0.29036117,
+ 0.28907132,
+ 0.32839334,
+ 0.26510137,
+ 0.2820784,
+ 0.31148806,
+ 0.29358387,
+ 0.29515788,
+ 0.28257304,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.0001
def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -504,42 +536,45 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.103,
- 1.415,
- -0.02197,
- -0.5103,
- -0.5903,
- 0.1953,
- 0.75,
- 0.3477,
- -1.356,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.001
+ expected_slice = np.array(
+ [
+ -0.103,
+ 1.415,
+ -0.02197,
+ -0.5103,
+ -0.5903,
+ 0.1953,
+ 0.75,
+ 0.3477,
+ -1.356,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.4802,
- 1.154,
- 0.628,
- 0.2322,
- 0.2593,
- -0.1455,
- 0.7075,
- -0.1617,
- -0.5615,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.001
+ expected_slice = np.array(
+ [
+ 0.4802,
+ 1.154,
+ 0.628,
+ 0.2322,
+ 0.2593,
+ -0.1455,
+ 0.7075,
+ -0.1617,
+ -0.5615,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001
callback_fn.has_been_called = False
pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
@@ -577,20 +612,17 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_inpaint_pndm(self):
- sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = np.array([[0.7330009, 0.80003107, 0.8268216],
- [0.73606366, 0.801595, 0.8470554]])
+ expected_image = np.array([[0.7330009, 0.80003107, 0.8268216], [0.73606366, 0.801595, 0.8470554]])
max_diff = np.abs(expected_image - image[0][0:2]).max()
assert max_diff < 0.001
def test_inpaint_ddim(self):
- sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
@@ -599,36 +631,29 @@ def test_inpaint_ddim(self):
expected_image = load_numpy(
"https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy"
)
- expected_image = np.array([[0.7290994, 0.794852, 0.82096446],
- [0.7330909, 0.79727536, 0.8420528]])
+ expected_image = np.array([[0.7290994, 0.794852, 0.82096446], [0.7330909, 0.79727536, 0.8420528]])
max_diff = np.abs(expected_image - image[0][0:2]).max()
assert max_diff < 0.001
def test_inpaint_lms(self):
- sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5")
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
- expected_image = np.array([[0.74595624, 0.81757987, 0.84589916],
- [0.74728143, 0.81736475, 0.86543]])
+ expected_image = np.array([[0.74595624, 0.81757987, 0.84589916], [0.74728143, 0.81736475, 0.86543]])
max_diff = np.abs(expected_image - image[0][0:2]).max()
assert max_diff < 0.001
def test_inpaint_dpm(self):
- sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5")
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
inputs["num_inference_steps"] = 30
image = sd_pipe(**inputs).images[0]
- expected_image = np.array([[0.7310472, 0.7970823, 0.8231524],
- [0.7348697, 0.799358, 0.8439586]])
+ expected_image = np.array([[0.7310472, 0.7970823, 0.8231524], [0.7348697, 0.799358, 0.8439586]])
max_diff = np.abs(expected_image - image[0][0:2]).max()
assert max_diff < 0.001
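All of the reformatted tests above follow the same slice-regression pattern: take a small corner of the generated image, flatten it, and require the maximum absolute deviation from a hard-coded reference to stay under a tolerance. A minimal, self-contained sketch of that pattern, with illustrative data rather than values from the repo:

    import numpy as np

    # Stand-in for pipeline output: a batch of HxWxC float images (illustrative only).
    images = np.random.RandomState(0).rand(1, 64, 64, 3).astype("float32")

    # The tests slice the 3x3 bottom-right corner of the last channel...
    image_slice = images[0, -3:, -3:, -1]

    # ...and compare it against reference values captured once from a known-good run.
    expected_slice = image_slice.copy()

    # Regression check: max absolute deviation must stay under a per-test tolerance.
    assert np.abs(image_slice.flatten() - expected_slice.flatten()).max() < 0.01
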
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index c367a6f472e51..0a6d49df4418f 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -22,20 +22,26 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from PIL import Image
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- EulerAncestralDiscreteScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionInstructPix2PixPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionInstructPix2PixPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionInstructPix2PixPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
"height",
@@ -54,7 +60,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -63,7 +70,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -74,10 +82,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -114,17 +122,19 @@ def test_stable_diffusion_pix2pix_default_case(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.24897021,
- 0.3813318,
- 0.15630311,
- 0.69198483,
- 0.7409521,
- 0.55128014,
- 0.5978868,
- 0.60921687,
- 0.47007012,
- ])
+ expected_slice = np.array(
+ [
+ 0.24897021,
+ 0.3813318,
+ 0.15630311,
+ 0.69198483,
+ 0.7409521,
+ 0.55128014,
+ 0.5978868,
+ 0.60921687,
+ 0.47007012,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_pix2pix_negative_prompt(self):
@@ -137,17 +147,19 @@ def test_stable_diffusion_pix2pix_negative_prompt(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.27121854,
- 0.34936333,
- 0.12865198,
- 0.77894104,
- 0.81688535,
- 0.6136005,
- 0.62261313,
- 0.6386795,
- 0.5096967,
- ])
+ expected_slice = np.array(
+ [
+ 0.27121854,
+ 0.34936333,
+ 0.12865198,
+ 0.77894104,
+ 0.81688535,
+ 0.6136005,
+ 0.62261313,
+ 0.6386795,
+ 0.5096967,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_pix2pix_multiple_init_images(self):
@@ -164,23 +176,26 @@ def test_stable_diffusion_pix2pix_multiple_init_images(self):
image_slice = image[-1, -3:, -3:, -1]
assert image.shape == (2, 32, 32, 3)
- expected_slice = np.array([
- 0.41508308,
- 0.41580454,
- 0.5588631,
- 0.32340443,
- 0.20930073,
- 0.35993075,
- 0.28470254,
- 0.38203996,
- 0.51769114,
- ])
+ expected_slice = np.array(
+ [
+ 0.41508308,
+ 0.41580454,
+ 0.5588631,
+ 0.32340443,
+ 0.20930073,
+ 0.35993075,
+ 0.28470254,
+ 0.38203996,
+ 0.51769114,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_pix2pix_euler(self):
components = self.get_dummy_components()
components["scheduler"] = EulerAncestralDiscreteScheduler(
- beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+ )
sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
@@ -189,17 +204,19 @@ def test_stable_diffusion_pix2pix_euler(self):
slice = [round(x, 4) for x in image_slice.flatten().tolist()]
print(",".join([str(x) for x in slice]))
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.26694882,
- 0.4288544,
- 0.21950376,
- 0.74369204,
- 0.6756442,
- 0.54577595,
- 0.5941435,
- 0.5603916,
- 0.51743454,
- ])
+ expected_slice = np.array(
+ [
+ 0.26694882,
+ 0.4288544,
+ 0.21950376,
+ 0.74369204,
+ 0.6756442,
+ 0.54577595,
+ 0.5941435,
+ 0.5603916,
+ 0.51743454,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
@@ -213,8 +230,7 @@ def tearDown(self):
def get_inputs(self, seed=0):
generator = paddle.Generator().manual_seed(seed=seed)
- image = load_image(
- "https://paddlenlp.bj.bcebos.com/data/images/example.jpg")
+ image = load_image("https://paddlenlp.bj.bcebos.com/data/images/example.jpg")
inputs = {
"prompt": "turn him into a cyborg",
"image": image,
@@ -228,29 +244,33 @@ def get_inputs(self, seed=0):
def test_stable_diffusion_pix2pix_default(self):
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
- "timbrooks/instruct-pix2pix", safety_checker=None)
+ "timbrooks/instruct-pix2pix", safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.32138163,
- 0.32519442,
- 0.33127248,
- 0.32613453,
- 0.33317798,
- 0.33505,
- 0.32397628,
- 0.32964426,
- 0.32055843,
- ])
+ expected_slice = np.array(
+ [
+ 0.32138163,
+ 0.32519442,
+ 0.33127248,
+ 0.32613453,
+ 0.33317798,
+ 0.33505,
+ 0.32397628,
+ 0.32964426,
+ 0.32055843,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.001
def test_stable_diffusion_pix2pix_k_lms(self):
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
- "timbrooks/instruct-pix2pix", safety_checker=None)
+ "timbrooks/instruct-pix2pix", safety_checker=None
+ )
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -258,22 +278,25 @@ def test_stable_diffusion_pix2pix_k_lms(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.38934484,
- 0.3929934,
- 0.39973113,
- 0.4196028,
- 0.42386433,
- 0.43073824,
- 0.4267708,
- 0.43173674,
- 0.41896266,
- ])
+ expected_slice = np.array(
+ [
+ 0.38934484,
+ 0.3929934,
+ 0.39973113,
+ 0.4196028,
+ 0.42386433,
+ 0.43073824,
+ 0.4267708,
+ 0.43173674,
+ 0.41896266,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.001
def test_stable_diffusion_pix2pix_ddim(self):
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
- "timbrooks/instruct-pix2pix", safety_checker=None)
+ "timbrooks/instruct-pix2pix", safety_checker=None
+ )
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -281,24 +304,25 @@ def test_stable_diffusion_pix2pix_ddim(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.51511174,
- 0.5185677,
- 0.51326,
- 0.5176025,
- 0.514665,
- 0.519833,
- 0.52196854,
- 0.5121842,
- 0.52435803,
- ])
+ expected_slice = np.array(
+ [
+ 0.51511174,
+ 0.5185677,
+ 0.51326,
+ 0.5176025,
+ 0.514665,
+ 0.519833,
+ 0.52196854,
+ 0.5121842,
+ 0.52435803,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.001
def test_stable_diffusion_pix2pix_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -306,28 +330,21 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556,
- 1.227
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array([-0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556, 1.227])
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537,
- 1.239
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array([-0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537, 1.239])
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
"timbrooks/instruct-pix2pix",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
@@ -339,23 +356,24 @@ def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self):
inputs = self.get_inputs()
inputs["image"] = inputs["image"].resize((504, 504))
model_id = "timbrooks/instruct-pix2pix"
- pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
- model_id, safety_checker=None)
+ pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
output = pipe(**inputs)
image = output.images[0]
image_slice = image[255:258, 383:386, -1]
assert image.shape == (504, 504, 3)
- expected_slice = np.array([
- 0.183373,
- 0.20458564,
- 0.2428664,
- 0.18245864,
- 0.22010538,
- 0.25757712,
- 0.19680199,
- 0.2185145,
- 0.24869373,
- ])
+ expected_slice = np.array(
+ [
+ 0.183373,
+ 0.20458564,
+ 0.2428664,
+ 0.18245864,
+ 0.22010538,
+ 0.25757712,
+ 0.19680199,
+ 0.2185145,
+ 0.24869373,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005
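The intermediate-state tests in this file and the following ones register a callback with the (step, timestep, latents) signature and assert on a latent slice at specific steps. A small sketch of that structure, using NumPy arrays as stand-ins for paddle tensors (the helper name and values here are assumptions, not repo code):

    import numpy as np

    def make_callback(expected_by_step, tol=0.05):
        # Same (step, timestep, latents) signature the pipelines accept via callback=.
        def callback_fn(step, timestep, latents):
            callback_fn.has_been_called = True
            if step in expected_by_step:
                latents_slice = np.asarray(latents)[0, -3:, -3:, -1]
                assert np.abs(latents_slice.flatten() - expected_by_step[step]).max() < tol
        callback_fn.has_been_called = False
        return callback_fn

    # Illustrative usage with zero-filled "latents" of the shape the tests assert on.
    cb = make_callback({1: np.zeros(9)})
    cb(step=1, timestep=999, latents=np.zeros((1, 4, 64, 64)))
    assert cb.has_been_called
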
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 176a0629de209..9f4ef2ff6f041 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -20,10 +20,15 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- EulerAncestralDiscreteScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionPanoramaPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionPanoramaPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -31,8 +36,7 @@
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionPanoramaPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
@@ -47,7 +51,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler()
paddle.seed(0)
vae = AutoencoderKL(
@@ -56,7 +61,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -67,10 +73,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -103,17 +109,19 @@ def test_stable_diffusion_panorama_default_case(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.28862977,
- 0.2441951,
- 0.2683525,
- 0.33122095,
- 0.28755113,
- 0.46375293,
- 0.254181,
- 0.30616608,
- 0.4785265,
- ])
+ expected_slice = np.array(
+ [
+ 0.28862977,
+ 0.2441951,
+ 0.2683525,
+ 0.33122095,
+ 0.28755113,
+ 0.46375293,
+ 0.254181,
+ 0.30616608,
+ 0.4785265,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
# override to speed the overall test timing up.
@@ -134,40 +142,45 @@ def test_stable_diffusion_panorama_negative_prompt(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.28995812,
- 0.24463832,
- 0.2682391,
- 0.33033937,
- 0.2868188,
- 0.46267676,
- 0.25425047,
- 0.3066897,
- 0.47881347,
- ])
+ expected_slice = np.array(
+ [
+ 0.28995812,
+ 0.24463832,
+ 0.2682391,
+ 0.33033937,
+ 0.2868188,
+ 0.46267676,
+ 0.25425047,
+ 0.3066897,
+ 0.47881347,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_panorama_euler(self):
components = self.get_dummy_components()
components["scheduler"] = EulerAncestralDiscreteScheduler(
- beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+ )
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.32409406,
- 0.2660764,
- 0.41739762,
- 0.18994612,
- 0.32522476,
- 0.4869789,
- 0.13573006,
- 0.14128971,
- 0.32650158,
- ])
+ expected_slice = np.array(
+ [
+ 0.32409406,
+ 0.2660764,
+ 0.41739762,
+ 0.18994612,
+ 0.32522476,
+ 0.4869789,
+ 0.13573006,
+ 0.14128971,
+ 0.32650158,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_panorama_pndm(self):
@@ -201,32 +214,33 @@ def get_inputs(self, seed=0):
def test_stable_diffusion_panorama_default(self):
model_ckpt = "stabilityai/stable-diffusion-2-base"
- scheduler = DDIMScheduler.from_pretrained(
- model_ckpt, subfolder="scheduler")
- pipe = StableDiffusionPanoramaPipeline.from_pretrained(
- model_ckpt, scheduler=scheduler, safety_checker=None)
+ scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+ pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 2048, 3)
- expected_slice = np.array([
- 0.34261876,
- 0.3045774,
- 0.34545267,
- 0.33774284,
- 0.3431282,
- 0.33453488,
- 0.3094663,
- 0.32646674,
- 0.32534528,
- ])
+ expected_slice = np.array(
+ [
+ 0.34261876,
+ 0.3045774,
+ 0.34545267,
+ 0.33774284,
+ 0.3431282,
+ 0.33453488,
+ 0.3094663,
+ 0.32646674,
+ 0.32534528,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.01
def test_stable_diffusion_panorama_k_lms(self):
pipe = StableDiffusionPanoramaPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-base", safety_checker=None)
+ "stabilityai/stable-diffusion-2-base", safety_checker=None
+ )
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -234,24 +248,25 @@ def test_stable_diffusion_panorama_k_lms(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 2048, 3)
- expected_slice = np.array([
- 0.0,
- 0.01188838,
- 0.02675471,
- 0.00534895,
- 0.02325496,
- 0.01234779,
- 0.0348064,
- 0.0,
- 0.02607787,
- ])
+ expected_slice = np.array(
+ [
+ 0.0,
+ 0.01188838,
+ 0.02675471,
+ 0.00534895,
+ 0.02325496,
+ 0.01234779,
+ 0.0348064,
+ 0.0,
+ 0.02607787,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.01
def test_stable_diffusion_panorama_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -259,43 +274,43 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 256)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.7392851114273071,
- -0.16683124005794525,
- 0.2063215672969818,
- -0.09840865433216095,
- 0.18722617626190186,
- -0.08375956118106842,
- 0.06995373964309692,
- -0.20892930030822754,
- -0.157355397939682,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.7392851114273071,
+ -0.16683124005794525,
+ 0.2063215672969818,
+ -0.09840865433216095,
+ 0.18722617626190186,
+ -0.08375956118106842,
+ 0.06995373964309692,
+ -0.20892930030822754,
+ -0.157355397939682,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 256)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.7368452548980713,
- -0.16317462921142578,
- 0.20289096236228943,
- -0.10271137207746506,
- 0.1873130351305008,
- -0.08454630523920059,
- 0.06944799423217773,
- -0.20782311260700226,
- -0.15696658194065094,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.7368452548980713,
+ -0.16317462921142578,
+ 0.20289096236228943,
+ -0.10271137207746506,
+ 0.1873130351305008,
+ -0.08454630523920059,
+ 0.06944799423217773,
+ -0.20782311260700226,
+ -0.15696658194065094,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
model_ckpt = "stabilityai/stable-diffusion-2-base"
- scheduler = DDIMScheduler.from_pretrained(
- model_ckpt, subfolder="scheduler")
- pipe = StableDiffusionPanoramaPipeline.from_pretrained(
- model_ckpt, scheduler=scheduler, safety_checker=None)
+ scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+ pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
index 0bee318686efc..d4787ab8eaa4d 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
@@ -21,14 +21,22 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from ppdiffusers import (
- AutoencoderKL, DDIMInverseScheduler, DDIMScheduler, DDPMScheduler,
- EulerAncestralDiscreteScheduler, LMSDiscreteScheduler,
- StableDiffusionPix2PixZeroPipeline, UNet2DConditionModel)
+ AutoencoderKL,
+ DDIMInverseScheduler,
+ DDIMScheduler,
+ DDPMScheduler,
+ EulerAncestralDiscreteScheduler,
+ LMSDiscreteScheduler,
+ StableDiffusionPix2PixZeroPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import load_image, slow
from ppdiffusers.utils.testing_utils import load_pt, require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
@@ -39,8 +47,7 @@ def to_paddle(x):
# we use SGD optimizer in this pipeline, so the result is not stable!
-class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionPix2PixZeroPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
@@ -51,12 +58,14 @@ def setUpClass(cls):
cls.source_embeds = to_paddle(
load_pt(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt"
- ))
+ )
+ )
cls.target_embeds = to_paddle(
load_pt(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt"
- ))
+ )
+ )
def get_dummy_components(self):
paddle.seed(0)
@@ -68,7 +77,8 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler()
paddle.seed(0)
vae = AutoencoderKL(
@@ -77,7 +87,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -88,10 +99,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -128,17 +139,19 @@ def test_stable_diffusion_pix2pix_zero_default_case(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.58762765,
- 0.17410329,
- 0.5067884,
- 0.39995563,
- 0.02808204,
- 0.35726422,
- 0.3250693,
- 0.3155224,
- 0.5268162,
- ])
+ expected_slice = np.array(
+ [
+ 0.58762765,
+ 0.17410329,
+ 0.5067884,
+ 0.39995563,
+ 0.02808204,
+ 0.35726422,
+ 0.3250693,
+ 0.3155224,
+ 0.5268162,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_stable_diffusion_pix2pix_zero_negative_prompt(self):
@@ -151,40 +164,45 @@ def test_stable_diffusion_pix2pix_zero_negative_prompt(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.5042143,
- 0.34658563,
- 0.56157184,
- 0.3707891,
- 0.23746812,
- 0.47898933,
- 0.2702424,
- 0.36307925,
- 0.50807047,
- ])
+ expected_slice = np.array(
+ [
+ 0.5042143,
+ 0.34658563,
+ 0.56157184,
+ 0.3707891,
+ 0.23746812,
+ 0.47898933,
+ 0.2702424,
+ 0.36307925,
+ 0.50807047,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_stable_diffusion_pix2pix_zero_euler(self):
components = self.get_dummy_components()
components["scheduler"] = EulerAncestralDiscreteScheduler(
- beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+ )
sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.4870367,
- 0.2677226,
- 0.37830275,
- 0.63265973,
- 0.32151344,
- 0.406371,
- 0.67513967,
- 0.5246535,
- 0.55954224,
- ])
+ expected_slice = np.array(
+ [
+ 0.4870367,
+ 0.2677226,
+ 0.37830275,
+ 0.63265973,
+ 0.32151344,
+ 0.406371,
+ 0.67513967,
+ 0.5246535,
+ 0.55954224,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_stable_diffusion_pix2pix_zero_ddpm(self):
@@ -196,17 +214,19 @@ def test_stable_diffusion_pix2pix_zero_ddpm(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.5899046,
- 0.17750263,
- 0.50616807,
- 0.39558932,
- 0.02976257,
- 0.35918522,
- 0.32376733,
- 0.31742626,
- 0.52768075,
- ])
+ expected_slice = np.array(
+ [
+ 0.5899046,
+ 0.17750263,
+ 0.50616807,
+ 0.39558932,
+ 0.02976257,
+ 0.35918522,
+ 0.32376733,
+ 0.31742626,
+ 0.52768075,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self):
@@ -218,14 +238,12 @@ def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self):
assert images.shape == (1, 64, 64, 3)
num_images_per_prompt = 2
inputs = self.get_dummy_inputs()
- images = sd_pipe(
- **inputs, num_images_per_prompt=num_images_per_prompt).images
+ images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images
assert images.shape == (num_images_per_prompt, 64, 64, 3)
batch_size = 2
inputs = self.get_dummy_inputs()
inputs["prompt"] = [inputs["prompt"]] * batch_size
- images = sd_pipe(
- **inputs, num_images_per_prompt=num_images_per_prompt).images
+ images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images
assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3)
# Non-determinism caused by the scheduler optimizing the latent inputs during inference
@@ -245,14 +263,12 @@ def tearDown(self):
@classmethod
def setUpClass(cls):
cls.source_embeds = to_paddle(
- load_pt(
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt"
- ))
+ load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt")
+ )
cls.target_embeds = to_paddle(
- load_pt(
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt"
- ))
+ load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt")
+ )
def get_inputs(self, seed=0):
generator = paddle.Generator().manual_seed(seed=seed)
@@ -272,46 +288,48 @@ def test_stable_diffusion_pix2pix_zero_default(self):
pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.8129883,
- 0.81933594,
- 0.80371094,
- 0.8105469,
- 0.8076172,
- 0.80566406,
- 0.81884766,
- 0.8330078,
- 0.82470703,
- ])
+ expected_slice = np.array(
+ [
+ 0.8129883,
+ 0.81933594,
+ 0.80371094,
+ 0.8105469,
+ 0.8076172,
+ 0.80566406,
+ 0.81884766,
+ 0.8330078,
+ 0.82470703,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.05
def test_stable_diffusion_pix2pix_zero_k_lms(self):
pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array(
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711])
+ expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711])
assert np.abs(expected_slice - image_slice).max() < 0.05
def test_stable_diffusion_pix2pix_zero_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -319,42 +337,45 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.93444633,
- 1.1613252,
- 0.7700033,
- 0.18847837,
- -1.17147,
- 0.07546477,
- 0.06142269,
- -0.8030814,
- -0.59692276,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.93444633,
+ 1.1613252,
+ 0.7700033,
+ 0.18847837,
+ -1.17147,
+ 0.07546477,
+ 0.06142269,
+ -0.8030814,
+ -0.59692276,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.93180454,
- 1.1606954,
- 0.7721853,
- 0.18454231,
- -1.1679069,
- 0.07357024,
- 0.06213593,
- -0.80399096,
- -0.5937987,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.93180454,
+ 1.1606954,
+ 0.7721853,
+ 0.18454231,
+ -1.1679069,
+ 0.07357024,
+ 0.06213593,
+ -0.80399096,
+ -0.5937987,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
@@ -385,38 +406,29 @@ def test_stable_diffusion_pix2pix_inversion(self):
pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
- pipe.inverse_scheduler = DDIMScheduler.from_config(
- pipe.scheduler.config)
- pipe.inverse_scheduler = DDIMInverseScheduler.from_config(
- pipe.scheduler.config)
+ paddle_dtype=paddle.float16,
+ )
+ pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
caption = "a photography of a cat with flowers"
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- output = pipe.invert(
- caption,
- image=self.raw_image,
- generator=generator,
- num_inference_steps=10)
+ output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10)
inv_latents = output[0]
image_slice = inv_latents[0, -3:, -3:, -1].flatten()
assert tuple(inv_latents.shape) == (1, 4, 64, 64)
- expected_slice = np.array([
- 0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498,
- -0.8599
- ])
+ expected_slice = np.array([0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, -0.8599])
assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 0.05
def test_stable_diffusion_pix2pix_full(self):
pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
- pipe.inverse_scheduler = DDIMScheduler.from_config(
- pipe.scheduler.config)
- pipe.inverse_scheduler = DDIMInverseScheduler.from_config(
- pipe.scheduler.config)
+ paddle_dtype=paddle.float16,
+ )
+ pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
caption = "a photography of a cat with flowers"
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
@@ -444,19 +456,22 @@ def test_stable_diffusion_pix2pix_full(self):
generator=generator,
latents=inv_latents,
negative_prompt=caption,
- output_type="np", ).images
+ output_type="np",
+ ).images
image_slice = image[0, -3:, -3:, -1].flatten()
- expected_slice = np.array([
- 0.64208984375,
- 0.65673828125,
- 0.650390625,
- 0.6513671875,
- 0.646484375,
- 0.6650390625,
- 0.6513671875,
- 0.6640625,
- 0.66796875,
- ])
+ expected_slice = np.array(
+ [
+ 0.64208984375,
+ 0.65673828125,
+ 0.650390625,
+ 0.6513671875,
+ 0.646484375,
+ 0.6650390625,
+ 0.6513671875,
+ 0.6640625,
+ 0.66796875,
+ ]
+ )
max_diff = np.abs(image_slice - expected_slice).max()
assert max_diff < 0.05
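Several of the suites above (inpaint legacy, pix2pix-zero) also check output shapes as the prompt batch size and num_images_per_prompt vary; the expected image count is always batch_size * num_images_per_prompt. A toy sketch of that arithmetic (fake_pipeline is a placeholder, not an API from ppdiffusers):

    import numpy as np

    def fake_pipeline(prompt, num_images_per_prompt=1, height=32, width=32):
        # Placeholder mirroring only the output-shape contract the tests assert on.
        batch_size = len(prompt) if isinstance(prompt, list) else 1
        return np.zeros((batch_size * num_images_per_prompt, height, width, 3))

    assert fake_pipeline("a prompt").shape == (1, 32, 32, 3)
    assert fake_pipeline(["a", "b"]).shape == (2, 32, 32, 3)
    assert fake_pipeline("a prompt", num_images_per_prompt=2).shape == (2, 32, 32, 3)
    assert fake_pipeline(["a", "b"], num_images_per_prompt=2).shape == (4, 32, 32, 3)
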
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
index d04d08d9bb18f..aa60def2d023c 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
@@ -20,8 +20,12 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- StableDiffusionSAGPipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ StableDiffusionSAGPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -29,8 +33,7 @@
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionSAGPipeline
test_cpu_offload = False
params = TEXT_TO_IMAGE_PARAMS
@@ -46,13 +49,15 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -60,7 +65,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -71,10 +77,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -109,8 +115,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_stable_diffusion_1(self):
- sag_pipe = StableDiffusionSAGPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4")
+ sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
sag_pipe.set_progress_bar_config(disable=None)
prompt = "."
generator = paddle.Generator().manual_seed(0)
@@ -120,26 +125,28 @@ def test_stable_diffusion_1(self):
guidance_scale=7.5,
sag_scale=1.0,
num_inference_steps=20,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.7477613,
- 0.76045597,
- 0.7464366,
- 0.778965,
- 0.75718963,
- 0.7487634,
- 0.77530396,
- 0.77426934,
- 0.7749926,
- ])
+ expected_slice = np.array(
+ [
+ 0.7477613,
+ 0.76045597,
+ 0.7464366,
+ 0.778965,
+ 0.75718963,
+ 0.7487634,
+ 0.77530396,
+ 0.77426934,
+ 0.7749926,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_stable_diffusion_2(self):
- sag_pipe = StableDiffusionSAGPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1-base")
+ sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
sag_pipe.set_progress_bar_config(disable=None)
prompt = "."
generator = paddle.Generator().manual_seed(0)
@@ -149,19 +156,22 @@ def test_stable_diffusion_2(self):
guidance_scale=7.5,
sag_scale=1.0,
num_inference_steps=20,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.8771595,
- 0.8521123,
- 0.8644101,
- 0.8680052,
- 0.8700466,
- 0.8897612,
- 0.87766427,
- 0.8636212,
- 0.86829203,
- ])
+ expected_slice = np.array(
+ [
+ 0.8771595,
+ 0.8521123,
+ 0.8644101,
+ 0.8680052,
+ 0.8700466,
+ 0.8897612,
+ 0.87766427,
+ 0.8636212,
+ 0.86829203,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index 07d1870d2afd5..1e95848760207 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -21,10 +21,17 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from ppdiffusers import (
- AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline,
- UNet2DConditionModel, logging)
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+ logging,
+)
from ppdiffusers.utils import load_numpy, nightly, slow
from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu
@@ -49,13 +56,15 @@ def get_dummy_components(self):
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_head_dim=(2, 4),
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -64,7 +73,8 @@ def get_dummy_components(self):
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
- sample_size=128, )
+ sample_size=128,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -77,10 +87,10 @@ def get_dummy_components(self):
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
- projection_dim=512, )
+ projection_dim=512,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -112,17 +122,19 @@ def test_stable_diffusion_ddim(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.3505131,
- 0.36318004,
- 0.39201266,
- 0.12107915,
- 0.27704653,
- 0.40363187,
- 0.09379572,
- 0.16225743,
- 0.36048344,
- ])
+ expected_slice = np.array(
+ [
+ 0.3505131,
+ 0.36318004,
+ 0.39201266,
+ 0.12107915,
+ 0.27704653,
+ 0.40363187,
+ 0.09379572,
+ 0.16225743,
+ 0.36048344,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_pndm(self):
@@ -134,122 +146,127 @@ def test_stable_diffusion_pndm(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.25144678,
- 0.35438284,
- 0.3613463,
- 0.11020249,
- 0.3101831,
- 0.42739886,
- 0.1142821,
- 0.17371863,
- 0.35148838,
- ])
+ expected_slice = np.array(
+ [
+ 0.25144678,
+ 0.35438284,
+ 0.3613463,
+ 0.11020249,
+ 0.3101831,
+ 0.42739886,
+ 0.1142821,
+ 0.17371863,
+ 0.35148838,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_k_lms(self):
components = self.get_dummy_components()
- components["scheduler"] = LMSDiscreteScheduler.from_config(components[
- "scheduler"].config)
+ components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.3676631,
- 0.38155898,
- 0.4023114,
- 0.11294425,
- 0.2891888,
- 0.40432304,
- 0.08882684,
- 0.1466648,
- 0.33633134,
- ])
+ expected_slice = np.array(
+ [
+ 0.3676631,
+ 0.38155898,
+ 0.4023114,
+ 0.11294425,
+ 0.2891888,
+ 0.40432304,
+ 0.08882684,
+ 0.1466648,
+ 0.33633134,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_k_euler_ancestral(self):
components = self.get_dummy_components()
- components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(
- components["scheduler"].config)
+ components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.36797395,
- 0.38137895,
- 0.40199342,
- 0.11330777,
- 0.2886864,
- 0.40422022,
- 0.08929691,
- 0.14658183,
- 0.3363046,
- ])
+ expected_slice = np.array(
+ [
+ 0.36797395,
+ 0.38137895,
+ 0.40199342,
+ 0.11330777,
+ 0.2886864,
+ 0.40422022,
+ 0.08929691,
+ 0.14658183,
+ 0.3363046,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_k_euler(self):
components = self.get_dummy_components()
- components["scheduler"] = EulerDiscreteScheduler.from_config(components[
- "scheduler"].config)
+ components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.36766386,
- 0.3815591,
- 0.40231153,
- 0.11294428,
- 0.28918856,
- 0.40432304,
- 0.08882678,
- 0.14666462,
- 0.3363313,
- ])
+ expected_slice = np.array(
+ [
+ 0.36766386,
+ 0.3815591,
+ 0.40231153,
+ 0.11294428,
+ 0.28918856,
+ 0.40432304,
+ 0.08882678,
+ 0.14666462,
+ 0.3363313,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_long_prompt(self):
components = self.get_dummy_components()
- components["scheduler"] = LMSDiscreteScheduler.from_config(components[
- "scheduler"].config)
+ components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
sd_pipe = StableDiffusionPipeline(**components)
sd_pipe.set_progress_bar_config(disable=None)
do_classifier_free_guidance = True
negative_prompt = None
num_images_per_prompt = 1
- logger = logging.get_logger(
- "ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+ logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
prompt = 25 * "@"
with CaptureLogger(logger) as cap_logger_3:
text_embeddings_3 = sd_pipe._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt, )
+ negative_prompt,
+ )
prompt = 100 * "@"
with CaptureLogger(logger) as cap_logger:
text_embeddings = sd_pipe._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt, )
+ negative_prompt,
+ )
negative_prompt = "Hello"
with CaptureLogger(logger) as cap_logger_2:
text_embeddings_2 = sd_pipe._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt, )
- assert (text_embeddings_3.shape == text_embeddings_2.shape ==
- text_embeddings.shape)
+ negative_prompt,
+ )
+ assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
assert text_embeddings.shape[1] == 77
assert cap_logger.out == cap_logger_2.out
assert cap_logger.out.count("@") == 25
@@ -279,68 +296,71 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_stable_diffusion_default_ddim(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-base")
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.49493,
- 0.47896,
- 0.40798,
- 0.54214,
- 0.53212,
- 0.48202,
- 0.47656,
- 0.46329,
- 0.48506,
- ])
+ expected_slice = np.array(
+ [
+ 0.49493,
+ 0.47896,
+ 0.40798,
+ 0.54214,
+ 0.53212,
+ 0.48202,
+ 0.47656,
+ 0.46329,
+ 0.48506,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_pndm(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-base")
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.49493,
- 0.47896,
- 0.40798,
- 0.54214,
- 0.53212,
- 0.48202,
- 0.47656,
- 0.46329,
- 0.48506,
- ])
+ expected_slice = np.array(
+ [
+ 0.49493,
+ 0.47896,
+ 0.40798,
+ 0.54214,
+ 0.53212,
+ 0.48202,
+ 0.47656,
+ 0.46329,
+ 0.48506,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
def test_stable_diffusion_k_lms(self):
- pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-base")
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.1044,
- 0.13115,
- 0.111,
- 0.10141,
- 0.1144,
- 0.07215,
- 0.11332,
- 0.09693,
- 0.10006,
- ])
+ expected_slice = np.array(
+ [
+ 0.1044,
+ 0.13115,
+ 0.111,
+ 0.10141,
+ 0.1144,
+ 0.07215,
+ 0.11332,
+ 0.09693,
+ 0.10006,
+ ]
+ )
assert np.abs(image_slice - expected_slice).max() < 0.0001
# def test_stable_diffusion_attention_slicing(self):
@@ -363,8 +383,7 @@ def test_stable_diffusion_k_lms(self):
def test_stable_diffusion_text2img_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -372,40 +391,43 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.3862,
- -0.4507,
- -1.1729,
- 0.0686,
- -1.1045,
- 0.7124,
- -1.8301,
- 0.1903,
- 1.2773,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ -0.3862,
+ -0.4507,
+ -1.1729,
+ 0.0686,
+ -1.1045,
+ 0.7124,
+ -1.8301,
+ 0.1903,
+ 1.2773,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 64)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.272,
- -0.1863,
- -0.7383,
- -0.5029,
- -0.7534,
- 0.397,
- -0.7646,
- 0.4468,
- 1.2686,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ 0.272,
+ -0.1863,
+ -0.7383,
+ -0.5029,
+ -0.7534,
+ 0.397,
+ -0.7646,
+ 0.4468,
+ 1.2686,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
callback_fn.has_been_called = False
pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16)
+ "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(dtype="float16")
@@ -437,8 +459,7 @@ def get_inputs(self, dtype="float32", seed=0):
return inputs
def test_stable_diffusion_2_0_default_ddim(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-base")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
@@ -449,8 +470,7 @@ def test_stable_diffusion_2_0_default_ddim(self):
assert max_diff < 0.01
def test_stable_diffusion_2_1_default_pndm(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1-base")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
@@ -461,8 +481,7 @@ def test_stable_diffusion_2_1_default_pndm(self):
assert max_diff < 0.01
def test_stable_diffusion_ddim(self): # not pass
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1-base")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
@@ -474,10 +493,8 @@ def test_stable_diffusion_ddim(self): # not pass
assert max_diff < 0.01
def test_stable_diffusion_lms(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1-base")
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
@@ -488,10 +505,8 @@ def test_stable_diffusion_lms(self):
assert max_diff < 0.01
def test_stable_diffusion_euler(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1-base")
- sd_pipe.scheduler = EulerDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+ sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
image = sd_pipe(**inputs).images[0]
@@ -502,10 +517,8 @@ def test_stable_diffusion_euler(self):
assert max_diff < 0.01
def test_stable_diffusion_dpm(self): # not pass
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1-base")
- sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
+ sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs()
inputs["num_inference_steps"] = 25
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index 56aa066eb5a02..c63bfcf099735 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -20,9 +20,12 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- StableDiffusionAttendAndExcitePipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ StableDiffusionAttendAndExcitePipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import load_numpy, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -30,8 +33,7 @@
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionAttendAndExcitePipeline
test_attention_slicing = False
params = TEXT_TO_IMAGE_PARAMS
@@ -49,13 +51,15 @@ def get_dummy_components(self):
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_head_dim=(2, 4),
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -64,7 +68,8 @@ def get_dummy_components(self):
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
- sample_size=128, )
+ sample_size=128,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -77,10 +82,10 @@ def get_dummy_components(self):
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
- projection_dim=512, )
+ projection_dim=512,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -103,9 +108,7 @@ def get_dummy_inputs(self, seed=0):
"guidance_scale": 6.0,
"output_type": "numpy",
"max_iter_to_alter": 2,
- "thresholds": {
- (0): 0.7
- },
+ "thresholds": {(0): 0.7},
}
return inputs
@@ -117,17 +120,19 @@ def test_inference(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
self.assertEqual(image.shape, (1, 64, 64, 3))
- expected_slice = np.array([
- 0.33271241188049316,
- 0.3123358190059662,
- 0.44427454471588135,
- 0.08615309000015259,
- 0.26107650995254517,
- 0.4551312029361725,
- 0.06545555591583252,
- 0.1626836657524109,
- 0.3982071578502655,
- ])
+ expected_slice = np.array(
+ [
+ 0.33271241188049316,
+ 0.3123358190059662,
+ 0.44427454471588135,
+ 0.08615309000015259,
+ 0.26107650995254517,
+ 0.4551312029361725,
+ 0.06545555591583252,
+ 0.1626836657524109,
+ 0.3982071578502655,
+ ]
+ )
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 0.001)
@@ -149,7 +154,8 @@ def test_attend_and_excite_fp16(self):
pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
prompt = "a painting of an elephant with glasses"
token_indices = [5, 7]
@@ -160,7 +166,8 @@ def test_attend_and_excite_fp16(self):
generator=generator,
num_inference_steps=5,
max_iter_to_alter=5,
- output_type="numpy", ).images[0]
+ output_type="numpy",
+ ).images[0]
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
)
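The attend-and-excite fast tests, like the others above, only stay meaningful because every source of randomness is pinned: `paddle.seed(0)` before each dummy submodel fixes weight initialization, and a dedicated `paddle.Generator().manual_seed(seed)` fixes the sampling noise handed to the pipeline. A small sketch of the global-seed half of that idiom — it assumes a working Paddle install and uses an arbitrary tensor shape:

```python
import paddle

def draw_with_seed(seed: int):
    paddle.seed(seed)                  # same call the tests make before building each dummy model
    return paddle.randn([1, 4, 8, 8])  # stand-in for weight init / latent noise

a = draw_with_seed(0)
b = draw_with_seed(0)
# Re-seeding reproduces the draw, which is what keeps the hard-coded
# expected slices in these tests stable across runs.
assert bool(paddle.allclose(a, b))
```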
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index 077ed16dba212..240b7ae56d4da 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -20,30 +20,39 @@
import numpy as np
import paddle
-from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel,
- CLIPTokenizer, DPTConfig,
- DPTForDepthEstimation, DPTImageProcessor)
+from paddlenlp.transformers import (
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTokenizer,
+ DPTConfig,
+ DPTForDepthEstimation,
+ DPTImageProcessor,
+)
from PIL import Image
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler, StableDiffusionDepth2ImgPipeline,
- UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionDepth2ImgPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, nightly, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionDepth2ImgPipeline
test_save_load_optional_components = False
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "latents"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
def get_dummy_components(self):
@@ -58,7 +67,8 @@ def get_dummy_components(self):
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_head_dim=(2, 4),
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -67,7 +77,8 @@ def get_dummy_components(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -78,10 +89,10 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
backbone_config = {
"global_padding": "same",
"layer_type": "bottleneck",
@@ -107,10 +118,10 @@ def get_dummy_components(self):
initializer_range=0.02,
is_hybrid=True,
backbone_config=backbone_config,
- backbone_featmap_shape=[1, 384, 24, 24], )
+ backbone_featmap_shape=[1, 384, 24, 24],
+ )
depth_estimator = DPTForDepthEstimation(depth_estimator_config)
- feature_extractor = DPTImageProcessor.from_pretrained(
- "hf-internal-testing/tiny-random-DPTForDepthEstimation")
+ feature_extractor = DPTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-DPTForDepthEstimation")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -146,8 +157,7 @@ def test_save_load_local(self):
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
- pipe_loaded = self.pipeline_class.from_pretrained(
- tmpdir, from_diffusers=False)
+ pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False)
pipe_loaded.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
output_loaded = pipe_loaded(**inputs)[0]
@@ -215,17 +225,19 @@ def test_stable_diffusion_depth2img_default_case(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.35397637,
- 0.23190483,
- 0.20131412,
- 0.27374774,
- 0.265134,
- 0.4502194,
- 0.26852018,
- 0.37504935,
- 0.43135768,
- ])
+ expected_slice = np.array(
+ [
+ 0.35397637,
+ 0.23190483,
+ 0.20131412,
+ 0.27374774,
+ 0.265134,
+ 0.4502194,
+ 0.26852018,
+ 0.37504935,
+ 0.43135768,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_depth2img_negative_prompt(self):
@@ -238,17 +250,19 @@ def test_stable_diffusion_depth2img_negative_prompt(self):
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.40259343,
- 0.37764466,
- 0.3936328,
- 0.3628915,
- 0.48100996,
- 0.59685427,
- 0.22927544,
- 0.45186657,
- 0.46950823,
- ])
+ expected_slice = np.array(
+ [
+ 0.40259343,
+ 0.37764466,
+ 0.3936328,
+ 0.3628915,
+ 0.48100996,
+ 0.59685427,
+ 0.22927544,
+ 0.45186657,
+ 0.46950823,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_depth2img_multiple_init_images(self):
@@ -261,17 +275,19 @@ def test_stable_diffusion_depth2img_multiple_init_images(self):
image = pipe(**inputs).images
image_slice = image[-1, -3:, -3:, -1]
assert image.shape == (2, 32, 32, 3)
- expected_slice = np.array([
- 0.8169553,
- 0.4573238,
- 0.27039874,
- 0.60622,
- 0.35670877,
- 0.39508212,
- 0.56803817,
- 0.5341117,
- 0.44428858,
- ])
+ expected_slice = np.array(
+ [
+ 0.8169553,
+ 0.4573238,
+ 0.27039874,
+ 0.60622,
+ 0.35670877,
+ 0.39508212,
+ 0.56803817,
+ 0.5341117,
+ 0.44428858,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
def test_stable_diffusion_depth2img_num_images_per_prompt(self):
@@ -288,14 +304,12 @@ def test_stable_diffusion_depth2img_num_images_per_prompt(self):
assert images.shape == (batch_size, 32, 32, 3)
num_images_per_prompt = 2
inputs = self.get_dummy_inputs()
- images = pipe(
- **inputs, num_images_per_prompt=num_images_per_prompt).images
+ images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images
assert images.shape == (num_images_per_prompt, 32, 32, 3)
batch_size = 2
inputs = self.get_dummy_inputs()
inputs["prompt"] = [inputs["prompt"]] * batch_size
- images = pipe(
- **inputs, num_images_per_prompt=num_images_per_prompt).images
+ images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images
assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
def test_stable_diffusion_depth2img_pil(self):
@@ -305,17 +319,19 @@ def test_stable_diffusion_depth2img_pil(self):
inputs = self.get_dummy_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.35397637,
- 0.23190483,
- 0.20131412,
- 0.27374774,
- 0.265134,
- 0.4502194,
- 0.26852018,
- 0.37504935,
- 0.43135768,
- ])
+ expected_slice = np.array(
+ [
+ 0.35397637,
+ 0.23190483,
+ 0.20131412,
+ 0.27374774,
+ 0.265134,
+ 0.4502194,
+ 0.26852018,
+ 0.37504935,
+ 0.43135768,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001
@@ -345,7 +361,8 @@ def get_inputs(self, dtype="float32", seed=0):
def test_stable_diffusion_depth2img_pipeline_default(self):
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-depth", safety_checker=None)
+ "stabilityai/stable-diffusion-2-depth", safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
@@ -353,22 +370,25 @@ def test_stable_diffusion_depth2img_pipeline_default(self):
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 480, 640, 3)
# expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.826, 0.7747, 0.7421])
- expected_slice = np.array([
- 0.75446224,
- 0.746921,
- 0.7595095,
- 0.8161169,
- 0.8059271,
- 0.7999228,
- 0.9052905,
- 0.879215,
- 0.8690305,
- ])
+ expected_slice = np.array(
+ [
+ 0.75446224,
+ 0.746921,
+ 0.7595095,
+ 0.8161169,
+ 0.8059271,
+ 0.7999228,
+ 0.9052905,
+ 0.879215,
+ 0.8690305,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.1
def test_stable_diffusion_depth2img_pipeline_k_lms(self):
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-depth", safety_checker=None)
+ "stabilityai/stable-diffusion-2-depth", safety_checker=None
+ )
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -377,22 +397,25 @@ def test_stable_diffusion_depth2img_pipeline_k_lms(self):
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 480, 640, 3)
# expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.637, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306])
- expected_slice = np.array([
- 0.6395747,
- 0.64879197,
- 0.6566683,
- 0.6438427,
- 0.6707787,
- 0.63587487,
- 0.66576767,
- 0.62180007,
- 0.6628648,
- ])
+ expected_slice = np.array(
+ [
+ 0.6395747,
+ 0.64879197,
+ 0.6566683,
+ 0.6438427,
+ 0.6707787,
+ 0.63587487,
+ 0.66576767,
+ 0.62180007,
+ 0.6628648,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.1
def test_stable_diffusion_depth2img_pipeline_ddim(self):
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-depth", safety_checker=None)
+ "stabilityai/stable-diffusion-2-depth", safety_checker=None
+ )
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
@@ -401,25 +424,26 @@ def test_stable_diffusion_depth2img_pipeline_ddim(self):
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 480, 640, 3)
# expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.642, 0.6522, 0.6555, 0.6436])
- expected_slice = np.array([
- 0.6283968,
- 0.6419119,
- 0.6295293,
- 0.63652724,
- 0.6420511,
- 0.61574477,
- 0.62251365,
- 0.65826833,
- 0.6480877,
- ])
+ expected_slice = np.array(
+ [
+ 0.6283968,
+ 0.6419119,
+ 0.6295293,
+ 0.63652724,
+ 0.6420511,
+ 0.61574477,
+ 0.62251365,
+ 0.65826833,
+ 0.6480877,
+ ]
+ )
assert np.abs(expected_slice - image_slice).max() < 0.15
def test_stable_diffusion_depth2img_intermediate_state(self):
number_of_steps = 0
- def callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -427,25 +451,27 @@ def callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 60, 80)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -1.148,
- -0.2147,
- -0.618,
- -2.48,
- -2.348,
- 0.3945,
- -2.05,
- -1.566,
- -1.52,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.1
+ expected_slice = np.array(
+ [
+ -1.148,
+ -0.2147,
+ -0.618,
+ -2.48,
+ -2.348,
+ 0.3945,
+ -2.05,
+ -1.566,
+ -1.52,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.1
callback_fn.has_been_called = False
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-depth",
safety_checker=None,
- paddle_dtype=paddle.float16, )
+ paddle_dtype=paddle.float16,
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(dtype="float16")
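The intermediate-state test at the end of the depth2img file relies on a small closure idiom: the callback marks that it ran through a function attribute, counts invocations through a `nonlocal` variable, and inspects the latents only at selected steps. A pure-Python sketch of that idiom, with a made-up loop standing in for the diffusion pipeline:

```python
def fake_denoising_loop(num_steps, callback, callback_steps=1):
    # Stand-in for the pipeline's sampling loop; the real one passes the current latents.
    for step in range(num_steps):
        if step % callback_steps == 0:
            callback(step, timestep=num_steps - step, latents=None)

def run_test():
    number_of_steps = 0

    def callback_fn(step, timestep, latents):
        callback_fn.has_been_called = True  # function attribute, checked after the run
        nonlocal number_of_steps            # counter shared with the enclosing test
        number_of_steps += 1
        # A real test would assert on latents slices at specific steps here.

    callback_fn.has_been_called = False
    fake_denoising_loop(20, callback_fn, callback_steps=1)
    assert callback_fn.has_been_called
    assert number_of_steps == 20

run_test()
```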
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index 6e0d5f33a5bdc..a926f2ed14718 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -22,18 +22,23 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from PIL import Image
-from ppdiffusers import (AutoencoderKL, PNDMScheduler,
- StableDiffusionInpaintPipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ PNDMScheduler,
+ StableDiffusionInpaintPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image
from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_INPAINTING_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
@@ -50,7 +55,8 @@ def get_dummy_components(self):
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_head_dim=(2, 4),
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
scheduler = PNDMScheduler(skip_prk_steps=True)
paddle.seed(0)
vae = AutoencoderKL(
@@ -60,7 +66,8 @@ def get_dummy_components(self):
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
- sample_size=128, )
+ sample_size=128,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -73,10 +80,10 @@ def get_dummy_components(self):
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
- projection_dim=512, )
+ projection_dim=512,
+ )
text_encoder = CLIPTextModel(text_encoder_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -91,11 +98,8 @@ def get_dummy_components(self):
def get_dummy_inputs(self, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
image = image.cpu().transpose(perm=[0, 2, 3, 1])[0]
- init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (64, 64))
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (64, 64)))
+ init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
generator = paddle.Generator().manual_seed(seed)
inputs = {
@@ -117,17 +121,19 @@ def test_stable_diffusion_inpaint(self):
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.58470726,
- 0.49302375,
- 0.3954028,
- 0.4068969,
- 0.33668613,
- 0.50350493,
- 0.34411103,
- 0.25261122,
- 0.4531455,
- ])
+ expected_slice = np.array(
+ [
+ 0.58470726,
+ 0.49302375,
+ 0.3954028,
+ 0.4068969,
+ 0.33668613,
+ 0.50350493,
+ 0.34411103,
+ 0.25261122,
+ 0.4531455,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
@@ -151,8 +157,7 @@ def test_stable_diffusion_inpaint_pipeline(self):
# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/yellow_cat_sitting_on_a_park_bench.npy'
# )
model_id = "stabilityai/stable-diffusion-2-inpainting"
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
- model_id, safety_checker=None)
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
@@ -162,7 +167,8 @@ def test_stable_diffusion_inpaint_pipeline(self):
image=init_image,
mask_image=mask_image,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
assert image.shape == (512, 512, 3)
image = image[-3:, -3:, -1]
@@ -186,7 +192,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
# )
model_id = "stabilityai/stable-diffusion-2-inpainting"
pipe = StableDiffusionInpaintPipeline.from_pretrained(
- model_id, paddle_dtype=paddle.float16, safety_checker=None)
+ model_id, paddle_dtype=paddle.float16, safety_checker=None
+ )
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
@@ -196,7 +203,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
image=init_image,
mask_image=mask_image,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
assert image.shape == (512, 512, 3)
image = image[-3:, -3:, -1]
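The inpainting fast tests derive both inputs from one random tensor: cast to uint8, wrap in a PIL image, and resize to 64x64, with the mask using the same data shifted by a constant. A standalone sketch with NumPy and Pillow; the random array below is a made-up stand-in for the `floats_tensor((1, 3, 32, 32))` output after its transpose to HWC:

```python
import numpy as np
from PIL import Image

rng = np.random.default_rng(0)
image = rng.random((32, 32, 3)).astype("float32")  # HWC float image in [0, 1)

# Same construction as get_dummy_inputs: cast, wrap, convert, resize.
init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))

assert init_image.size == (64, 64)
assert mask_image.size == (64, 64)
```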
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 0224ae1e8b294..ec93a578bbaf2 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -21,19 +21,24 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, EulerDiscreteScheduler,
- StableDiffusionLatentUpscalePipeline,
- StableDiffusionPipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ EulerDiscreteScheduler,
+ StableDiffusionLatentUpscalePipeline,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionLatentUpscalePipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
"height",
@@ -42,9 +47,7 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin,
"negative_prompt_embeds",
"prompt_embeds",
}
- required_optional_params = PipelineTesterMixin.required_optional_params - {
- "num_images_per_prompt"
- }
+ required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
test_cpu_offload = False
@@ -53,8 +56,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 4
sizes = 16, 16
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
def get_dummy_components(self):
@@ -72,7 +74,8 @@ def get_dummy_components(self):
"KDownBlock2D",
"KCrossAttnDownBlock2D",
"KCrossAttnDownBlock2D",
- "KCrossAttnDownBlock2D", ),
+ "KCrossAttnDownBlock2D",
+ ),
in_channels=8,
mid_block_type=None,
only_cross_attention=False,
@@ -84,7 +87,9 @@ def get_dummy_components(self):
"KCrossAttnUpBlock2D",
"KCrossAttnUpBlock2D",
"KCrossAttnUpBlock2D",
- "KUpBlock2D", ), )
+ "KUpBlock2D",
+ ),
+ )
vae = AutoencoderKL(
block_out_channels=[32, 32, 64, 64],
in_channels=3,
@@ -101,7 +106,8 @@ def get_dummy_components(self):
"UpDecoderBlock2D",
"UpDecoderBlock2D",
],
- latent_channels=4, )
+ latent_channels=4,
+ )
scheduler = EulerDiscreteScheduler(prediction_type="sample")
text_config = CLIPTextConfig(
bos_token_id=0,
@@ -114,10 +120,10 @@ def get_dummy_components(self):
pad_token_id=1,
vocab_size=1000,
hidden_act="quick_gelu",
- projection_dim=512, )
+ projection_dim=512,
+ )
text_encoder = CLIPTextModel(text_config).eval()
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": model.eval(),
"vae": vae.eval(),
@@ -147,17 +153,19 @@ def test_inference(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
self.assertEqual(image.shape, (1, 256, 256, 3))
- expected_slice = np.array([
- 0.5665861368179321,
- 0.7449524402618408,
- 0.0,
- 0.1325536072254181,
- 0.4274534583091736,
- 0.0,
- 0.0,
- 0.14426982402801514,
- 0.0,
- ])
+ expected_slice = np.array(
+ [
+ 0.5665861368179321,
+ 0.7449524402618408,
+ 0.0,
+ 0.1325536072254181,
+ 0.4274534583091736,
+ 0.0,
+ 0.0,
+ 0.14426982402801514,
+ 0.0,
+ ]
+ )
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 0.001)
@@ -175,25 +183,23 @@ def tearDown(self):
def test_latent_upscaler_fp16(self):
generator = paddle.Generator().manual_seed(seed=33)
- pipe = StableDiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16)
+ pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16)
pipe.to("gpu")
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
- "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16)
+ "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16
+ )
upscaler.to("gpu")
- prompt = (
- "a photo of an astronaut high resolution, unreal engine, ultra realistic"
- )
- low_res_latents = pipe(
- prompt, generator=generator, output_type="latent").images
+ prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
+ low_res_latents = pipe(prompt, generator=generator, output_type="latent").images
image = upscaler(
prompt=prompt,
image=low_res_latents,
num_inference_steps=20,
guidance_scale=0,
generator=generator,
- output_type="np", ).images[0]
+ output_type="np",
+ ).images[0]
# invalid expected_image
# expected_image = load_numpy(
# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy"
@@ -209,7 +215,8 @@ def test_latent_upscaler_fp16(self):
def test_latent_upscaler_fp16_image(self):
generator = paddle.Generator().manual_seed(seed=33)
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
- "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16)
+ "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16
+ )
upscaler.to("gpu")
prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
@@ -222,7 +229,8 @@ def test_latent_upscaler_fp16_image(self):
num_inference_steps=20,
guidance_scale=0,
generator=generator,
- output_type="np", ).images[0]
+ output_type="np",
+ ).images[0]
# invalid expected_image
# expected_image = load_numpy(
# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy"
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
index ca4e467ebdca2..35a135bc747e3 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
@@ -22,8 +22,13 @@
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from PIL import Image
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- StableDiffusionUpscalePipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ StableDiffusionUpscalePipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import floats_tensor, load_image, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -39,8 +44,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = (32, 32)
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
@property
@@ -55,15 +59,16 @@ def dummy_cond_unet_upscale(self):
down_block_types=(
"DownBlock2D",
"CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D", ),
- up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D",
- "UpBlock2D"),
+ "CrossAttnDownBlock2D",
+ ),
+ up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
# SD2-specific config below
attention_head_dim=8,
use_linear_projection=True,
only_cross_attention=(True, True, False),
- num_class_embeds=100, )
+ num_class_embeds=100,
+ )
return model
@property
@@ -78,10 +83,9 @@ def dummy_vae(self):
"DownEncoderBlock2D",
"DownEncoderBlock2D",
],
- up_block_types=[
- "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"
- ],
- latent_channels=4, )
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
return model
@property
@@ -99,7 +103,8 @@ def dummy_text_encoder(self):
vocab_size=1000,
# SD2-specific config below
hidden_act="gelu",
- projection_dim=512, )
+ projection_dim=512,
+ )
return CLIPTextModel(config).eval()
def test_stable_diffusion_upscale(self):
@@ -108,11 +113,9 @@ def test_stable_diffusion_upscale(self):
scheduler = DDIMScheduler(prediction_type="v_prediction")
vae = self.dummy_vae
text_encoder = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0]
- low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (64, 64))
+ low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
sd_pipe = StableDiffusionUpscalePipeline(
unet=unet,
low_res_scheduler=low_res_scheduler,
@@ -120,7 +123,8 @@ def test_stable_diffusion_upscale(self):
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
- max_noise_level=350, )
+ max_noise_level=350,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -131,7 +135,8 @@ def test_stable_diffusion_upscale(self):
guidance_scale=6.0,
noise_level=20,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -142,26 +147,27 @@ def test_stable_diffusion_upscale(self):
noise_level=20,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
expected_height_width = low_res_image.size[0] * 4
- assert image.shape == (1, expected_height_width, expected_height_width,
- 3)
- expected_slice = np.array([
- 0.0,
- 0.0,
- 0.3616839,
- 0.0,
- 0.04877859,
- 0.59195685,
- 0.23902711,
- 0.00838843,
- 0.5172206,
- ])
+ assert image.shape == (1, expected_height_width, expected_height_width, 3)
+ expected_slice = np.array(
+ [
+ 0.0,
+ 0.0,
+ 0.3616839,
+ 0.0,
+ 0.04877859,
+ 0.59195685,
+ 0.23902711,
+ 0.00838843,
+ 0.5172206,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_upscale_batch(self):
unet = self.dummy_cond_unet_upscale
@@ -169,11 +175,9 @@ def test_stable_diffusion_upscale_batch(self):
scheduler = DDIMScheduler(prediction_type="v_prediction")
vae = self.dummy_vae
text_encoder = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0]
- low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (64, 64))
+ low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
sd_pipe = StableDiffusionUpscalePipeline(
unet=unet,
low_res_scheduler=low_res_scheduler,
@@ -181,7 +185,8 @@ def test_stable_diffusion_upscale_batch(self):
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
- max_noise_level=350, )
+ max_noise_level=350,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
output = sd_pipe(
@@ -190,7 +195,8 @@ def test_stable_diffusion_upscale_batch(self):
guidance_scale=6.0,
noise_level=20,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
assert image.shape[0] == 2
generator = paddle.Generator().manual_seed(0)
@@ -202,7 +208,8 @@ def test_stable_diffusion_upscale_batch(self):
guidance_scale=6.0,
noise_level=20,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
assert image.shape[0] == 2
@@ -213,11 +220,9 @@ def test_stable_diffusion_upscale_fp16(self):
scheduler = DDIMScheduler(prediction_type="v_prediction")
vae = self.dummy_vae
text_encoder = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0]
- low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize(
- (64, 64))
+ low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
unet = unet.to(dtype=paddle.float16)
text_encoder = text_encoder.to(dtype=paddle.float16)
sd_pipe = StableDiffusionUpscalePipeline(
@@ -227,7 +232,8 @@ def test_stable_diffusion_upscale_fp16(self):
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
- max_noise_level=350, )
+ max_noise_level=350,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -236,10 +242,10 @@ def test_stable_diffusion_upscale_fp16(self):
image=low_res_image,
generator=generator,
num_inference_steps=2,
- output_type="np", ).images
+ output_type="np",
+ ).images
expected_height_width = low_res_image.size[0] * 4
- assert image.shape == (1, expected_height_width, expected_height_width,
- 3)
+ assert image.shape == (1, expected_height_width, expected_height_width, 3)
@slow
@@ -264,8 +270,7 @@ def test_stable_diffusion_upscale_pipeline(self):
pipe.enable_attention_slicing()
prompt = "a cat sitting on a park bench"
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- prompt=prompt, image=image, generator=generator, output_type="np")
+ output = pipe(prompt=prompt, image=image, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (512, 512, 3)
image = image[-3:, -3:, -1]
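One invariant in the upscale tests above is purely geometric: the x4 upscaler returns an image whose side length is four times the low-resolution input, so the 64x64 dummy image must come back as a (1, 256, 256, 3) array. A trivial sketch of that shape check, with a zero array standing in for the pipeline output:

```python
import numpy as np
from PIL import Image

low_res_image = Image.new("RGB", (64, 64))          # stand-in for the resized dummy image
expected_height_width = low_res_image.size[0] * 4   # x4 super-resolution: 64 -> 256

# Fake output with the shape StableDiffusionUpscalePipeline is expected to produce.
image = np.zeros((1, expected_height_width, expected_height_width, 3), dtype="float32")
assert image.shape == (1, expected_height_width, expected_height_width, 3)
```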
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index daa755dc68597..b482ca6657633 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -20,9 +20,14 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- DPMSolverMultistepScheduler, EulerDiscreteScheduler,
- StableDiffusionPipeline, UNet2DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerDiscreteScheduler,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -46,7 +51,8 @@ def dummy_cond_unet(self):
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_head_dim=(2, 4),
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
return model
@property
@@ -59,7 +65,8 @@ def dummy_vae(self):
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
- sample_size=128, )
+ sample_size=128,
+ )
return model
@property
@@ -76,7 +83,8 @@ def dummy_text_encoder(self):
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
- projection_dim=64, )
+ projection_dim=64,
+ )
return CLIPTextModel(config).eval()
def test_stable_diffusion_v_pred_ddim(self):
@@ -87,11 +95,11 @@ def test_stable_diffusion_v_pred_ddim(self):
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
- prediction_type="v_prediction", )
+ prediction_type="v_prediction",
+ )
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -100,7 +108,8 @@ def test_stable_diffusion_v_pred_ddim(self):
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -109,7 +118,8 @@ def test_stable_diffusion_v_pred_ddim(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -118,24 +128,26 @@ def test_stable_diffusion_v_pred_ddim(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.36126757,
- 0.40778637,
- 0.36956796,
- 0.14816678,
- 0.25735706,
- 0.36562037,
- 0.1229952,
- 0.22826642,
- 0.4154452,
- ])
+ expected_slice = np.array(
+ [
+ 0.36126757,
+ 0.40778637,
+ 0.36956796,
+ 0.14816678,
+ 0.25735706,
+ 0.36562037,
+ 0.1229952,
+ 0.22826642,
+ 0.4154452,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_v_pred_k_euler(self):
unet = self.dummy_cond_unet
@@ -143,11 +155,11 @@ def test_stable_diffusion_v_pred_k_euler(self):
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
- prediction_type="v_prediction", )
+ prediction_type="v_prediction",
+ )
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -156,7 +168,8 @@ def test_stable_diffusion_v_pred_k_euler(self):
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -165,7 +178,8 @@ def test_stable_diffusion_v_pred_k_euler(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -174,24 +188,26 @@ def test_stable_diffusion_v_pred_k_euler(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.39991996,
- 0.45191997,
- 0.34044766,
- 0.2136086,
- 0.2758901,
- 0.31222183,
- 0.21658134,
- 0.34479994,
- 0.43742967,
- ])
+ expected_slice = np.array(
+ [
+ 0.39991996,
+ 0.45191997,
+ 0.34044766,
+ 0.2136086,
+ 0.2758901,
+ 0.31222183,
+ 0.21658134,
+ 0.34479994,
+ 0.43742967,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_v_pred_fp16(self):
"""Test that stable diffusion v-prediction works with fp16"""
@@ -202,11 +218,11 @@ def test_stable_diffusion_v_pred_fp16(self):
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
- prediction_type="v_prediction", )
+ prediction_type="v_prediction",
+ )
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
unet = unet.to(dtype=paddle.float16)
vae = vae.to(dtype=paddle.float16)
bert = bert.to(dtype=paddle.float16)
@@ -218,15 +234,12 @@ def test_stable_diffusion_v_pred_fp16(self):
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
- image = sd_pipe(
- [prompt],
- generator=generator,
- num_inference_steps=2,
- output_type="np").images
+ image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
assert image.shape == (1, 64, 64, 3)
@@ -239,8 +252,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_stable_diffusion_v_pred_default(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
sd_pipe.enable_attention_slicing()
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
@@ -250,26 +262,30 @@ def test_stable_diffusion_v_pred_default(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=20,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 768, 768, 3)
- expected_slice = np.array([
- 0.05667132,
- 0.05700234,
- 0.04156408,
- 0.04631725,
- 0.04327643,
- 0.06003231,
- 0.05165312,
- 0.05258191,
- 0.0865913,
- ])
+ expected_slice = np.array(
+ [
+ 0.05667132,
+ 0.05700234,
+ 0.04156408,
+ 0.04631725,
+ 0.04327643,
+ 0.06003231,
+ 0.05165312,
+ 0.05258191,
+ 0.0865913,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_v_pred_upcast_attention(self):
sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16)
+ "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16
+ )
sd_pipe.enable_attention_slicing()
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
@@ -279,52 +295,51 @@ def test_stable_diffusion_v_pred_upcast_attention(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=20,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 768, 768, 3)
- expected_slice = np.array([
- 0.04541016,
- 0.04516602,
- 0.05493164,
- 0.05078125,
- 0.04296875,
- 0.07275391,
- 0.06567383,
- 0.0534668,
- 0.04833984,
- ])
+ expected_slice = np.array(
+ [
+ 0.04541016,
+ 0.04516602,
+ 0.05493164,
+ 0.05078125,
+ 0.04296875,
+ 0.07275391,
+ 0.06567383,
+ 0.0534668,
+ 0.04833984,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05
def test_stable_diffusion_v_pred_euler(self):
- scheduler = EulerDiscreteScheduler.from_pretrained(
- "stabilityai/stable-diffusion-2", subfolder="scheduler")
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2", scheduler=scheduler)
+ scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
sd_pipe.enable_attention_slicing()
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
- output = sd_pipe(
- [prompt],
- generator=generator,
- num_inference_steps=5,
- output_type="numpy")
+ output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy")
image = output.images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 768, 768, 3)
- expected_slice = np.array([
- 0.03515199,
- 0.03756374,
- 0.05046153,
- 0.04240236,
- 0.05509549,
- 0.06556576,
- 0.04710263,
- 0.02758819,
- 0.05959105,
- ])
+ expected_slice = np.array(
+ [
+ 0.03515199,
+ 0.03756374,
+ 0.05046153,
+ 0.04240236,
+ 0.05509549,
+ 0.06556576,
+ 0.04710263,
+ 0.02758819,
+ 0.05959105,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_v_pred_dpm(self):
@@ -332,9 +347,9 @@ def test_stable_diffusion_v_pred_dpm(self):
TODO: update this test after making DPM compatible with V-prediction!
"""
scheduler = DPMSolverMultistepScheduler.from_pretrained(
- "stabilityai/stable-diffusion-2", subfolder="scheduler")
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2", scheduler=scheduler)
+ "stabilityai/stable-diffusion-2", subfolder="scheduler"
+ )
+ sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
sd_pipe.enable_attention_slicing()
sd_pipe.set_progress_bar_config(disable=None)
prompt = "a photograph of an astronaut riding a horse"
@@ -344,20 +359,23 @@ def test_stable_diffusion_v_pred_dpm(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=5,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 768, 768, 3)
- expected_slice = np.array([
- 0.20492354,
- 0.2115368,
- 0.2323401,
- 0.2415919,
- 0.25598443,
- 0.24843931,
- 0.25171167,
- 0.23580211,
- 0.23604062,
- ])
+ expected_slice = np.array(
+ [
+ 0.20492354,
+ 0.2115368,
+ 0.2323401,
+ 0.2415919,
+ 0.25598443,
+ 0.24843931,
+ 0.25171167,
+ 0.23580211,
+ 0.23604062,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
# def test_stable_diffusion_attention_slicing_v_pred(self):
@@ -387,30 +405,27 @@ def test_stable_diffusion_text2img_pipeline_v_pred_default(self):
# expected_image = load_numpy(
# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred.npy'
# )
- pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2")
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
pipe.enable_attention_slicing()
pipe.set_progress_bar_config(disable=None)
prompt = "astronaut riding a horse"
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- prompt=prompt,
- guidance_scale=7.5,
- generator=generator,
- output_type="np")
+ output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (768, 768, 3)
- expected_image = np.array([
- 0.26713198,
- 0.2630347,
- 0.25486767,
- 0.23375505,
- 0.24399692,
- 0.22363415,
- 0.24688962,
- 0.21346492,
- 0.23014635,
- ])
+ expected_image = np.array(
+ [
+ 0.26713198,
+ 0.2630347,
+ 0.25486767,
+ 0.23375505,
+ 0.24399692,
+ 0.22363415,
+ 0.24688962,
+ 0.21346492,
+ 0.23014635,
+ ]
+ )
image = image[-3:, -3:, -1].flatten()
assert np.abs(expected_image - image).max() < 0.075
@@ -419,37 +434,33 @@ def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self):
# expected_image = load_numpy(
# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy'
# )
- pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16)
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
prompt = "astronaut riding a horse"
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- prompt=prompt,
- guidance_scale=7.5,
- generator=generator,
- output_type="np")
+ output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (768, 768, 3)
- expected_image = np.array([
- 0.26220703,
- 0.25195312,
- 0.2434082,
- 0.22753906,
- 0.23632812,
- 0.21777344,
- 0.23901367,
- 0.20629883,
- 0.22192383,
- ])
+ expected_image = np.array(
+ [
+ 0.26220703,
+ 0.25195312,
+ 0.2434082,
+ 0.22753906,
+ 0.23632812,
+ 0.21777344,
+ 0.23901367,
+ 0.20629883,
+ 0.22192383,
+ ]
+ )
image = image[-3:, -3:, -1].flatten()
assert np.abs(expected_image - image).max() < 0.75
def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
number_of_steps = 0
- def test_callback_fn(step: int, timestep: int,
- latents: paddle.Tensor) -> None:
+ def test_callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None:
test_callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
@@ -457,40 +468,41 @@ def test_callback_fn(step: int, timestep: int,
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 96, 96)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.2542,
- -1.276,
- 0.426,
- -0.956,
- -1.173,
- -0.5884,
- 2.416,
- 0.1553,
- -1.21,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ -0.2542,
+ -1.276,
+ 0.426,
+ -0.956,
+ -1.173,
+ -0.5884,
+ 2.416,
+ 0.1553,
+ -1.21,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
elif step == 19:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 96, 96)
latents_slice = latents[0, -3:, -3:, -1]
- expected_slice = np.array([
- -0.959,
- -0.964,
- -0.614,
- 0.0977,
- -0.6953,
- -0.2343,
- 1.551,
- -0.03357,
- -0.11395,
- ])
- assert np.abs(latents_slice.flatten() - expected_slice).max(
- ) < 0.05
+ expected_slice = np.array(
+ [
+ -0.959,
+ -0.964,
+ -0.614,
+ 0.0977,
+ -0.6953,
+ -0.2343,
+ 1.551,
+ -0.03357,
+ -0.11395,
+ ]
+ )
+ assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05
test_callback_fn.has_been_called = False
- pipe = StableDiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16)
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
prompt = "Andromeda galaxy in a bottle"
@@ -501,6 +513,7 @@ def test_callback_fn(step: int, timestep: int,
guidance_scale=7.5,
generator=generator,
callback=test_callback_fn,
- callback_steps=1, )
+ callback_steps=1,
+ )
assert test_callback_fn.has_been_called
assert number_of_steps == 20
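
For reference, the per-step callback hook exercised by the test above can also be used directly; the following is a minimal, untested sketch. The checkpoint id, dtype, callback signature, and guidance scale mirror the test; the 20-step count is inferred from the test's final assertion, and the print body is purely illustrative.

import paddle
from ppdiffusers import StableDiffusionPipeline

# Minimal sketch, assuming the "stabilityai/stable-diffusion-2" checkpoint used in the test.
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16)
pipe.enable_attention_slicing()

def log_latents(step: int, timestep: int, latents: paddle.Tensor) -> None:
    # Invoked every `callback_steps` denoising steps with the current latents
    # (shape (1, 4, 96, 96) for a 768x768 output, as asserted in the test).
    print(step, timestep, latents.shape)

generator = paddle.Generator().manual_seed(0)
image = pipe(
    prompt="Andromeda galaxy in a bottle",
    num_inference_steps=20,
    guidance_scale=7.5,
    generator=generator,
    callback=log_latents,
    callback_steps=1,
).images[0]
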
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
index 2bfa1261d9065..b2bdac5b34ed7 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
@@ -22,10 +22,16 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler,
- PNDMScheduler, UNet2DConditionModel)
-from ppdiffusers.pipelines.stable_diffusion_safe import \
- StableDiffusionPipelineSafe as StableDiffusionPipeline
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ UNet2DConditionModel,
+)
+from ppdiffusers.pipelines.stable_diffusion_safe import (
+ StableDiffusionPipelineSafe as StableDiffusionPipeline,
+)
from ppdiffusers.utils import floats_tensor, nightly
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -41,8 +47,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = 32, 32
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
@property
@@ -56,7 +61,8 @@ def dummy_cond_unet(self):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
return model
@property
@@ -68,7 +74,8 @@ def dummy_vae(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
return model
@property
@@ -83,7 +90,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModel(config).eval()
@property
@@ -108,11 +116,11 @@ def test_safe_diffusion_ddim(self):
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -120,7 +128,8 @@ def test_safe_diffusion_ddim(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -129,7 +138,8 @@ def test_safe_diffusion_ddim(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -138,32 +148,33 @@ def test_safe_diffusion_ddim(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.28519452,
- 0.23807159,
- 0.38150585,
- 0.21930319,
- 0.26092738,
- 0.517212,
- 0.2563907,
- 0.2503956,
- 0.47978917,
- ])
+ expected_slice = np.array(
+ [
+ 0.28519452,
+ 0.23807159,
+ 0.38150585,
+ 0.21930319,
+ 0.26092738,
+ 0.517212,
+ 0.2563907,
+ 0.2503956,
+ 0.47978917,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_pndm(self):
unet = self.dummy_cond_unet
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -171,7 +182,8 @@ def test_stable_diffusion_pndm(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = paddle.Generator().manual_seed(0)
@@ -180,7 +192,8 @@ def test_stable_diffusion_pndm(self):
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = sd_pipe(
@@ -189,29 +202,31 @@ def test_stable_diffusion_pndm(self):
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
- return_dict=False, )[0]
+ return_dict=False,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 0.18763152,
- 0.24242553,
- 0.36067978,
- 0.21772456,
- 0.27213728,
- 0.5194623,
- 0.2227565,
- 0.2217454,
- 0.4453961,
- ])
+ expected_slice = np.array(
+ [
+ 0.18763152,
+ 0.24242553,
+ 0.36067978,
+ 0.21772456,
+ 0.27213728,
+ 0.5194623,
+ 0.2227565,
+ 0.2217454,
+ 0.4453961,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_stable_diffusion_no_safety_checker(self):
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-lms-pipe",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+ )
assert isinstance(pipe, StableDiffusionPipeline)
assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
assert pipe.safety_checker is None
@@ -219,8 +234,7 @@ def test_stable_diffusion_no_safety_checker(self):
assert image is not None
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe = StableDiffusionPipeline.from_pretrained(
- tmpdirname, from_diffusers=False)
+ pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False)
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
@@ -231,8 +245,7 @@ def test_stable_diffusion_fp16(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
unet = unet.to(dtype=paddle.float16)
vae = vae.to(dtype=paddle.float16)
bert = bert.to(dtype=paddle.float16)
@@ -243,11 +256,11 @@ def test_stable_diffusion_fp16(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
- image = sd_pipe(
- [prompt], num_inference_steps=2, output_type="np").images
+ image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
assert image.shape == (1, 64, 64, 3)
@@ -260,10 +273,8 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_harm_safe_stable_diffusion(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5", safety_checker=None)
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and children from bahnhof zoo, detailed "
seed = 4003660346
@@ -277,7 +288,8 @@ def test_harm_safe_stable_diffusion(self):
output_type="np",
width=512,
height=512,
- sld_guidance_scale=0, )
+ sld_guidance_scale=0,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -306,7 +318,8 @@ def test_harm_safe_stable_diffusion(self):
sld_warmup_steps=7,
sld_threshold=0.025,
sld_momentum_scale=0.5,
- sld_mom_beta=0.7, )
+ sld_mom_beta=0.7,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -324,10 +337,8 @@ def test_harm_safe_stable_diffusion(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_nudity_safe_stable_diffusion(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5", safety_checker=None)
- sd_pipe.scheduler = LMSDiscreteScheduler.from_config(
- sd_pipe.scheduler.config)
+ sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+ sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "padme amidala taking a bath artwork, safe for work, no nudity"
seed = 2734971755
@@ -341,7 +352,8 @@ def test_nudity_safe_stable_diffusion(self):
output_type="np",
width=512,
height=512,
- sld_guidance_scale=0, )
+ sld_guidance_scale=0,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -370,7 +382,8 @@ def test_nudity_safe_stable_diffusion(self):
sld_warmup_steps=7,
sld_threshold=0.025,
sld_momentum_scale=0.5,
- sld_mom_beta=0.7, )
+ sld_mom_beta=0.7,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
@@ -388,8 +401,7 @@ def test_nudity_safe_stable_diffusion(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_nudity_safetychecker_safe_stable_diffusion(self):
- sd_pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5")
+ sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipe.set_progress_bar_config(disable=None)
prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker"
seed = 1044355234
@@ -403,7 +415,8 @@ def test_nudity_safetychecker_safe_stable_diffusion(self):
output_type="np",
width=512,
height=512,
- sld_guidance_scale=0, )
+ sld_guidance_scale=0,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
@@ -422,12 +435,10 @@ def test_nudity_safetychecker_safe_stable_diffusion(self):
sld_warmup_steps=7,
sld_threshold=0.025,
sld_momentum_scale=0.5,
- sld_mom_beta=0.7, )
+ sld_mom_beta=0.7,
+ )
image = output.images
image_slice = image[0, -3:, -3:, -1]
- expected_slice = np.array([
- 0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334,
- 0.6561
- ])
+ expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561])
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
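
As a usage sketch of the safe-latent-diffusion parameters these tests exercise (a rough, untested example: the checkpoint id, seed, prompt, and the sld_warmup_steps/sld_threshold/sld_momentum_scale/sld_mom_beta values are taken from the tests above, while sld_guidance_scale=2000 and num_inference_steps=50 are assumed illustrative values, since they are not visible in the diff):

import paddle
from ppdiffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe

sd_pipe = StableDiffusionPipelineSafe.from_pretrained("runwayml/stable-diffusion-v1-5")
generator = paddle.Generator().manual_seed(2734971755)
output = sd_pipe(
    ["padme amidala taking a bath artwork, safe for work, no nudity"],
    generator=generator,
    guidance_scale=7.5,
    num_inference_steps=50,      # illustrative value, not taken from the test
    output_type="np",
    sld_guidance_scale=2000,     # illustrative value; 0 disables safe latent diffusion entirely
    sld_warmup_steps=7,
    sld_threshold=0.025,
    sld_momentum_scale=0.5,
    sld_mom_beta=0.7,
)
image = output.images[0]

Setting sld_guidance_scale=0, as the first call in each test does, reproduces the unguarded baseline that the guarded call is compared against.
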
diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py
index 79cfcb2145995..fb5982706c2c9 100644
--- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py
+++ b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py
@@ -16,14 +16,24 @@
import unittest
import paddle
-from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel,
- CLIPTextModelWithProjection, CLIPTokenizer)
+from paddlenlp.transformers import (
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+)
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- PriorTransformer, StableUnCLIPPipeline,
- UNet2DConditionModel)
-from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import \
- StableUnCLIPImageNormalizer
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ PriorTransformer,
+ StableUnCLIPPipeline,
+ UNet2DConditionModel,
+)
+from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import (
+ StableUnCLIPImageNormalizer,
+)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -39,8 +49,7 @@ def get_dummy_components(self):
embedder_hidden_size = 32
embedder_projection_dim = embedder_hidden_size
paddle.seed(0)
- prior_tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
paddle.seed(0)
prior_text_encoder = CLIPTextModelWithProjection(
CLIPTextConfig(
@@ -53,13 +62,16 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, ))
+ vocab_size=1000,
+ )
+ )
paddle.seed(0)
prior = PriorTransformer(
num_attention_heads=2,
attention_head_dim=12,
embedding_dim=embedder_projection_dim,
- num_layers=1, )
+ num_layers=1,
+ )
paddle.seed(0)
prior_scheduler = DDPMScheduler(
variance_type="fixed_small_log",
@@ -67,15 +79,13 @@ def get_dummy_components(self):
num_train_timesteps=1000,
clip_sample=True,
clip_sample_range=5.0,
- beta_schedule="squaredcos_cap_v2", )
+ beta_schedule="squaredcos_cap_v2",
+ )
paddle.seed(0)
- image_normalizer = StableUnCLIPImageNormalizer(
- embedding_dim=embedder_hidden_size)
- image_noising_scheduler = DDPMScheduler(
- beta_schedule="squaredcos_cap_v2")
+ image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
+ image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")
paddle.seed(0)
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
paddle.seed(0)
text_encoder = CLIPTextModel(
CLIPTextConfig(
@@ -88,7 +98,9 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, ))
+ vocab_size=1000,
+ )
+ )
paddle.seed(0)
unet = UNet2DConditionModel(
sample_size=32,
@@ -103,7 +115,8 @@ def get_dummy_components(self):
cross_attention_dim=embedder_hidden_size,
layers_per_block=1,
upcast_attention=True,
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
paddle.seed(0)
scheduler = DDIMScheduler(
beta_schedule="scaled_linear",
@@ -111,7 +124,8 @@ def get_dummy_components(self):
beta_end=0.012,
prediction_type="v_prediction",
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
paddle.seed(0)
vae = AutoencoderKL()
components = {
@@ -143,13 +157,11 @@ def get_dummy_inputs(self, seed=0):
def test_attention_slicing_forward_pass(self):
test_max_difference = False
- self._test_attention_slicing_forward_pass(
- test_max_difference=test_max_difference)
+ self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)
def test_inference_batch_single_identical(self):
test_max_difference = False
- self._test_inference_batch_single_identical(
- test_max_difference=test_max_difference)
+ self._test_inference_batch_single_identical(test_max_difference=test_max_difference)
# @slow
diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
index aa2328fb72a16..eb769ee92815b 100644
--- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
+++ b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -19,24 +19,36 @@
import numpy as np
import paddle
from paddlenlp.transformers import (
- CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer,
- CLIPVisionConfig, CLIPVisionModelWithProjection)
-
-from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler,
- StableUnCLIPImg2ImgPipeline, UNet2DConditionModel)
+ CLIPImageProcessor,
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTokenizer,
+ CLIPVisionConfig,
+ CLIPVisionModelWithProjection,
+)
+
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DDPMScheduler,
+ StableUnCLIPImg2ImgPipeline,
+ UNet2DConditionModel,
+)
from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline
-from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import \
- StableUnCLIPImageNormalizer
+from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import (
+ StableUnCLIPImageNormalizer,
+)
from ppdiffusers.utils.import_utils import is_ppxformers_available
from ppdiffusers.utils.testing_utils import floats_tensor
-from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS)
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
from ..test_pipelines_common import PipelineTesterMixin
-class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableUnCLIPImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
@@ -53,15 +65,14 @@ def get_dummy_components(self):
num_attention_heads=4,
image_size=32,
intermediate_size=37,
- patch_size=1, ))
+ patch_size=1,
+ )
+ )
paddle.seed(0)
- image_normalizer = StableUnCLIPImageNormalizer(
- embedding_dim=embedder_hidden_size)
- image_noising_scheduler = DDPMScheduler(
- beta_schedule="squaredcos_cap_v2")
+ image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
+ image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")
paddle.seed(0)
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
paddle.seed(0)
text_encoder = CLIPTextModel(
CLIPTextConfig(
@@ -74,7 +85,9 @@ def get_dummy_components(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, ))
+ vocab_size=1000,
+ )
+ )
paddle.seed(0)
unet = UNet2DConditionModel(
sample_size=32,
@@ -89,7 +102,8 @@ def get_dummy_components(self):
cross_attention_dim=embedder_hidden_size,
layers_per_block=1,
upcast_attention=True,
- use_linear_projection=True, )
+ use_linear_projection=True,
+ )
paddle.seed(0)
scheduler = DDIMScheduler(
beta_schedule="scaled_linear",
@@ -97,7 +111,8 @@ def get_dummy_components(self):
beta_end=0.012,
prediction_type="v_prediction",
set_alpha_to_one=False,
- steps_offset=1, )
+ steps_offset=1,
+ )
paddle.seed(0)
vae = AutoencoderKL()
components = {
@@ -124,17 +139,19 @@ def test_image_embeds_none(self):
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array([
- 0.40317363,
- 1.0,
- 0.5802471,
- 0.47334313,
- 0.39546987,
- 0.72409034,
- 0.15691131,
- 0.42981434,
- 0.72585064,
- ])
+ expected_slice = np.array(
+ [
+ 0.40317363,
+ 1.0,
+ 0.5802471,
+ 0.47334313,
+ 0.39546987,
+ 0.72409034,
+ 0.15691131,
+ 0.42981434,
+ 0.72585064,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
@@ -145,8 +162,7 @@ def get_dummy_inputs(self, seed=0, pil_image=True):
if pil_image:
input_image = input_image * 0.5 + 0.5
input_image = input_image.clip(min=0, max=1)
- input_image = (input_image.cpu().transpose(
- perm=[0, 2, 3, 1]).cast("float32").numpy())
+ input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy()
input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
return {
"prompt": "An anime racoon running a marathon",
@@ -158,21 +174,18 @@ def get_dummy_inputs(self, seed=0, pil_image=True):
def test_attention_slicing_forward_pass(self):
test_max_difference = False
- self._test_attention_slicing_forward_pass(
- test_max_difference=test_max_difference)
+ self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)
def test_inference_batch_single_identical(self):
test_max_difference = False
- self._test_inference_batch_single_identical(
- test_max_difference=test_max_difference)
+ self._test_inference_batch_single_identical(test_max_difference=test_max_difference)
@unittest.skipIf(
not is_ppxformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- test_max_difference=False)
+ self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False)
# @slow
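
For context, a rough usage sketch of the text-guided image-variation pipeline these fast tests cover (the tests only assemble tiny dummy components; the checkpoint id and the local image path below are assumptions, and only the prompt and call pattern are taken from the diff):

import paddle
from PIL import Image
from ppdiffusers import StableUnCLIPImg2ImgPipeline

# Assumed checkpoint id for the img2img unCLIP variant; not referenced in the tests above.
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-unclip")
init_image = Image.open("input.png").convert("RGB")  # hypothetical local file
generator = paddle.Generator().manual_seed(0)
images = pipe(
    prompt="An anime racoon running a marathon",
    image=init_image,
    generator=generator,
    output_type="np",
).images
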
diff --git a/ppdiffusers/tests/pipelines/test_pipelines.py b/ppdiffusers/tests/pipelines/test_pipelines.py
index ce6bcc0752a00..ef0b785f3ed4a 100644
--- a/ppdiffusers/tests/pipelines/test_pipelines.py
+++ b/ppdiffusers/tests/pipelines/test_pipelines.py
@@ -18,7 +18,6 @@
import os
import random
import shutil
-import sys
import tempfile
import unittest
import unittest.mock as mock
@@ -29,24 +28,50 @@
import requests_mock
import safetensors.torch
from paddlenlp.transformers import (
- CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer)
+ CLIPImageProcessor,
+ CLIPModel,
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTokenizer,
+)
from parameterized import parameterized
from PIL import Image
from requests.exceptions import HTTPError
from ppdiffusers import (
- AutoencoderKL, DDIMPipeline, DDIMScheduler, DDPMPipeline, DDPMScheduler,
- DiffusionPipeline, DPMSolverMultistepScheduler,
- EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
- LMSDiscreteScheduler, PNDMScheduler, StableDiffusionImg2ImgPipeline,
- StableDiffusionInpaintPipelineLegacy, StableDiffusionPipeline,
- UNet2DConditionModel, UNet2DModel, logging)
+ AutoencoderKL,
+ DDIMPipeline,
+ DDIMScheduler,
+ DDPMPipeline,
+ DDPMScheduler,
+ DiffusionPipeline,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipelineLegacy,
+ StableDiffusionPipeline,
+ UNet2DConditionModel,
+ UNet2DModel,
+ logging,
+)
from ppdiffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from ppdiffusers.utils import (CONFIG_NAME, TORCH_WEIGHTS_NAME, floats_tensor,
- nightly, slow)
-from ppdiffusers.utils.testing_utils import (CaptureLogger, get_tests_dir,
- require_compel, require_paddle_gpu,
- require_torch)
+from ppdiffusers.utils import (
+ CONFIG_NAME,
+ TORCH_WEIGHTS_NAME,
+ floats_tensor,
+ nightly,
+ slow,
+)
+from ppdiffusers.utils.testing_utils import (
+ CaptureLogger,
+ get_tests_dir,
+ require_compel,
+ require_paddle_gpu,
+ require_torch,
+)
class DownloadTests(unittest.TestCase):
@@ -57,12 +82,12 @@ def test_one_request_upon_cached(self):
"hf-internal-testing/tiny-stable-diffusion-pipe",
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
download_requests = [r.method for r in m.request_history]
assert download_requests.count("HEAD") == 15, "15 calls to files"
- assert (download_requests.count("GET") == 17
- ), "15 calls to files + model_info + model_index.json"
+ assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json"
assert (
len(download_requests) == 32
), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json"
@@ -73,11 +98,11 @@ def test_one_request_upon_cached(self):
safety_checker=None,
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
cache_requests = [r.method for r in m.request_history]
- assert cache_requests.count(
- "HEAD") == 1, "model_index.json is only HEAD"
+ assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD"
assert cache_requests.count("GET") == 1, "model info is only GET"
assert (
len(cache_requests) == 2
@@ -90,7 +115,8 @@ def test_less_downloads_passed_object(self):
safety_checker=None,
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
# make sure safety checker is not downloaded
assert "safety_checker" not in os.listdir(cached_folder)
@@ -112,14 +138,14 @@ def test_less_downloads_passed_object_calls(self):
safety_checker=None,
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
download_requests = [r.method for r in m.request_history]
# 15 - 2 because no call to config or model file for `safety_checker`
assert download_requests.count("HEAD") == 13, "13 calls to files"
# 17 - 2 because no call to config or model file for `safety_checker`
- assert (download_requests.count("GET") == 15
- ), "13 calls to files + model_info + model_index.json"
+ assert download_requests.count("GET") == 15, "13 calls to files + model_info + model_index.json"
assert (
len(download_requests) == 28
), "2 calls per file (13 files) + send_telemetry, model_info and model_index.json"
@@ -130,11 +156,11 @@ def test_less_downloads_passed_object_calls(self):
safety_checker=None,
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
cache_requests = [r.method for r in m.request_history]
- assert cache_requests.count(
- "HEAD") == 1, "model_index.json is only HEAD"
+ assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD"
assert cache_requests.count("GET") == 1, "model info is only GET"
assert (
len(cache_requests) == 2
@@ -147,15 +173,11 @@ def test_download_only_pytorch(self):
safety_checker=None,
cache_dir=tmpdirname,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
- all_root_files = [
- t[-1]
- for t in os.walk(
- os.path.join(tmpdirname,
- os.listdir(tmpdirname)[0], "snapshots"))
- ]
+ all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))]
files = [item for sublist in all_root_files for item in sublist]
assert not any(f.endswith(".msgpack") for f in files)
assert not any(f.endswith(".safetensors") for f in files)
@@ -163,25 +185,18 @@ def test_download_only_pytorch(self):
def test_returned_cached_folder(self):
prompt = "hello"
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+ )
_, local_path = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch",
safety_checker=None,
- return_cached_folder=True, )
+ return_cached_folder=True,
+ )
pipe_2 = StableDiffusionPipeline.from_pretrained(local_path)
generator = paddle.Generator().manual_seed(0)
- out = pipe(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
- out_2 = pipe_2(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
assert np.max(np.abs(out - out_2)) < 0.001
def test_force_safetensors_error(self):
@@ -194,7 +209,8 @@ def test_force_safetensors_error(self):
from_diffusers=True,
safety_checker=None,
cache_dir=tmpdirname,
- use_safetensors=True, )
+ use_safetensors=True,
+ )
def test_download_safetensors(self):
with tempfile.TemporaryDirectory() as tmpdirname:
@@ -204,7 +220,8 @@ def test_download_safetensors(self):
from_diffusers=True,
safety_checker=None,
cache_dir=tmpdirname,
- use_safetensors=True, )
+ use_safetensors=True,
+ )
all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
files = [item for sublist in all_root_files for item in sublist]
@@ -219,11 +236,10 @@ def test_download_safetensors_index(self):
use_safetensors=True,
variant=variant,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
- all_root_files = [
- t[-1] for t in os.walk(os.path.join(tmpdirname))
- ]
+ all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
files = [item for sublist in all_root_files for item in sublist]
# None of the downloaded files should be a safetensors file even if we have some here:
@@ -246,11 +262,10 @@ def test_download_bin_index(self):
use_safetensors=False,
variant=variant,
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
- all_root_files = [
- t[-1] for t in os.walk(os.path.join(tmpdirname))
- ]
+ all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))]
files = [item for sublist in all_root_files for item in sublist]
# None of the downloaded files should be a safetensors file even if we have some here:
@@ -267,66 +282,39 @@ def test_download_bin_index(self):
def test_download_no_safety_checker(self):
prompt = "hello"
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+ )
generator = paddle.Generator().manual_seed(0)
- out = pipe(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
- pipe_2 = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch")
+ out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
+ pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
generator = paddle.Generator().manual_seed(0)
- out_2 = pipe_2(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
assert np.max(np.abs(out - out_2)) < 0.001
def test_load_no_safety_checker_explicit_locally(self):
prompt = "hello"
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+ )
generator = paddle.Generator().manual_seed(0)
- out = pipe(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe_2 = StableDiffusionPipeline.from_pretrained(
- tmpdirname, safety_checker=None)
+ pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None)
generator = paddle.Generator().manual_seed(0)
- out_2 = pipe_2(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
assert np.max(np.abs(out - out_2)) < 0.001
def test_load_no_safety_checker_default_locally(self):
prompt = "hello"
- pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch")
+ pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
generator = paddle.Generator().manual_seed(0)
- out = pipe(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname)
generator = paddle.Generator().manual_seed(0)
- out_2 = pipe_2(
- prompt,
- num_inference_steps=2,
- generator=generator,
- output_type="numpy").images
+ out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
assert np.max(np.abs(out - out_2)) < 0.001
def test_cached_files_are_used_when_no_internet(self):
@@ -336,21 +324,16 @@ def test_cached_files_are_used_when_no_internet(self):
response_mock.raise_for_status.side_effect = HTTPError
response_mock.json.return_value = {}
orig_pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- safety_checker=None)
- orig_comps = {
- k: v
- for k, v in orig_pipe.components.items() if hasattr(v, "parameters")
- }
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+ )
+ orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")}
with mock.patch("requests.request", return_value=response_mock):
pipe = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch",
safety_checker=None,
- local_files_only=True, )
- comps = {
- k: v
- for k, v in pipe.components.items() if hasattr(v, "parameters")
- }
+ local_files_only=True,
+ )
+ comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")}
for m1, m2 in zip(orig_comps.values(), comps.values()):
for p1, p2 in zip(m1.parameters(), m2.parameters()):
if (p1 != p2).sum() > 0:
@@ -365,11 +348,11 @@ def test_download_from_variant_folder(self):
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdirname = StableDiffusionPipeline.download(
"hf-internal-testing/stable-diffusion-all-variants",
- cache_dir=tmpdirname, )
+ cache_dir=tmpdirname,
+ )
all_root_files = [t[-1] for t in os.walk(tmpdirname)]
files = [item for sublist in all_root_files for item in sublist]
- assert (len(files) == 15
- ), f"We should only download 15 files, not {len(files)}"
+ assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
assert not any(f.endswith(other_format) for f in files)
assert not any(len(f.split(".")) == 3 for f in files)
ppdiffusers.utils.import_utils._safetensors_available = True
@@ -386,22 +369,15 @@ def test_download_variant_all(self):
StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-all-variants",
cache_dir=tmpdirname,
- variant=variant, )
+ variant=variant,
+ )
all_root_files = [
- t[-1]
- for t in os.walk(
- os.path.join(tmpdirname,
- os.listdir(tmpdirname)[0], "snapshots"))
+ t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))
]
files = [item for sublist in all_root_files for item in sublist]
- assert (len(files) == 15
- ), f"We should only download 15 files, not {len(files)}"
- assert (len([
- f for f in files if f.endswith(f"{variant}{this_format}")
- ]) == 4)
- assert not any(
- f.endswith(this_format) and
- not f.endswith(f"{variant}{this_format}") for f in files)
+ assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+ assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4
+ assert not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files)
assert not any(f.endswith(other_format) for f in files)
ppdiffusers.utils.import_utils._safetensors_available = True
@@ -417,21 +393,16 @@ def test_download_variant_partly(self):
tmpdirname = StableDiffusionPipeline.download(
"hf-internal-testing/stable-diffusion-all-variants",
cache_dir=tmpdirname,
- variant=variant, )
+ variant=variant,
+ )
all_root_files = [t[-1] for t in os.walk(tmpdirname)]
files = [item for sublist in all_root_files for item in sublist]
unet_files = os.listdir(os.path.join(tmpdirname, "unet"))
- assert (len(files) == 15
- ), f"We should only download 15 files, not {len(files)}"
+ assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files
- assert (len([
- f for f in files if f.endswith(f"{variant}{this_format}")
- ]) == 1)
- assert (sum(
- f.endswith(this_format) and
- not f.endswith(f"{variant}{this_format}")
- for f in files) == 3)
+ assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1
+ assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3
assert not any(f.endswith(other_format) for f in files)
ppdiffusers.utils.import_utils._safetensors_available = True
@@ -467,59 +438,52 @@ def test_local_save_load_index(self):
@require_torch
def test_text_inversion_download(self):
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+ )
import torch
num_tokens = len(pipe.tokenizer)
# single token load local
with tempfile.TemporaryDirectory() as tmpdirname:
- ten = {"<*>": torch.ones((32, ))}
+ ten = {"<*>": torch.ones((32,))}
torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin"))
pipe.load_textual_inversion(tmpdirname, from_diffusers=True)
token = pipe.tokenizer.convert_tokens_to_ids("<*>")
assert token == num_tokens, "Added token must be at spot `num_tokens`"
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-1].sum().item()
- == 32)
+ assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32
assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>"
prompt = "hey <*>"
- out = pipe(
- prompt, num_inference_steps=1, output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
assert out.shape == (1, 128, 128, 3)
# single token load local with weight name
ten = {"<**>": 2 * torch.ones((1, 32))}
torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin"))
- pipe.load_textual_inversion(
- tmpdirname,
- weight_name="learned_embeds.bin",
- from_diffusers=True)
+ pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin", from_diffusers=True)
token = pipe.tokenizer.convert_tokens_to_ids("<**>")
assert token == num_tokens + 1, "Added token must be at spot `num_tokens`"
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-1].sum().item()
- == 64)
+ assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64
assert pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>"
prompt = "hey <**>"
- out = pipe(
- prompt, num_inference_steps=1, output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
assert out.shape == (1, 128, 128, 3)
# multi token load
ten = {
- "<***>": torch.cat([
- 3 * torch.ones((1, 32)),
- 4 * torch.ones((1, 32)),
- 5 * torch.ones((1, 32)),
- ])
+ "<***>": torch.cat(
+ [
+ 3 * torch.ones((1, 32)),
+ 4 * torch.ones((1, 32)),
+ 5 * torch.ones((1, 32)),
+ ]
+ )
}
torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin"))
@@ -532,38 +496,31 @@ def test_text_inversion_download(self):
assert token == num_tokens + 2, "Added token must be at spot `num_tokens`"
assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`"
assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`"
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-3].sum().item()
- == 96)
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-2].sum().item()
- == 128)
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-1].sum().item()
- == 160)
- assert (pipe._maybe_convert_prompt("<***>", pipe.tokenizer) ==
- "<***> <***>_1 <***>_2")
+ assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96
+ assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128
+ assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160
+ assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***> <***>_1 <***>_2"
prompt = "hey <***>"
- out = pipe(
- prompt, num_inference_steps=1, output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
assert out.shape == (1, 128, 128, 3)
# multi token load a1111
ten = {
"string_to_param": {
- "*": torch.cat([
- 3 * torch.ones((1, 32)),
- 4 * torch.ones((1, 32)),
- 5 * torch.ones((1, 32)),
- ])
+ "*": torch.cat(
+ [
+ 3 * torch.ones((1, 32)),
+ 4 * torch.ones((1, 32)),
+ 5 * torch.ones((1, 32)),
+ ]
+ )
},
"name": "<****>",
}
torch.save(ten, os.path.join(tmpdirname, "a1111.bin"))
- pipe.load_textual_inversion(
- tmpdirname, weight_name="a1111.bin", from_diffusers=True)
+ pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin", from_diffusers=True)
token = pipe.tokenizer.convert_tokens_to_ids("<****>")
token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1")
@@ -572,21 +529,13 @@ def test_text_inversion_download(self):
assert token == num_tokens + 5, "Added token must be at spot `num_tokens`"
assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`"
assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`"
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-3].sum().item()
- == 96)
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-2].sum().item()
- == 128)
- assert (
- pipe.text_encoder.get_input_embeddings().weight[-1].sum().item()
- == 160)
- assert (pipe._maybe_convert_prompt("<****>", pipe.tokenizer) ==
- "<****> <****>_1 <****>_2")
+ assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96
+ assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128
+ assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160
+ assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****> <****>_1 <****>_2"
prompt = "hey <****>"
- out = pipe(
- prompt, num_inference_steps=1, output_type="numpy").images
+ out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
assert out.shape == (1, 128, 128, 3)
def test_download_ignore_files(self):
@@ -595,20 +544,16 @@ def test_download_ignore_files(self):
# pipeline has Flax weights
tmpdirname = DiffusionPipeline.download(
"hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files",
- cache_dir=tmpdirname, )
+ cache_dir=tmpdirname,
+ )
files = []
for root, ds, fs in os.walk(tmpdirname):
for f in fs:
- str_path = str(os.path.join(root, f)).replace(
- str(tmpdirname) + "/", "")
+ str_path = str(os.path.join(root, f)).replace(str(tmpdirname) + "/", "")
files.append(str_path)
# None of the downloaded files should be a pytorch file even if we have some here:
# https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack
- assert not any(f in files
- for f in [
- "vae/diffusion_pytorch_model.bin",
- "text_encoder/config.json"
- ])
+ assert not any(f in files for f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"])
assert len(files) == 13
@@ -616,7 +561,8 @@ class CustomPipelineTests(unittest.TestCase):
def test_load_custom_pipeline(self):
pipeline = DiffusionPipeline.from_pretrained(
"google/ddpm-cifar10-32",
- custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", )
+ custom_pipeline="junnyu/ppdiffusers-dummy-pipeline",
+ )
pipeline = pipeline
assert pipeline.__class__.__name__ == "CustomPipeline"
@@ -644,7 +590,8 @@ def test_load_custom_pipeline(self):
def test_run_custom_pipeline(self):
pipeline = DiffusionPipeline.from_pretrained(
"google/ddpm-cifar10-32",
- custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", )
+ custom_pipeline="junnyu/ppdiffusers-dummy-pipeline",
+ )
pipeline = pipeline
images, output_str = pipeline(num_inference_steps=2, output_type="np")
assert images[0].shape == (1, 32, 32, 3)
@@ -653,8 +600,8 @@ def test_run_custom_pipeline(self):
def test_local_custom_pipeline_repo(self):
local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline")
pipeline = DiffusionPipeline.from_pretrained(
- "google/ddpm-cifar10-32",
- custom_pipeline=local_custom_pipeline_path)
+ "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
+ )
pipeline = pipeline
images, output_str = pipeline(num_inference_steps=2, output_type="np")
assert pipeline.__class__.__name__ == "CustomLocalPipeline"
@@ -663,11 +610,10 @@ def test_local_custom_pipeline_repo(self):
def test_local_custom_pipeline_file(self):
local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline")
- local_custom_pipeline_path = os.path.join(local_custom_pipeline_path,
- "what_ever.py")
+ local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py")
pipeline = DiffusionPipeline.from_pretrained(
- "google/ddpm-cifar10-32",
- custom_pipeline=local_custom_pipeline_path)
+ "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
+ )
pipeline = pipeline
images, output_str = pipeline(num_inference_steps=2, output_type="np")
assert pipeline.__class__.__name__ == "CustomLocalPipeline"
@@ -678,13 +624,13 @@ def test_local_custom_pipeline_file(self):
@require_paddle_gpu
def test_download_from_git(self):
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
- feature_extractor = CLIPImageProcessor.from_pretrained(
- clip_model_id, from_hf_hub=False)
+ feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id, from_hf_hub=False)
clip_model = CLIPModel.from_pretrained(
clip_model_id,
paddle_dtype=paddle.float16,
from_hf_hub=False,
- from_diffusers=False, )
+ from_diffusers=False,
+ )
pipeline = DiffusionPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
custom_pipeline="clip_guided_stable_diffusion",
@@ -692,17 +638,17 @@ def test_download_from_git(self):
feature_extractor=feature_extractor,
paddle_dtype=paddle.float16,
from_hf_hub=False,
- from_diffusers=False, )
+ from_diffusers=False,
+ )
pipeline.enable_attention_slicing()
assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion"
- image = pipeline(
- "a prompt", num_inference_steps=2, output_type="np").images[0]
+ image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0]
assert image.shape == (512, 512, 3)
def test_save_pipeline_change_config(self):
pipe = DiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- safety_checker=None)
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+ )
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
@@ -710,8 +656,7 @@ def test_save_pipeline_change_config(self):
assert pipe.scheduler.__class__.__name__ == "PNDMScheduler"
- pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- pipe.scheduler.config)
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.save_pretrained(tmpdirname)
pipe = DiffusionPipeline.from_pretrained(tmpdirname)
@@ -732,8 +677,7 @@ def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = 32, 32
- image = floats_tensor(
- (batch_size, num_channels) + sizes, rng=random.Random(0))
+ image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0))
return image
def dummy_uncond_unet(self, sample_size=32):
@@ -745,7 +689,8 @@ def dummy_uncond_unet(self, sample_size=32):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
return model
def dummy_cond_unet(self, sample_size=32):
@@ -758,7 +703,8 @@ def dummy_cond_unet(self, sample_size=32):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32, )
+ cross_attention_dim=32,
+ )
return model
@property
@@ -770,7 +716,8 @@ def dummy_vae(self):
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4, )
+ latent_channels=4,
+ )
return model
@property
@@ -785,7 +732,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModel(config).eval()
@property
@@ -803,24 +751,21 @@ def to(self, device):
return extract
- @parameterized.expand([
- [DDIMScheduler, DDIMPipeline, 32],
- [DDPMScheduler, DDPMPipeline, 32],
- [DDIMScheduler, DDIMPipeline, (32, 64)],
- [DDPMScheduler, DDPMPipeline, (64, 32)],
- ])
- def test_uncond_unet_components(self,
- scheduler_fn=DDPMScheduler,
- pipeline_fn=DDPMPipeline,
- sample_size=32):
+ @parameterized.expand(
+ [
+ [DDIMScheduler, DDIMPipeline, 32],
+ [DDPMScheduler, DDPMPipeline, 32],
+ [DDIMScheduler, DDIMPipeline, (32, 64)],
+ [DDPMScheduler, DDPMPipeline, (64, 32)],
+ ]
+ )
+ def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32):
unet = self.dummy_uncond_unet(sample_size)
scheduler = scheduler_fn()
pipeline = pipeline_fn(unet, scheduler)
generator = paddle.Generator().manual_seed(0)
- out_image = pipeline(
- generator=generator, num_inference_steps=2, output_type="np").images
- sample_size = ((sample_size, sample_size)
- if isinstance(sample_size, int) else sample_size)
+ out_image = pipeline(generator=generator, num_inference_steps=2, output_type="np").images
+ sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size
assert out_image.shape == (1, *sample_size, 3)
def test_stable_diffusion_components(self):
@@ -829,13 +774,10 @@ def test_stable_diffusion_components(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image = self.dummy_image().cpu().transpose(perm=[0, 2, 3, 1])[0]
init_image = Image.fromarray(np.uint8(image)).convert("RGB")
- mask_image = (
- Image.fromarray(np.uint8(image + 4)).convert("RGB").resize(
- (32, 32)))
+ mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
inpaint = StableDiffusionInpaintPipelineLegacy(
unet=unet,
scheduler=scheduler,
@@ -843,7 +785,8 @@ def test_stable_diffusion_components(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
img2img = StableDiffusionImg2ImgPipeline(**inpaint.components)
text2img = StableDiffusionPipeline(**inpaint.components)
prompt = "A painting of a squirrel eating a burger"
@@ -854,18 +797,16 @@ def test_stable_diffusion_components(self):
num_inference_steps=2,
output_type="np",
image=init_image,
- mask_image=mask_image, ).images
+ mask_image=mask_image,
+ ).images
image_img2img = img2img(
[prompt],
generator=generator,
num_inference_steps=2,
output_type="np",
- image=init_image, ).images
- image_text2img = text2img(
- [prompt],
- generator=generator,
- num_inference_steps=2,
- output_type="np").images
+ image=init_image,
+ ).images
+ image_text2img = text2img([prompt], generator=generator, num_inference_steps=2, output_type="np").images
assert image_inpaint.shape == (1, 32, 32, 3)
assert image_img2img.shape == (1, 32, 32, 3)
assert image_text2img.shape == (1, 64, 64, 3)
@@ -875,8 +816,7 @@ def test_set_scheduler(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
@@ -884,7 +824,8 @@ def test_set_scheduler(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
assert isinstance(sd.scheduler, DDIMScheduler)
sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config)
@@ -895,11 +836,9 @@ def test_set_scheduler(self):
assert isinstance(sd.scheduler, LMSDiscreteScheduler)
sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config)
assert isinstance(sd.scheduler, EulerDiscreteScheduler)
- sd.scheduler = EulerAncestralDiscreteScheduler.from_config(
- sd.scheduler.config)
+ sd.scheduler = EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config)
assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler)
- sd.scheduler = DPMSolverMultistepScheduler.from_config(
- sd.scheduler.config)
+ sd.scheduler = DPMSolverMultistepScheduler.from_config(sd.scheduler.config)
assert isinstance(sd.scheduler, DPMSolverMultistepScheduler)
def test_set_component_to_none(self):
@@ -907,8 +846,7 @@ def test_set_component_to_none(self):
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
pipeline = StableDiffusionPipeline(
unet=unet,
@@ -917,7 +855,8 @@ def test_set_component_to_none(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
generator = paddle.Generator().manual_seed(0)
@@ -927,7 +866,8 @@ def test_set_component_to_none(self):
prompt=prompt,
generator=generator,
num_inference_steps=1,
- output_type="np", ).images
+ output_type="np",
+ ).images
pipeline.feature_extractor = None
generator = paddle.Generator().manual_seed(0)
@@ -935,23 +875,19 @@ def test_set_component_to_none(self):
prompt=prompt,
generator=generator,
num_inference_steps=1,
- output_type="np", ).images
+ output_type="np",
+ ).images
assert out_image.shape == (1, 64, 64, 3)
assert np.abs(out_image - out_image_2).max() < 1e-3
def test_set_scheduler_consistency(self):
unet = self.dummy_cond_unet()
- pndm = PNDMScheduler.from_config(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
- ddim = DDIMScheduler.from_config(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+ ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd = StableDiffusionPipeline(
unet=unet,
scheduler=pndm,
@@ -959,15 +895,13 @@ def test_set_scheduler_consistency(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
pndm_config = sd.scheduler.config
sd.scheduler = DDPMScheduler.from_config(pndm_config)
sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
pndm_config_2 = sd.scheduler.config
- pndm_config_2 = {
- k: v
- for k, v in pndm_config_2.items() if k in pndm_config
- }
+ pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config}
assert dict(pndm_config) == dict(pndm_config_2)
sd = StableDiffusionPipeline(
unet=unet,
@@ -976,40 +910,33 @@ def test_set_scheduler_consistency(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
ddim_config = sd.scheduler.config
sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config)
sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
ddim_config_2 = sd.scheduler.config
- ddim_config_2 = {
- k: v
- for k, v in ddim_config_2.items() if k in ddim_config
- }
+ ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config}
assert dict(ddim_config) == dict(ddim_config_2)
def test_save_safe_serialization(self):
pipeline = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch",
from_hf_hub=True,
- from_diffusers=True, )
+ from_diffusers=True,
+ )
with tempfile.TemporaryDirectory() as tmpdirname:
- pipeline.save_pretrained(
- tmpdirname, safe_serialization=True, to_diffusers=True)
- vae_path = os.path.join(tmpdirname, "vae",
- "diffusion_pytorch_model.safetensors")
+ pipeline.save_pretrained(tmpdirname, safe_serialization=True, to_diffusers=True)
+ vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors")
assert os.path.exists(vae_path), f"Could not find {vae_path}"
_ = safetensors.torch.load_file(vae_path)
- unet_path = os.path.join(tmpdirname, "unet",
- "diffusion_pytorch_model.safetensors")
+ unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors")
assert os.path.exists(unet_path), f"Could not find {unet_path}"
_ = safetensors.torch.load_file(unet_path)
- text_encoder_path = os.path.join(tmpdirname, "text_encoder",
- "model.safetensors")
- assert os.path.exists(
- text_encoder_path), f"Could not find {text_encoder_path}"
+ text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors")
+ assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}"
_ = safetensors.torch.load_file(text_encoder_path)
- pipeline = StableDiffusionPipeline.from_pretrained(
- tmpdirname, from_diffusers=True)
+ pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=True)
assert pipeline.unet is not None
assert pipeline.vae is not None
assert pipeline.text_encoder is not None
@@ -1020,17 +947,17 @@ def test_no_pytorch_download_when_doing_safetensors(self):
with tempfile.TemporaryDirectory() as tmpdirname:
_ = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/diffusers-stable-diffusion-tiny-all",
- cache_dir=tmpdirname, )
+ cache_dir=tmpdirname,
+ )
path = os.path.join(
tmpdirname,
"models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
"snapshots",
"07838d72e12f9bcec1375b0482b80c1d399be843",
- "unet", )
- assert os.path.exists(
- os.path.join(path, "diffusion_pytorch_model.safetensors"))
- assert not os.path.exists(
- os.path.join(path, "diffusion_pytorch_model.bin"))
+ "unet",
+ )
+ assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
+ assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
def test_no_safetensors_download_when_doing_pytorch(self):
import ppdiffusers
@@ -1039,28 +966,25 @@ def test_no_safetensors_download_when_doing_pytorch(self):
with tempfile.TemporaryDirectory() as tmpdirname:
_ = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/diffusers-stable-diffusion-tiny-all",
- cache_dir=tmpdirname, )
+ cache_dir=tmpdirname,
+ )
path = os.path.join(
tmpdirname,
"models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
"snapshots",
"07838d72e12f9bcec1375b0482b80c1d399be843",
- "unet", )
- assert not os.path.exists(
- os.path.join(path, "diffusion_pytorch_model.safetensors"))
- assert os.path.exists(
- os.path.join(path, "diffusion_pytorch_model.bin"))
+ "unet",
+ )
+ assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
+ assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
ppdiffusers.utils.import_utils._safetensors_available = True
def test_optional_components(self):
unet = self.dummy_cond_unet()
- pndm = PNDMScheduler.from_config(
- "hf-internal-testing/tiny-stable-diffusion-torch",
- subfolder="scheduler")
+ pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
vae = self.dummy_vae
bert = self.dummy_text_encoder
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
sd = StableDiffusionPipeline(
unet=unet,
scheduler=pndm,
@@ -1068,7 +992,8 @@ def test_optional_components(self):
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=unet,
- feature_extractor=self.dummy_extractor, )
+ feature_extractor=self.dummy_extractor,
+ )
assert sd.config.requires_safety_checker is True
with tempfile.TemporaryDirectory() as tmpdirname:
sd.save_pretrained(tmpdirname)
@@ -1076,7 +1001,8 @@ def test_optional_components(self):
tmpdirname,
feature_extractor=None,
safety_checker=None,
- requires_safety_checker=False, )
+ requires_safety_checker=False,
+ )
assert sd.config.requires_safety_checker is False
assert sd.config.safety_checker == (None, None)
assert sd.config.feature_extractor == (None, None)
@@ -1092,8 +1018,7 @@ def test_optional_components(self):
config["safety_checker"] = [None, None]
with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
json.dump(config, f)
- sd = StableDiffusionPipeline.from_pretrained(
- tmpdirname, requires_safety_checker=False)
+ sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False)
sd.save_pretrained(tmpdirname)
sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
assert sd.config.requires_safety_checker is False
@@ -1110,8 +1035,7 @@ def test_optional_components(self):
assert sd.config.safety_checker == (None, None)
assert sd.config.feature_extractor == (None, None)
sd.save_pretrained(tmpdirname)
- sd = StableDiffusionPipeline.from_pretrained(
- tmpdirname, feature_extractor=self.dummy_extractor)
+ sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
assert sd.config.requires_safety_checker is False
assert sd.config.safety_checker == (None, None)
assert sd.config.feature_extractor != (None, None)
@@ -1119,13 +1043,13 @@ def test_optional_components(self):
tmpdirname,
feature_extractor=self.dummy_extractor,
safety_checker=unet,
- requires_safety_checker=[True, True], )
+ requires_safety_checker=[True, True],
+ )
assert sd.config.requires_safety_checker == [True, True]
assert sd.config.safety_checker != (None, None)
assert sd.config.feature_extractor != (None, None)
sd.save_pretrained(tmpdirname)
- sd = StableDiffusionPipeline.from_pretrained(
- tmpdirname, feature_extractor=self.dummy_extractor)
+ sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
assert sd.config.requires_safety_checker == [True, True]
assert sd.config.safety_checker != (None, None)
assert sd.config.feature_extractor != (None, None)
@@ -1146,42 +1070,28 @@ def tearDown(self):
def test_smart_download(self):
model_id = "hf-internal-testing/unet-pipeline-dummy"
with tempfile.TemporaryDirectory() as tmpdirname:
- _ = DiffusionPipeline.from_pretrained(
- model_id, cache_dir=tmpdirname, force_download=True)
+ _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True)
local_repo_name = "--".join(["models"] + model_id.split("/"))
- snapshot_dir = os.path.join(tmpdirname, local_repo_name,
- "snapshots")
- snapshot_dir = os.path.join(snapshot_dir,
- os.listdir(snapshot_dir)[0])
- assert os.path.isfile(
- os.path.join(snapshot_dir, DiffusionPipeline.config_name))
+ snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots")
+ snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0])
+ assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name))
assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME))
- assert os.path.isfile(
- os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME))
- assert os.path.isfile(
- os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME))
- assert os.path.isfile(
- os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME))
- assert os.path.isfile(
- os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME))
- assert os.path.isfile(
- os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME))
- assert not os.path.isfile(
- os.path.join(snapshot_dir, "big_array.npy"))
+ assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME))
+ assert os.path.isfile(os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME))
+ assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME))
+ assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME))
+ assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME))
+ assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy"))
def test_warning_unused_kwargs(self):
model_id = "hf-internal-testing/unet-pipeline-dummy"
logger = logging.get_logger("ppdiffusers.pipelines")
with tempfile.TemporaryDirectory() as tmpdirname:
with CaptureLogger(logger) as cap_logger:
- DiffusionPipeline.from_pretrained(
- model_id,
- not_used=True,
- cache_dir=tmpdirname,
- force_download=True)
+ DiffusionPipeline.from_pretrained(model_id, not_used=True, cache_dir=tmpdirname, force_download=True)
assert (
- cap_logger.out.strip().split("\n")[-1] ==
- "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored."
+ cap_logger.out.strip().split("\n")[-1]
+ == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored."
)
def test_from_save_pretrained(self):
@@ -1192,7 +1102,8 @@ def test_from_save_pretrained(self):
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"), )
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
scheduler = DDPMScheduler(num_train_timesteps=10)
ddpm = DDPMPipeline(model, scheduler)
ddpm.set_progress_bar_config(disable=None)
@@ -1202,59 +1113,41 @@ def test_from_save_pretrained(self):
new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
generator = paddle.Generator().manual_seed(0)
- image = ddpm(
- generator=generator, num_inference_steps=5,
- output_type="numpy").images
+ image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
- new_image = new_ddpm(
- generator=generator, num_inference_steps=5,
- output_type="numpy").images
+ new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
- assert (np.abs(image - new_image).sum() < 1e-5
- ), "Models don't give the same forward pass"
+ assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
def test_from_pretrained_hub(self):
model_path = "google/ddpm-cifar10-32"
scheduler = DDPMScheduler(num_train_timesteps=10)
ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler)
ddpm.set_progress_bar_config(disable=None)
- ddpm_from_hub = DiffusionPipeline.from_pretrained(
- model_path, scheduler=scheduler)
+ ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
ddpm_from_hub = ddpm_from_hub
ddpm_from_hub.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = ddpm(
- generator=generator, num_inference_steps=5,
- output_type="numpy").images
+ image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
- new_image = ddpm_from_hub(
- generator=generator, num_inference_steps=5,
- output_type="numpy").images
- assert (np.abs(image - new_image).sum() < 1e-05
- ), "Models don't give the same forward pass"
+ new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images
+ assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass"
def test_from_pretrained_hub_pass_model(self):
model_path = "google/ddpm-cifar10-32"
scheduler = DDPMScheduler(num_train_timesteps=10)
unet = UNet2DModel.from_pretrained(model_path)
- ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(
- model_path, unet=unet, scheduler=scheduler)
+ ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler)
ddpm_from_hub_custom_model = ddpm_from_hub_custom_model
ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
- ddpm_from_hub = DiffusionPipeline.from_pretrained(
- model_path, scheduler=scheduler)
+ ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
- image = ddpm_from_hub_custom_model(
- generator=generator, num_inference_steps=5,
- output_type="numpy").images
+ image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="numpy").images
generator = paddle.Generator().manual_seed(0)
- new_image = ddpm_from_hub(
- generator=generator, num_inference_steps=5,
- output_type="numpy").images
- assert (np.abs(image - new_image).sum() < 1e-05
- ), "Models don't give the same forward pass"
+ new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images
+ assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass"
def test_output_format(self):
model_path = "google/ddpm-cifar10-32"
@@ -1292,8 +1185,7 @@ def test_ddpm_ddim_equality_batched(self):
ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
ddim.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(seed)
- ddpm_images = ddpm(
- batch_size=2, generator=generator, output_type="numpy").images
+ ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images
generator = paddle.Generator().manual_seed(seed)
ddim_images = ddim(
batch_size=2,
@@ -1301,5 +1193,6 @@ def test_ddpm_ddim_equality_batched(self):
num_inference_steps=1000,
eta=1.0,
output_type="numpy",
- use_clipped_model_output=True, ).images
+ use_clipped_model_output=True,
+ ).images
assert np.abs(ddpm_images - ddim_images).max() < 0.1
diff --git a/ppdiffusers/tests/pipelines/test_pipelines_common.py b/ppdiffusers/tests/pipelines/test_pipelines_common.py
index 5b09ecc71d187..c92b77174f7dc 100644
--- a/ppdiffusers/tests/pipelines/test_pipelines_common.py
+++ b/ppdiffusers/tests/pipelines/test_pipelines_common.py
@@ -48,16 +48,18 @@ class PipelineTesterMixin:
# Canonical parameters that are passed to `__call__` regardless
# of the type of pipeline. They are always optional and have common
# sense default values.
- required_optional_params = frozenset([
- "num_inference_steps",
- "num_images_per_prompt",
- "generator",
- "latents",
- "output_type",
- "return_dict",
- "callback",
- "callback_steps",
- ])
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "num_images_per_prompt",
+ "generator",
+ "latents",
+ "output_type",
+ "return_dict",
+ "callback",
+ "callback_steps",
+ ]
+ )
num_inference_steps_args = ["num_inference_steps"]
test_attention_slicing = True
test_cpu_offload = False
@@ -95,7 +97,8 @@ def params(self) -> frozenset:
"do not make modifications to the existing common sets of arguments. I.e. a text to image pipeline "
"with non-configurable height and width arguments should set the attribute as "
"`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. "
- "See existing pipeline tests for reference.")
+ "See existing pipeline tests for reference."
+ )
@property
def batch_params(self) -> frozenset:
@@ -108,7 +111,8 @@ def batch_params(self) -> frozenset:
"do not make modifications to the existing common sets of batch arguments. I.e. a text to "
"image pipeline `negative_prompt` is not batched should set the attribute as "
"`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. "
- "See existing pipeline tests for reference.")
+ "See existing pipeline tests for reference."
+ )
def tearDown(self):
super().tearDown()
@@ -123,8 +127,7 @@ def test_save_load_local(self):
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir, to_diffusers=False)
- pipe_loaded = self.pipeline_class.from_pretrained(
- tmpdir, from_diffusers=False)
+ pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False)
pipe_loaded.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
output_loaded = pipe_loaded(**inputs)[0]
@@ -134,7 +137,8 @@ def test_save_load_local(self):
def test_pipeline_call_signature(self):
self.assertTrue(
hasattr(self.pipeline_class, "__call__"),
- f"{self.pipeline_class} should have a `__call__` method", )
+ f"{self.pipeline_class} should have a `__call__` method",
+ )
parameters = inspect.signature(self.pipeline_class.__call__).parameters
@@ -146,9 +150,7 @@ def test_pipeline_call_signature(self):
parameters = set(parameters.keys())
parameters.remove("self")
- parameters.discard(
- "kwargs"
- ) # kwargs can be added if arguments of pipeline call function are deprecated
+ parameters.discard("kwargs") # kwargs can be added if arguments of pipeline call function are deprecated
remaining_required_parameters = set()
@@ -176,9 +178,10 @@ def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]):
self._test_inference_batch_consistent(batch_sizes=batch_sizes)
def _test_inference_batch_consistent(
- self,
- batch_sizes=[2, 4, 13],
- additional_params_copy_to_batched_inputs=["num_inference_steps"], ):
+ self,
+ batch_sizes=[2, 4, 13],
+ additional_params_copy_to_batched_inputs=["num_inference_steps"],
+ ):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
@@ -191,10 +194,7 @@ def _test_inference_batch_consistent(
if name in self.batch_params:
if name == "prompt":
len_prompt = len(value)
- batched_inputs[name] = [
- value[:len_prompt // i]
- for i in range(1, batch_size + 1)
- ]
+ batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
batched_inputs[name][-1] = 2000 * "very long"
else:
batched_inputs[name] = batch_size * [value]
@@ -220,13 +220,14 @@ def test_inference_batch_single_identical(self, batch_size=3):
self._test_inference_batch_single_identical(batch_size=batch_size)
def _test_inference_batch_single_identical(
- self,
- batch_size=3,
- test_max_difference=None,
- test_mean_pixel_difference=None,
- relax_max_difference=False,
- expected_max_diff=1e-4,
- additional_params_copy_to_batched_inputs=["num_inference_steps"], ):
+ self,
+ batch_size=3,
+ test_max_difference=None,
+ test_mean_pixel_difference=None,
+ relax_max_difference=False,
+ expected_max_diff=1e-4,
+ additional_params_copy_to_batched_inputs=["num_inference_steps"],
+ ):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
@@ -240,19 +241,14 @@ def _test_inference_batch_single_identical(
if name in self.batch_params:
if name == "prompt":
len_prompt = len(value)
- batched_inputs[name] = [
- value[:len_prompt // i]
- for i in range(1, batch_size + 1)
- ]
+ batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
batched_inputs[name][-1] = 2000 * "very long"
else:
batched_inputs[name] = batch_size * [value]
elif name == "batch_size":
batched_inputs[name] = batch_size
elif name == "generator":
- batched_inputs[name] = [
- self.get_generator(i) for i in range(batch_size)
- ]
+ batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)]
else:
batched_inputs[name] = value
@@ -293,8 +289,7 @@ def test_components_function(self):
init_components = self.get_dummy_components()
pipe = self.pipeline_class(**init_components)
self.assertTrue(hasattr(pipe, "components"))
- self.assertTrue(
- set(pipe.components.keys()) == set(init_components.keys()))
+ self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
def test_float16_inference(self, expected_max_diff=1e-2):
self._test_float16_inference(expected_max_diff)
@@ -312,7 +307,8 @@ def _test_float16_inference(self, expected_max_diff=1e-2):
self.assertLess(
max_diff,
expected_max_diff,
- "The outputs of the fp16 and fp32 pipelines are too different.", )
+ "The outputs of the fp16 and fp32 pipelines are too different.",
+ )
def test_save_load_float16(self, expected_max_diff=1e-2):
self._test_save_load_float16(expected_max_diff)
@@ -360,8 +356,7 @@ def test_save_load_optional_components(self):
with tempfile.TemporaryDirectory() as tmpdir:
# TODO check this
pipe.save_pretrained(tmpdir, to_diffusers=False)
- pipe_loaded = self.pipeline_class.from_pretrained(
- tmpdir, from_diffusers=False)
+ pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False)
pipe_loaded.set_progress_bar_config(disable=None)
for optional_component in pipe._optional_components:
self.assertTrue(
@@ -394,27 +389,22 @@ def test_to_dtype(self):
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
- model_dtypes = [
- component.dtype for component in components.values()
- if hasattr(component, "dtype")
- ]
+ model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == paddle.float32 for dtype in model_dtypes))
pipe.to(paddle_dtype=paddle.float16)
- model_dtypes = [
- component.dtype for component in components.values()
- if hasattr(component, "dtype")
- ]
+ model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == paddle.float16 for dtype in model_dtypes))
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass()
def _test_attention_slicing_forward_pass(
- self,
- test_max_difference=True,
- test_mean_pixel_difference=True,
- expected_max_diff=5e-3, ):
+ self,
+ test_max_difference=True,
+ test_mean_pixel_difference=True,
+ expected_max_diff=5e-3,
+ ):
if not self.test_attention_slicing:
return
@@ -427,25 +417,24 @@ def _test_attention_slicing_forward_pass(
inputs = self.get_dummy_inputs()
output_with_slicing = pipe(**inputs)[0]
if test_max_difference:
- max_diff = np.abs(
- to_np(output_with_slicing) - to_np(output_without_slicing)).max(
- )
+ max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max()
self.assertLess(
max_diff,
expected_max_diff,
- "Attention slicing should not affect the inference results", )
+ "Attention slicing should not affect the inference results",
+ )
if test_mean_pixel_difference:
- assert_mean_pixel_difference(output_with_slicing[0],
- output_without_slicing[0])
+ assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0])
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass()
def _test_xformers_attention_forwardGenerator_pass(
- self,
- test_max_difference=True,
- test_mean_pixel_difference=True,
- expected_max_diff=1e-2, ):
+ self,
+ test_max_difference=True,
+ test_mean_pixel_difference=True,
+ expected_max_diff=1e-2,
+ ):
if not self.test_xformers_attention:
return
components = self.get_dummy_components()
@@ -461,15 +450,14 @@ def _test_xformers_attention_forwardGenerator_pass(
output_with_xformers = output_with_xformers.numpy()
if hasattr(output_without_xformers, "numpy"):
output_without_xformers = output_without_xformers.numpy()
- max_diff = np.abs(output_with_xformers -
- output_without_xformers).max()
+ max_diff = np.abs(output_with_xformers - output_without_xformers).max()
self.assertLess(
max_diff,
expected_max_diff,
- "XFormers attention should not affect the inference results", )
+ "XFormers attention should not affect the inference results",
+ )
if test_mean_pixel_difference:
- assert_mean_pixel_difference(output_with_xformers[0],
- output_without_xformers[0])
+ assert_mean_pixel_difference(output_with_xformers[0], output_without_xformers[0])
def test_progress_bar(self):
components = self.get_dummy_components()
@@ -482,12 +470,12 @@ def test_progress_bar(self):
self.assertTrue(max_steps is not None and len(max_steps) > 0)
self.assertTrue(
f"{max_steps}/{max_steps}" in stderr,
- "Progress bar should be enabled and stopped at the max step", )
+ "Progress bar should be enabled and stopped at the max step",
+ )
pipe.set_progress_bar_config(disable=True)
with io.StringIO() as stderr, contextlib.redirect_stderr(stderr):
_ = pipe(**inputs)
- self.assertTrue(stderr.getvalue() == "",
- "Progress bar should be disabled")
+ self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled")
def test_num_images_per_prompt(self):
sig = inspect.signature(self.pipeline_class.__call__)
@@ -510,17 +498,13 @@ def test_num_images_per_prompt(self):
if key in self.batch_params:
inputs[key] = batch_size * [inputs[key]]
- images = pipe(
- **inputs,
- num_images_per_prompt=num_images_per_prompt).images
+ images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images
assert images.shape[0] == batch_size * num_images_per_prompt
def assert_mean_pixel_difference(image, expected_image):
- image = np.asarray(
- DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32)
- expected_image = np.asarray(
- DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32)
+ image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32)
+ expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32)
avg_diff = np.abs(image - expected_image).mean()
assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average"
diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py
index 23825d0855c71..b6cb10d5a3545 100644
--- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py
+++ b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py
@@ -19,9 +19,13 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (AutoencoderKL, DDIMScheduler,
- DPMSolverMultistepScheduler, TextToVideoSDPipeline,
- UNet3DConditionModel)
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ TextToVideoSDPipeline,
+ UNet3DConditionModel,
+)
from ppdiffusers.utils import load_numpy, slow
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -32,14 +36,16 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = TextToVideoSDPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
- required_optional_params = frozenset([
- "num_inference_steps",
- "generator",
- "latents",
- "return_dict",
- "callback",
- "callback_steps",
- ])
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "generator",
+ "latents",
+ "return_dict",
+ "callback",
+ "callback_steps",
+ ]
+ )
def get_dummy_components(self):
paddle.seed(0)
@@ -53,20 +59,24 @@ def get_dummy_components(self):
"CrossAttnDownBlock3D",
"CrossAttnDownBlock3D",
"CrossAttnDownBlock3D",
- "DownBlock3D", ),
+ "DownBlock3D",
+ ),
up_block_types=(
"UpBlock3D",
"CrossAttnUpBlock3D",
"CrossAttnUpBlock3D",
- "CrossAttnUpBlock3D", ),
+ "CrossAttnUpBlock3D",
+ ),
cross_attention_dim=32,
- attention_head_dim=4, )
+ attention_head_dim=4,
+ )
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
- set_alpha_to_one=False, )
+ set_alpha_to_one=False,
+ )
paddle.seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
@@ -75,7 +85,8 @@ def get_dummy_components(self):
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
- sample_size=128, )
+ sample_size=128,
+ )
paddle.seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
@@ -88,10 +99,10 @@ def get_dummy_components(self):
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
- projection_dim=512, )
+ projection_dim=512,
+ )
text_encoder = CLIPTextModel(text_encoder_config)
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
@@ -128,28 +139,20 @@ def test_text_to_video_default_case(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
def test_xformers_attention_forwardGenerator_pass(self):
- self._test_xformers_attention_forwardGenerator_pass(
- test_mean_pixel_difference=False)
+ self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
def test_attention_slicing_forward_pass(self):
- self._test_attention_slicing_forward_pass(
- test_mean_pixel_difference=False)
+ self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
- @unittest.skip(
- reason="Batching needs to be properly figured out first for this pipeline."
- )
+ @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_consistent(self):
pass
- @unittest.skip(
- reason="Batching needs to be properly figured out first for this pipeline."
- )
+ @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_single_identical(self):
pass
- @unittest.skip(
- reason="`num_images_per_prompt` argument is not supported for this pipeline."
- )
+ @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
def test_num_images_per_prompt(self):
pass
@@ -161,19 +164,13 @@ def test_full_model(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy"
)
pipe = TextToVideoSDPipeline.from_pretrained(
- "damo-vilab/text-to-video-ms-1.7b",
- from_hf_hub=True,
- from_diffusers=True)
- pipe.scheduler = DPMSolverMultistepScheduler.from_config(
- pipe.scheduler.config)
+ "damo-vilab/text-to-video-ms-1.7b", from_hf_hub=True, from_diffusers=True
+ )
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe
prompt = "Spiderman is surfing"
generator = paddle.Generator().manual_seed(0)
- video_frames = pipe(
- prompt,
- generator=generator,
- num_inference_steps=25,
- output_type="pd").frames
+ video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pd").frames
video = video_frames.cpu().numpy()
assert np.abs(expected_video - video).mean() < 0.8
@@ -181,15 +178,10 @@ def test_two_step_model(self):
expected_video = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy"
)
- pipe = TextToVideoSDPipeline.from_pretrained(
- "damo-vilab/text-to-video-ms-1.7b")
+ pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
pipe = pipe
prompt = "Spiderman is surfing"
generator = paddle.Generator().manual_seed(0)
- video_frames = pipe(
- prompt,
- generator=generator,
- num_inference_steps=2,
- output_type="pd").frames
+ video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pd").frames
video = video_frames.cpu().numpy()
assert np.abs(expected_video - video).mean() < 0.8
diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py
index 8387b54267696..121798ea45e07 100644
--- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py
+++ b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py
@@ -27,8 +27,7 @@
class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
def test_full_model(self):
model_id = "runwayml/stable-diffusion-v1-5"
- pipe = TextToVideoZeroPipeline.from_pretrained(
- model_id, torch_dtype="float16")
+ pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype="float16")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
generator = paddle.Generator().manual_seed(0)
prompt = "A bear is playing a guitar on Times Square"
diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip.py b/ppdiffusers/tests/pipelines/unclip/test_unclip.py
index 3f0b1a190c645..3e8d64094abd3 100644
--- a/ppdiffusers/tests/pipelines/unclip/test_unclip.py
+++ b/ppdiffusers/tests/pipelines/unclip/test_unclip.py
@@ -18,18 +18,25 @@
import numpy as np
import paddle
-from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModelWithProjection,
- CLIPTokenizer)
-
-from ppdiffusers import (PriorTransformer, UnCLIPPipeline, UnCLIPScheduler,
- UNet2DConditionModel, UNet2DModel)
+from paddlenlp.transformers import (
+ CLIPTextConfig,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+)
+
+from ppdiffusers import (
+ PriorTransformer,
+ UnCLIPPipeline,
+ UnCLIPScheduler,
+ UNet2DConditionModel,
+ UNet2DModel,
+)
from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from ppdiffusers.utils import slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import (PipelineTesterMixin,
- assert_mean_pixel_difference)
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@@ -44,13 +51,15 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
"cross_attention_kwargs",
}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
- required_optional_params = frozenset([
- "generator",
- "return_dict",
- "prior_num_inference_steps",
- "decoder_num_inference_steps",
- "super_res_num_inference_steps",
- ])
+ required_optional_params = frozenset(
+ [
+ "generator",
+ "return_dict",
+ "prior_num_inference_steps",
+ "decoder_num_inference_steps",
+ "super_res_num_inference_steps",
+ ]
+ )
test_xformers_attention = False
@property
@@ -75,8 +84,7 @@ def cross_attention_dim(self):
@property
def dummy_tokenizer(self):
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
@@ -92,7 +100,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModelWithProjection(config)
@property
@@ -127,13 +136,14 @@ def dummy_decoder(self):
"out_channels": 6,
"down_block_types": (
"ResnetDownsampleBlock2D",
- "SimpleCrossAttnDownBlock2D", ),
- "up_block_types":
- ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+ "SimpleCrossAttnDownBlock2D",
+ ),
+ "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
"mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
"block_out_channels": (
self.block_out_channels_0,
- self.block_out_channels_0 * 2, ),
+ self.block_out_channels_0 * 2,
+ ),
"layers_per_block": 1,
"cross_attention_dim": self.cross_attention_dim,
"attention_head_dim": 4,
@@ -148,13 +158,12 @@ def dummy_super_res_kwargs(self):
return {
"sample_size": 64,
"layers_per_block": 1,
- "down_block_types":
- ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
- "up_block_types":
- ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
+ "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
+ "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
"block_out_channels": (
self.block_out_channels_0,
- self.block_out_channels_0 * 2, ),
+ self.block_out_channels_0 * 2,
+ ),
"in_channels": 6,
"out_channels": 3,
}
@@ -183,15 +192,18 @@ def get_dummy_components(self):
variance_type="fixed_small_log",
prediction_type="sample",
num_train_timesteps=1000,
- clip_sample_range=5.0, )
+ clip_sample_range=5.0,
+ )
decoder_scheduler = UnCLIPScheduler(
variance_type="learned_range",
prediction_type="epsilon",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+ )
super_res_scheduler = UnCLIPScheduler(
variance_type="fixed_small_log",
prediction_type="epsilon",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+ )
components = {
"prior": prior,
"decoder": decoder,
@@ -229,20 +241,21 @@ def test_unclip(self):
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 2.6383996e-04,
- 9.9658674e-01,
- 1.1275411e-03,
- 2.6383996e-04,
- 2.6383996e-04,
- 9.9702907e-01,
- 9.9973619e-01,
- 9.9545717e-01,
- 2.6383996e-04,
- ])
+ expected_slice = np.array(
+ [
+ 2.6383996e-04,
+ 9.9658674e-01,
+ 1.1275411e-03,
+ 2.6383996e-04,
+ 2.6383996e-04,
+ 9.9702907e-01,
+ 9.9973619e-01,
+ 9.9545717e-01,
+ 2.6383996e-04,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_unclip_passed_text_embed(self):
class DummyScheduler:
@@ -264,29 +277,34 @@ class DummyScheduler:
dtype=dtype,
generator=generator,
latents=None,
- scheduler=DummyScheduler(), )
+ scheduler=DummyScheduler(),
+ )
shape = (
batch_size,
decoder.config.in_channels,
decoder.config.sample_size,
- decoder.config.sample_size, )
+ decoder.config.sample_size,
+ )
decoder_latents = pipe.prepare_latents(
shape,
dtype=dtype,
generator=generator,
latents=None,
- scheduler=DummyScheduler(), )
+ scheduler=DummyScheduler(),
+ )
shape = (
batch_size,
super_res_first.config.in_channels // 2,
super_res_first.config.sample_size,
- super_res_first.config.sample_size, )
+ super_res_first.config.sample_size,
+ )
super_res_latents = pipe.prepare_latents(
shape,
dtype=dtype,
generator=generator,
latents=None,
- scheduler=DummyScheduler(), )
+ scheduler=DummyScheduler(),
+ )
pipe.set_progress_bar_config(disable=None)
prompt = "this is a prompt example"
generator = paddle.Generator().manual_seed(0)
@@ -299,14 +317,16 @@ class DummyScheduler:
prior_latents=prior_latents,
decoder_latents=decoder_latents,
super_res_latents=super_res_latents,
- output_type="np", )
+ output_type="np",
+ )
image = output.images
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
return_attention_mask=True,
- return_tensors="pd", )
+ return_tensors="pd",
+ )
text_model_output = text_encoder(text_inputs.input_ids)
text_attention_mask = text_inputs.attention_mask
generator = paddle.Generator().manual_seed(0)
@@ -320,13 +340,13 @@ class DummyScheduler:
super_res_latents=super_res_latents,
text_model_output=text_model_output,
text_attention_mask=text_attention_mask,
- output_type="np", )[0]
+ output_type="np",
+ )[0]
assert np.abs(image - image_from_text).max() < 0.0001
def test_attention_slicing_forward_pass(self):
test_max_difference = False
- self._test_attention_slicing_forward_pass(
- test_max_difference=test_max_difference, expected_max_diff=0.01)
+ self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01)
def test_inference_batch_single_identical(self):
test_max_difference = False
@@ -365,8 +385,7 @@ def tearDown(self):
def test_unclip_karlo(self):
# Hard code image
- expected_image = np.array([[0.73281264, 0.69175875, 0.64672112],
- [0.71919304, 0.65395129, 0.60436499]])
+ expected_image = np.array([[0.73281264, 0.69175875, 0.64672112], [0.71919304, 0.65395129, 0.60436499]])
pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")
pipeline.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py
index 2bbb56cfad604..e09f906a7f87d 100644
--- a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py
+++ b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py
@@ -20,32 +20,41 @@
import numpy as np
import paddle
from paddlenlp.transformers import (
- CLIPImageProcessor, CLIPTextConfig, CLIPTextModelWithProjection,
- CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection)
-
-from ppdiffusers import (DiffusionPipeline, UnCLIPImageVariationPipeline,
- UnCLIPScheduler, UNet2DConditionModel, UNet2DModel)
+ CLIPImageProcessor,
+ CLIPTextConfig,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionConfig,
+ CLIPVisionModelWithProjection,
+)
+
+from ppdiffusers import (
+ DiffusionPipeline,
+ UnCLIPImageVariationPipeline,
+ UnCLIPScheduler,
+ UNet2DConditionModel,
+ UNet2DModel,
+)
from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from ppdiffusers.utils import floats_tensor, slow
from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu
-from ..pipeline_params import (IMAGE_VARIATION_BATCH_PARAMS,
- IMAGE_VARIATION_PARAMS)
-from ..test_pipelines_common import (PipelineTesterMixin,
- assert_mean_pixel_difference)
+from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin,
- unittest.TestCase):
+class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = UnCLIPImageVariationPipeline
params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"}
batch_params = IMAGE_VARIATION_BATCH_PARAMS
- required_optional_params = frozenset([
- "generator",
- "return_dict",
- "decoder_num_inference_steps",
- "super_res_num_inference_steps",
- ])
+ required_optional_params = frozenset(
+ [
+ "generator",
+ "return_dict",
+ "decoder_num_inference_steps",
+ "super_res_num_inference_steps",
+ ]
+ )
test_xformers_attention = False
@property
@@ -70,8 +79,7 @@ def cross_attention_dim(self):
@property
def dummy_tokenizer(self):
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
@@ -87,7 +95,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModelWithProjection(config)
@property
@@ -100,7 +109,8 @@ def dummy_image_encoder(self):
num_attention_heads=4,
image_size=32,
intermediate_size=37,
- patch_size=1, )
+ patch_size=1,
+ )
return CLIPVisionModelWithProjection(config)
@property
@@ -123,13 +133,14 @@ def dummy_decoder(self):
"out_channels": 6,
"down_block_types": (
"ResnetDownsampleBlock2D",
- "SimpleCrossAttnDownBlock2D", ),
- "up_block_types":
- ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+ "SimpleCrossAttnDownBlock2D",
+ ),
+ "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
"mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
"block_out_channels": (
self.block_out_channels_0,
- self.block_out_channels_0 * 2, ),
+ self.block_out_channels_0 * 2,
+ ),
"layers_per_block": 1,
"cross_attention_dim": self.cross_attention_dim,
"attention_head_dim": 4,
@@ -144,13 +155,12 @@ def dummy_super_res_kwargs(self):
return {
"sample_size": 64,
"layers_per_block": 1,
- "down_block_types":
- ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
- "up_block_types":
- ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
+ "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
+ "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
"block_out_channels": (
self.block_out_channels_0,
- self.block_out_channels_0 * 2, ),
+ self.block_out_channels_0 * 2,
+ ),
"in_channels": 6,
"out_channels": 3,
}
@@ -177,11 +187,13 @@ def get_dummy_components(self):
decoder_scheduler = UnCLIPScheduler(
variance_type="learned_range",
prediction_type="epsilon",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+ )
super_res_scheduler = UnCLIPScheduler(
variance_type="fixed_small_log",
prediction_type="epsilon",
- num_train_timesteps=1000, )
+ num_train_timesteps=1000,
+ )
feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
image_encoder = self.dummy_image_encoder
return {
@@ -207,8 +219,7 @@ def get_dummy_inputs(self, seed=0, pil_image=True):
if pil_image:
input_image = input_image * 0.5 + 0.5
input_image = input_image.clip(min=0, max=1)
- input_image = (input_image.cpu().transpose(
- perm=[0, 2, 3, 1]).cast("float32").numpy())
+ input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy()
input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
return {
"image": input_image,
@@ -230,20 +241,21 @@ def test_unclip_image_variation_input_tensor(self):
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 2.7585030e-03,
- 2.6383996e-04,
- 9.9801058e-01,
- 2.6383996e-04,
- 9.9531418e-01,
- 9.9220645e-01,
- 3.6702752e-03,
- 9.9970925e-01,
- 9.9973619e-01,
- ])
+ expected_slice = np.array(
+ [
+ 2.7585030e-03,
+ 2.6383996e-04,
+ 9.9801058e-01,
+ 2.6383996e-04,
+ 9.9531418e-01,
+ 9.9220645e-01,
+ 3.6702752e-03,
+ 9.9970925e-01,
+ 9.9973619e-01,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_unclip_image_variation_input_image(self):
components = self.get_dummy_components()
@@ -257,28 +269,28 @@ def test_unclip_image_variation_input_image(self):
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
- expected_slice = np.array([
- 5.2168965e-04,
- 9.9861604e-01,
- 9.9755847e-01,
- 9.9804187e-01,
- 9.9411416e-01,
- 9.9248302e-01,
- 9.9973619e-01,
- 9.9777901e-01,
- 9.9973619e-01,
- ])
+ expected_slice = np.array(
+ [
+ 5.2168965e-04,
+ 9.9861604e-01,
+ 9.9755847e-01,
+ 9.9804187e-01,
+ 9.9411416e-01,
+ 9.9248302e-01,
+ 9.9973619e-01,
+ 9.9777901e-01,
+ 9.9973619e-01,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_unclip_image_variation_input_list_images(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipeline_inputs = self.get_dummy_inputs(pil_image=True)
- pipeline_inputs[
- "image"] = [pipeline_inputs["image"], pipeline_inputs["image"]]
+ pipeline_inputs["image"] = [pipeline_inputs["image"], pipeline_inputs["image"]]
output = pipe(**pipeline_inputs)
image = output.images
tuple_pipeline_inputs = self.get_dummy_inputs(pil_image=True)
@@ -290,20 +302,21 @@ def test_unclip_image_variation_input_list_images(self):
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (2, 64, 64, 3)
- expected_slice = np.array([
- 5.2201748e-04,
- 9.9861759e-01,
- 9.9755961e-01,
- 9.9804127e-01,
- 9.9411547e-01,
- 9.9248385e-01,
- 9.9973619e-01,
- 9.9777836e-01,
- 9.9973619e-01,
- ])
+ expected_slice = np.array(
+ [
+ 5.2201748e-04,
+ 9.9861759e-01,
+ 9.9755961e-01,
+ 9.9804127e-01,
+ 9.9411547e-01,
+ 9.9248385e-01,
+ 9.9973619e-01,
+ 9.9777836e-01,
+ 9.9973619e-01,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_unclip_passed_image_embed(self):
class DummyScheduler:
@@ -319,29 +332,34 @@ class DummyScheduler:
batch_size,
pipe.decoder.config.in_channels,
pipe.decoder.config.sample_size,
- pipe.decoder.config.sample_size, )
+ pipe.decoder.config.sample_size,
+ )
decoder_latents = pipe.prepare_latents(
shape,
dtype=dtype,
generator=generator,
latents=None,
- scheduler=DummyScheduler(), )
+ scheduler=DummyScheduler(),
+ )
shape = (
batch_size,
pipe.super_res_first.config.in_channels // 2,
pipe.super_res_first.config.sample_size,
- pipe.super_res_first.config.sample_size, )
+ pipe.super_res_first.config.sample_size,
+ )
super_res_latents = pipe.prepare_latents(
shape,
dtype=dtype,
generator=generator,
latents=None,
- scheduler=DummyScheduler(), )
+ scheduler=DummyScheduler(),
+ )
pipeline_inputs = self.get_dummy_inputs(pil_image=False)
img_out_1 = pipe(
**pipeline_inputs,
decoder_latents=decoder_latents,
- super_res_latents=super_res_latents, ).images
+ super_res_latents=super_res_latents,
+ ).images
pipeline_inputs = self.get_dummy_inputs(pil_image=False)
image = pipeline_inputs.pop("image")
image_embeddings = pipe.image_encoder(image).image_embeds
@@ -349,7 +367,8 @@ class DummyScheduler:
**pipeline_inputs,
decoder_latents=decoder_latents,
super_res_latents=super_res_latents,
- image_embeddings=image_embeddings, ).images
+ image_embeddings=image_embeddings,
+ ).images
assert np.abs(img_out_1 - img_out_2).max() < 0.0001
def test_attention_slicing_forward_pass(self):
@@ -358,8 +377,8 @@ def test_attention_slicing_forward_pass(self):
expected_max_diff = 1e-2
self._test_attention_slicing_forward_pass(
- test_max_difference=test_max_difference,
- expected_max_diff=expected_max_diff)
+ test_max_difference=test_max_difference, expected_max_diff=expected_max_diff
+ )
def test_inference_batch_single_identical(self):
test_max_difference = False
@@ -398,11 +417,9 @@ def test_unclip_image_variation_karlo(self):
input_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png"
)
- expected_image = np.array([[0.09096909, 0.13343304, 0.26244187],
- [0.15095001, 0.19459972, 0.3182609]])
+ expected_image = np.array([[0.09096909, 0.13343304, 0.26244187], [0.15095001, 0.19459972, 0.3182609]])
# TODO(wugaosheng): test this function
- pipeline = UnCLIPImageVariationPipeline.from_pretrained(
- "kakaobrain/karlo-v1-alpha-image-variations")
+ pipeline = UnCLIPImageVariationPipeline.from_pretrained("kakaobrain/karlo-v1-alpha-image-variations")
pipeline.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
output = pipeline(input_image, generator=generator, output_type="np")
diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py
index 35b1372d082b8..c3906861b23a7 100644
--- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py
+++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py
@@ -21,8 +21,7 @@
import paddle
from ppdiffusers import VersatileDiffusionDualGuidedPipeline
-from ppdiffusers.utils.testing_utils import (load_image, nightly,
- require_paddle_gpu)
+from ppdiffusers.utils.testing_utils import load_image, nightly, require_paddle_gpu
@nightly
@@ -34,8 +33,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_remove_unused_weights_save_load(self):
- pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+ pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.remove_unused_weights()
pipe.set_progress_bar_config(disable=None)
second_prompt = load_image(
@@ -49,11 +47,11 @@ def test_remove_unused_weights_save_load(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=2,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(
- tmpdirname, from_diffusers=False)
+ pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname, from_diffusers=False)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
new_image = pipe(
@@ -63,13 +61,12 @@ def test_remove_unused_weights_save_load(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=2,
- output_type="numpy", ).images
- assert (np.abs(image - new_image).sum() < 1e-05
- ), "Models don't have the same forward pass"
+ output_type="numpy",
+ ).images
+ assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass"
def test_inference_dual_guided(self):
- pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+ pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.remove_unused_weights()
pipe.set_progress_bar_config(disable=None)
first_prompt = "cyberpunk 2077"
@@ -84,18 +81,21 @@ def test_inference_dual_guided(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=50,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.01500076,
- 0.01142624,
- 0.01418972,
- 0.01518875,
- 0.01114869,
- 0.01190853,
- 0.02978998,
- 0.02376354,
- 0.02396089,
- ])
+ expected_slice = np.array(
+ [
+ 0.01500076,
+ 0.01142624,
+ 0.01418972,
+ 0.01518875,
+ 0.01114869,
+ 0.01190853,
+ 0.02978998,
+ 0.02376354,
+ 0.02396089,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
index fbc38ee9f49a1..8335bdf260d7a 100644
--- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
+++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
@@ -19,8 +19,7 @@
import paddle
from ppdiffusers import VersatileDiffusionImageVariationPipeline
-from ppdiffusers.utils.testing_utils import (load_image, require_paddle_gpu,
- slow)
+from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu, slow
class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase):
@@ -29,11 +28,9 @@ class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase):
@slow
@require_paddle_gpu
-class VersatileDiffusionImageVariationPipelineIntegrationTests(
- unittest.TestCase):
+class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase):
def test_inference_image_variations(self):
- pipe = VersatileDiffusionImageVariationPipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+ pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.set_progress_bar_config(disable=None)
image_prompt = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg"
@@ -44,18 +41,21 @@ def test_inference_image_variations(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=50,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([
- 0.12047189,
- 0.19138041,
- 0.22884357,
- 0.08833978,
- 0.1594424,
- 0.16826832,
- 0.07032129,
- 0.14926612,
- 0.12981007,
- ])
+ expected_slice = np.array(
+ [
+ 0.12047189,
+ 0.19138041,
+ 0.22884357,
+ 0.08833978,
+ 0.1594424,
+ 0.16826832,
+ 0.07032129,
+ 0.14926612,
+ 0.12981007,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
index ed49997b5a89b..aab7e81ba0c40 100644
--- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
+++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
@@ -21,8 +21,7 @@
import paddle
from ppdiffusers import VersatileDiffusionPipeline
-from ppdiffusers.utils.testing_utils import (load_image, nightly,
- require_paddle_gpu)
+from ppdiffusers.utils.testing_utils import load_image, nightly, require_paddle_gpu
class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase):
@@ -38,8 +37,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_from_save_pretrained(self):
- pipe = VersatileDiffusionPipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+ pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.set_progress_bar_config(disable=None)
prompt_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg"
@@ -52,11 +50,11 @@ def test_from_save_pretrained(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=2,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe = VersatileDiffusionPipeline.from_pretrained(
- tmpdirname, from_diffusers=False)
+ pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
new_image = pipe.dual_guided(
@@ -66,13 +64,12 @@ def test_from_save_pretrained(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=2,
- output_type="numpy", ).images
- assert (np.abs(image - new_image).sum() < 1e-05
- ), "Models don't have the same forward pass"
+ output_type="numpy",
+ ).images
+ assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass"
def test_inference_dual_guided_then_text_to_image(self):
- pipe = VersatileDiffusionPipeline.from_pretrained(
- "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16)
+ pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", paddle_dtype=paddle.float16)
pipe.set_progress_bar_config(disable=None)
prompt = "cyberpunk 2077"
init_image = load_image(
@@ -86,21 +83,24 @@ def test_inference_dual_guided_then_text_to_image(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=50,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 512, 512, 3)
# expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001])
- expected_slice = np.array([
- 0.03100586,
- 0.02929688,
- 0.03271484,
- 0.02807617,
- 0.02905273,
- 0.03173828,
- 0.02685547,
- 0.02807617,
- 0.03271484,
- ])
+ expected_slice = np.array(
+ [
+ 0.03100586,
+ 0.02929688,
+ 0.03271484,
+ 0.02807617,
+ 0.02905273,
+ 0.03173828,
+ 0.02685547,
+ 0.02807617,
+ 0.03271484,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1
prompt = "A painting of a squirrel eating a burger "
generator = paddle.Generator().manual_seed(0)
@@ -109,36 +109,40 @@ def test_inference_dual_guided_then_text_to_image(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=50,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 512, 512, 3)
# expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.387, 0.479, 0.3796, 0.4009, 0.4878, 0.4778])
- expected_slice = np.array([
- 0.0390625,
- 0.00854492,
- 0.0,
- 0.03930664,
- 0.00878906,
- 0.04711914,
- 0.03686523,
- 0.0,
- 0.0246582,
- ])
+ expected_slice = np.array(
+ [
+ 0.0390625,
+ 0.00854492,
+ 0.0,
+ 0.03930664,
+ 0.00878906,
+ 0.04711914,
+ 0.03686523,
+ 0.0,
+ 0.0246582,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1
- image = pipe.image_variation(
- init_image, generator=generator, output_type="numpy").images
+ image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 512, 512, 3)
# expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.377, 0.3894, 0.4297, 0.4331, 0.4456])
- expected_slice = np.array([
- 0.34472656,
- 0.1940918,
- 0.10546875,
- 0.38134766,
- 0.24560547,
- 0.13208008,
- 0.38867188,
- 0.30566406,
- 0.18188477,
- ])
+ expected_slice = np.array(
+ [
+ 0.34472656,
+ 0.1940918,
+ 0.10546875,
+ 0.38134766,
+ 0.24560547,
+ 0.13208008,
+ 0.38867188,
+ 0.30566406,
+ 0.18188477,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1
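
The mega-pipeline test above loads the model with paddle_dtype=paddle.float16 and correspondingly compares slices with a looser 0.1 tolerance than the 0.01 used by the float32 tests elsewhere in this diff. A rough, self-contained illustration of the precision loss half floats introduce (one contributing factor, alongside kernel nondeterminism):

import numpy as np

x = np.random.default_rng(0).random(9).astype(np.float32)
roundtrip_err = np.abs(x - x.astype(np.float16).astype(np.float32)).max()
print(roundtrip_err)   # on the order of 1e-4 for values in [0, 1)
assert roundtrip_err < 0.01
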
diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
index fbe47142eafcb..c95b30030f3d5 100644
--- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
+++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
@@ -37,8 +37,7 @@ def tearDown(self):
paddle.device.cuda.empty_cache()
def test_remove_unused_weights_save_load(self):
- pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+ pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.remove_unused_weights()
pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger "
@@ -48,11 +47,11 @@ def test_remove_unused_weights_save_load(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=2,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
- pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
- tmpdirname, from_diffusers=False)
+ pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname, from_diffusers=False)
pipe.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
new_image = pipe(
@@ -60,13 +59,12 @@ def test_remove_unused_weights_save_load(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=2,
- output_type="numpy", ).images
- assert (np.abs(image - new_image).sum() < 1e-05
- ), "Models don't have the same forward pass"
+ output_type="numpy",
+ ).images
+ assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass"
def test_inference_text2img(self):
- pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
- "shi-labs/versatile-diffusion")
+ pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger "
generator = paddle.Generator().manual_seed(0)
@@ -75,19 +73,22 @@ def test_inference_text2img(self):
generator=generator,
guidance_scale=7.5,
num_inference_steps=50,
- output_type="numpy", ).images
+ output_type="numpy",
+ ).images
image_slice = image[0, 253:256, 253:256, -1]
assert image.shape == (1, 512, 512, 3)
# expected_slice = np.array([0.3493, 0.3757, 0.4093, 0.4495, 0.4233, 0.4102, 0.4507, 0.4756, 0.4787])
- expected_slice = np.array([
- 0.0390625,
- 0.00854492,
- 0.0,
- 0.03930664,
- 0.00878906,
- 0.04711914,
- 0.03686523,
- 0.0,
- 0.0246582,
- ])
+ expected_slice = np.array(
+ [
+ 0.0390625,
+ 0.00854492,
+ 0.0,
+ 0.03930664,
+ 0.00878906,
+ 0.04711914,
+ 0.03686523,
+ 0.0,
+ 0.0246582,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
diff --git a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py
index 5c65fd95fc95f..c17b7fd1d7257 100644
--- a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py
+++ b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py
@@ -20,10 +20,15 @@
import paddle
from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (Transformer2DModel, VQDiffusionPipeline,
- VQDiffusionScheduler, VQModel)
-from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import \
- LearnedClassifierFreeSamplingEmbeddings
+from ppdiffusers import (
+ Transformer2DModel,
+ VQDiffusionPipeline,
+ VQDiffusionScheduler,
+ VQModel,
+)
+from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import (
+ LearnedClassifierFreeSamplingEmbeddings,
+)
from ppdiffusers.utils import load_numpy, slow
from ppdiffusers.utils.testing_utils import require_paddle_gpu
@@ -57,13 +62,13 @@ def dummy_vqvae(self):
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=3,
num_vq_embeddings=self.num_embed,
- vq_embed_dim=3, )
+ vq_embed_dim=3,
+ )
return model
@property
def dummy_tokenizer(self):
- tokenizer = CLIPTokenizer.from_pretrained(
- "hf-internal-testing/tiny-random-clip")
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
@@ -78,7 +83,8 @@ def dummy_text_encoder(self):
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
- vocab_size=1000, )
+ vocab_size=1000,
+ )
return CLIPTextModel(config).eval()
@property
@@ -106,8 +112,7 @@ def test_vq_diffusion(self):
tokenizer = self.dummy_tokenizer
transformer = self.dummy_transformer
scheduler = VQDiffusionScheduler(self.num_embed)
- learned_classifier_free_sampling_embeddings = (
- LearnedClassifierFreeSamplingEmbeddings(learnable=False))
+ learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=False)
pipe = VQDiffusionPipeline(
vqvae=vqvae,
text_encoder=text_encoder,
@@ -119,11 +124,7 @@ def test_vq_diffusion(self):
pipe.set_progress_bar_config(disable=None)
prompt = "teddy bear playing in the pool"
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- [prompt],
- generator=generator,
- num_inference_steps=2,
- output_type="np")
+ output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np")
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = pipe(
@@ -131,24 +132,26 @@ def test_vq_diffusion(self):
generator=generator,
output_type="np",
return_dict=False,
- num_inference_steps=2, )[0]
+ num_inference_steps=2,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 24, 24, 3)
- expected_slice = np.array([
- 0.5900591,
- 0.83443725,
- 0.4418438,
- 0.604656,
- 0.89781034,
- 0.40088692,
- 0.6107253,
- 0.87849474,
- 0.64088374,
- ])
+ expected_slice = np.array(
+ [
+ 0.5900591,
+ 0.83443725,
+ 0.4418438,
+ 0.604656,
+ 0.89781034,
+ 0.40088692,
+ 0.6107253,
+ 0.87849474,
+ 0.64088374,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
def test_vq_diffusion_classifier_free_sampling(self):
vqvae = self.dummy_vqvae
@@ -156,11 +159,11 @@ def test_vq_diffusion_classifier_free_sampling(self):
tokenizer = self.dummy_tokenizer
transformer = self.dummy_transformer
scheduler = VQDiffusionScheduler(self.num_embed)
- learned_classifier_free_sampling_embeddings = (
- LearnedClassifierFreeSamplingEmbeddings(
- learnable=True,
- hidden_size=self.text_embedder_hidden_size,
- length=tokenizer.model_max_length, ))
+ learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(
+ learnable=True,
+ hidden_size=self.text_embedder_hidden_size,
+ length=tokenizer.model_max_length,
+ )
pipe = VQDiffusionPipeline(
vqvae=vqvae,
text_encoder=text_encoder,
@@ -172,11 +175,7 @@ def test_vq_diffusion_classifier_free_sampling(self):
pipe.set_progress_bar_config(disable=None)
prompt = "teddy bear playing in the pool"
generator = paddle.Generator().manual_seed(0)
- output = pipe(
- [prompt],
- generator=generator,
- num_inference_steps=2,
- output_type="np")
+ output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np")
image = output.images
generator = paddle.Generator().manual_seed(0)
image_from_tuple = pipe(
@@ -184,24 +183,26 @@ def test_vq_diffusion_classifier_free_sampling(self):
generator=generator,
output_type="np",
return_dict=False,
- num_inference_steps=2, )[0]
+ num_inference_steps=2,
+ )[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 24, 24, 3)
- expected_slice = np.array([
- 0.61711097,
- 0.8419658,
- 0.5493732,
- 0.64064896,
- 0.97944254,
- 0.5611503,
- 0.6145399,
- 0.7063037,
- 0.54406035,
- ])
+ expected_slice = np.array(
+ [
+ 0.61711097,
+ 0.8419658,
+ 0.5493732,
+ 0.64064896,
+ 0.97944254,
+ 0.5611503,
+ 0.6145399,
+ 0.7063037,
+ 0.54406035,
+ ]
+ )
assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max(
- ) < 0.01
+ assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01
@slow
@@ -216,8 +217,7 @@ def test_vq_diffusion_classifier_free_sampling(self):
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy"
)
- pipeline = VQDiffusionPipeline.from_pretrained(
- "microsoft/vq-diffusion-ithq")
+ pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq")
pipeline = pipeline
pipeline.set_progress_bar_config(disable=None)
generator = paddle.Generator().manual_seed(0)
@@ -225,7 +225,8 @@ def test_vq_diffusion_classifier_free_sampling(self):
"teddy bear playing in the pool",
num_images_per_prompt=1,
generator=generator,
- output_type="np", )
+ output_type="np",
+ )
image = output.images[0]
assert image.shape == (256, 256, 3)
assert np.abs(expected_image - image).max() < 0.01
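
The VQ-Diffusion tests above also check that calling the pipeline with return_dict=False returns the same image as the first element of the default output object. A tiny pure-Python stand-in for that contract (FakeOutput and fake_pipe below are illustrative, not the library's classes):

from dataclasses import dataclass

@dataclass
class FakeOutput:
    images: list

def fake_pipe(prompt, return_dict=True):
    images = [[0.59, 0.83, 0.44]]                 # deterministic stand-in for generated pixels
    out = FakeOutput(images=images)
    return out if return_dict else (out.images,)

default_out = fake_pipe("teddy bear playing in the pool")
tuple_out = fake_pipe("teddy bear playing in the pool", return_dict=False)
assert tuple_out[0] == default_out.images
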
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py
index ce993b9501fb1..c578c2ffb27cd 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py
@@ -20,7 +20,7 @@
class DDIMSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (DDIMScheduler, )
+ scheduler_classes = (DDIMScheduler,)
forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50))
def get_scheduler_config(self, **kwargs):
@@ -65,12 +65,10 @@ def test_steps_offset(self):
scheduler_config = self.get_scheduler_config(steps_offset=1)
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(5)
- assert paddle.equal_all(scheduler.timesteps,
- paddle.to_tensor([801, 601, 401, 201, 1]))
+ assert paddle.equal_all(scheduler.timesteps, paddle.to_tensor([801, 601, 401, 201, 1]))
def test_betas(self):
- for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1],
- [0.002, 0.02, 0.2, 2]):
+ for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -92,7 +90,8 @@ def test_thresholding(self):
self.check_over_configs(
thresholding=True,
prediction_type=prediction_type,
- sample_max_value=threshold, )
+ sample_max_value=threshold,
+ )
def test_time_indices(self):
for t in [1, 10, 49]:
@@ -100,8 +99,7 @@ def test_time_indices(self):
def test_inference_steps(self):
for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]):
- self.check_over_forward(
- time_step=t, num_inference_steps=num_inference_steps)
+ self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
def test_eta(self):
for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]):
@@ -112,18 +110,12 @@ def test_variance(self):
scheduler_config = self.get_scheduler_config()
scheduler = scheduler_class(**scheduler_config)
- assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) -
- 0.0)) < 1e-5
- assert (paddle.sum(
- paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5)
- assert (paddle.sum(
- paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5)
- assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) -
- 0.0)) < 1e-5
- assert (paddle.sum(
- paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5)
- assert paddle.sum(
- paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5
def test_full_loop_no_noise(self):
sample = self.full_loop()
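
test_variance above pins _get_variance to fixed values. Under the defaults these tests appear to use (linear betas from 0.0001 to 0.02 over 1000 training steps), the standard DDIM posterior-variance formula reproduces them; a hedged NumPy sketch, not the scheduler's implementation:

import numpy as np

betas = np.linspace(0.0001, 0.02, 1000)
alphas_cumprod = np.cumprod(1.0 - betas)

def ddim_variance(t, prev_t):
    alpha_t = alphas_cumprod[t]
    alpha_prev = alphas_cumprod[prev_t] if prev_t >= 0 else 1.0
    beta_t = 1.0 - alpha_t / alpha_prev
    return (1.0 - alpha_prev) / (1.0 - alpha_t) * beta_t

print(ddim_variance(420, 400))  # ~0.1477
print(ddim_variance(487, 486))  # ~0.0098
print(ddim_variance(999, 998))  # ~0.02
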
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py
index e9fa28609abda..9768d50cc5dbc 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py
@@ -20,7 +20,7 @@
class DDPMSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (DDPMScheduler, )
+ scheduler_classes = (DDPMScheduler,)
def get_scheduler_config(self, **kwargs):
config = {
@@ -40,8 +40,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1],
- [0.002, 0.02, 0.2, 2]):
+ for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -63,7 +62,8 @@ def test_thresholding(self):
self.check_over_configs(
thresholding=True,
prediction_type=prediction_type,
- sample_max_value=threshold, )
+ sample_max_value=threshold,
+ )
def test_prediction_type(self):
for prediction_type in ["epsilon", "sample", "v_prediction"]:
@@ -79,10 +79,8 @@ def test_variance(self):
scheduler = scheduler_class(**scheduler_config)
assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 0.0)) < 1e-5
- assert paddle.sum(paddle.abs(scheduler._get_variance(487) -
- 0.00979)) < 1e-5
- assert paddle.sum(paddle.abs(scheduler._get_variance(999) -
- 0.02)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.02)) < 1e-5
def test_full_loop_no_noise(self):
scheduler_class = self.scheduler_classes[0]
@@ -100,8 +98,7 @@ def test_full_loop_no_noise(self):
residual = model(sample, t)
# 2. predict previous mean of sample x_t-1
- pred_prev_sample = scheduler.step(
- residual, t, sample, generator=generator).prev_sample
+ pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
# if t > 0:
# noise = self.dummy_sample_deter
@@ -118,8 +115,7 @@ def test_full_loop_no_noise(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
num_trained_timesteps = len(scheduler)
@@ -133,8 +129,7 @@ def test_full_loop_with_v_prediction(self):
residual = model(sample, t)
# 2. predict previous mean of sample x_t-1
- pred_prev_sample = scheduler.step(
- residual, t, sample, generator=generator).prev_sample
+ pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
# if t > 0:
# noise = self.dummy_sample_deter
@@ -178,13 +173,10 @@ def test_custom_timesteps_increasing_order(self):
timesteps = [100, 87, 50, 51, 0]
- with self.assertRaises(
- ValueError,
- msg="`custom_timesteps` must be in descending order."):
+ with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."):
scheduler.set_timesteps(timesteps=timesteps)
- def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(
- self):
+ def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()
scheduler = scheduler_class(**scheduler_config)
@@ -193,11 +185,10 @@ def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(
num_inference_steps = len(timesteps)
with self.assertRaises(
- ValueError,
- msg="Can only pass one of `num_inference_steps` or `custom_timesteps`.",
+ ValueError,
+ msg="Can only pass one of `num_inference_steps` or `custom_timesteps`.",
):
- scheduler.set_timesteps(
- num_inference_steps=num_inference_steps, timesteps=timesteps)
+ scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps)
def test_custom_timesteps_too_large(self):
scheduler_class = self.scheduler_classes[0]
@@ -207,7 +198,7 @@ def test_custom_timesteps_too_large(self):
timesteps = [scheduler.config.num_train_timesteps]
with self.assertRaises(
- ValueError,
- msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
+ ValueError,
+ msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}",
):
scheduler.set_timesteps(timesteps=timesteps)
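
The custom-timesteps cases above encode a simple rule: user-supplied timesteps must be strictly descending (noisy to clean), cannot be combined with num_inference_steps, and cannot exceed the training range. A stand-alone sketch of just the ordering check (not the library's implementation):

def validate_custom_timesteps(timesteps):
    for prev, cur in zip(timesteps, timesteps[1:]):
        if cur >= prev:
            raise ValueError("`custom_timesteps` must be in descending order.")

validate_custom_timesteps([100, 87, 50, 1, 0])       # ok
try:
    validate_custom_timesteps([100, 87, 50, 51, 0])   # 50 -> 51 breaks the order
except ValueError as err:
    print(err)
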
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_deis.py b/ppdiffusers/tests/schedulers/test_scheduler_deis.py
index b40af9f177525..7ea11c2198020 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_deis.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_deis.py
@@ -16,15 +16,19 @@
import paddle
-from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler,
- DPMSolverSinglestepScheduler, UniPCMultistepScheduler)
+from ppdiffusers import (
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ UniPCMultistepScheduler,
+)
from .test_schedulers import SchedulerCommonTest
class DEISMultistepSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (DEISMultistepScheduler, )
- forward_default_kwargs = (("num_inference_steps", 25), )
+ scheduler_classes = (DEISMultistepScheduler,)
+ forward_default_kwargs = (("num_inference_steps", 25),)
def get_scheduler_config(self, **kwargs):
config = {
@@ -43,38 +47,28 @@ def check_over_configs(self, time_step=0, **config):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config(**config)
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
output, new_output = sample, sample
- for t in range(time_step,
- time_step + scheduler.config.solver_order + 1):
- output = scheduler.step(residual, t, output,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, t, new_output,
- **kwargs).prev_sample
+ for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+ output = scheduler.step(residual, t, output, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_from_save_pretrained(self):
pass
@@ -84,9 +78,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config()
@@ -94,8 +86,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals (must be after setting timesteps)
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
@@ -104,18 +95,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residual (must be after setting timesteps)
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def full_loop(self, scheduler=None, **config):
if scheduler is None:
@@ -150,27 +135,20 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# copy over dummy past residuals (must be done after set_timesteps)
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
time_step_0 = scheduler.timesteps[5]
time_step_1 = scheduler.timesteps[6]
- output_0 = scheduler.step(residual, time_step_0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step(residual, time_step_1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
@@ -210,7 +188,8 @@ def test_thresholding(self):
sample_max_value=threshold,
algorithm_type="deis",
solver_order=order,
- solver_type=solver_type, )
+ solver_type=solver_type,
+ )
def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
@@ -225,14 +204,15 @@ def test_solver_order_and_type(self):
solver_order=order,
solver_type=solver_type,
prediction_type=prediction_type,
- algorithm_type=algorithm_type, )
+ algorithm_type=algorithm_type,
+ )
sample = self.full_loop(
solver_order=order,
solver_type=solver_type,
prediction_type=prediction_type,
- algorithm_type=algorithm_type, )
- assert not paddle.isnan(sample).any(
- ), "Samples have nan numbers"
+ algorithm_type=algorithm_type,
+ )
+ assert not paddle.isnan(sample).any(), "Samples have nan numbers"
def test_lower_order_final(self):
self.check_over_configs(lower_order_final=True)
@@ -240,8 +220,7 @@ def test_lower_order_final(self):
def test_inference_steps(self):
for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
- self.check_over_forward(
- num_inference_steps=num_inference_steps, time_step=0)
+ self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
def test_full_loop_no_noise(self):
sample = self.full_loop()
@@ -257,8 +236,7 @@ def test_full_loop_with_v_prediction(self):
def test_fp16_support(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- thresholding=True, dynamic_thresholding_ratio=0)
+ scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
scheduler = scheduler_class(**scheduler_config)
num_inference_steps = 10
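
Several hunks above copy dummy_past_residuals[: solver_order] into both the original and the reloaded scheduler before comparing steps. Multistep solvers carry a window of previous model outputs as state, so two instances only agree if that window is identical. A toy two-step Adams-Bashforth update (a stand-in for illustration, not the DEIS formula) makes the point:

import numpy as np

def ab2_step(sample, f_curr, f_prev, h=0.1):
    # generic 2-step linear-multistep update: the result depends on the previous output too
    return sample + h * (1.5 * f_curr - 0.5 * f_prev)

sample = np.ones(4)
residual = 0.1 * sample
history = [residual + 0.2, residual + 0.15]       # mirrors dummy_past_residuals[:solver_order]
restored_history = list(history)                  # the "reloaded" instance gets the same window
out = ab2_step(sample, residual, history[-1])
restored_out = ab2_step(sample, residual, restored_history[-1])
assert np.abs(out - restored_out).sum() < 1e-5, "Scheduler outputs are not identical"
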
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py
index 8935cd0ba072e..869b1cc9280d1 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py
@@ -16,15 +16,19 @@
import paddle
-from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler,
- DPMSolverSinglestepScheduler, UniPCMultistepScheduler)
+from ppdiffusers import (
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ UniPCMultistepScheduler,
+)
from .test_schedulers import SchedulerCommonTest
class DPMSolverMultistepSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (DPMSolverMultistepScheduler, )
- forward_default_kwargs = (("num_inference_steps", 25), )
+ scheduler_classes = (DPMSolverMultistepScheduler,)
+ forward_default_kwargs = (("num_inference_steps", 25),)
def get_scheduler_config(self, **kwargs):
config = {
@@ -49,38 +53,28 @@ def check_over_configs(self, time_step=0, **config):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config(**config)
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
output, new_output = sample, sample
- for t in range(time_step,
- time_step + scheduler.config.solver_order + 1):
- output = scheduler.step(residual, t, output,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, t, new_output,
- **kwargs).prev_sample
+ for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+ output = scheduler.step(residual, t, output, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_from_save_pretrained(self):
pass
@@ -90,9 +84,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config()
@@ -100,8 +92,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals (must be after setting timesteps)
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
@@ -110,18 +101,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residual (must be after setting timesteps)
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def full_loop(self, scheduler=None, **config):
if scheduler is None:
@@ -152,27 +137,20 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# copy over dummy past residuals (must be done after set_timesteps)
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
time_step_0 = scheduler.timesteps[5]
time_step_1 = scheduler.timesteps[6]
- output_0 = scheduler.step(residual, time_step_0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step(residual, time_step_1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
@@ -193,7 +171,8 @@ def test_thresholding(self):
sample_max_value=threshold,
algorithm_type="dpmsolver++",
solver_order=order,
- solver_type=solver_type, )
+ solver_type=solver_type,
+ )
def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
@@ -208,14 +187,15 @@ def test_solver_order_and_type(self):
solver_order=order,
solver_type=solver_type,
prediction_type=prediction_type,
- algorithm_type=algorithm_type, )
+ algorithm_type=algorithm_type,
+ )
sample = self.full_loop(
solver_order=order,
solver_type=solver_type,
prediction_type=prediction_type,
- algorithm_type=algorithm_type, )
- assert not paddle.isnan(sample).any(
- ), "Samples have nan numbers"
+ algorithm_type=algorithm_type,
+ )
+ assert not paddle.isnan(sample).any(), "Samples have nan numbers"
def test_lower_order_final(self):
self.check_over_configs(lower_order_final=True)
@@ -223,8 +203,7 @@ def test_lower_order_final(self):
def test_inference_steps(self):
for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
- self.check_over_forward(
- num_inference_steps=num_inference_steps, time_step=0)
+ self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
def test_full_loop_no_noise(self):
sample = self.full_loop()
@@ -233,10 +212,7 @@ def test_full_loop_no_noise(self):
assert abs(result_mean.item() - 0.3301) < 1e-3
def test_full_loop_no_noise_thres(self):
- sample = self.full_loop(
- thresholding=True,
- dynamic_thresholding_ratio=0.87,
- sample_max_value=0.5)
+ sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5)
result_mean = paddle.mean(paddle.abs(sample))
assert abs(result_mean.item() - 1.1364) < 1e-3
@@ -248,8 +224,7 @@ def test_full_loop_with_v_prediction(self):
assert abs(result_mean.item() - 0.2251) < 1e-3
def test_full_loop_with_karras_and_v_prediction(self):
- sample = self.full_loop(
- prediction_type="v_prediction", use_karras_sigmas=True)
+ sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
result_mean = paddle.mean(paddle.abs(sample))
assert abs(result_mean.item() - 0.2096) < 1e-3
@@ -275,8 +250,7 @@ def test_switch(self):
def test_fp16_support(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- thresholding=True, dynamic_thresholding_ratio=0)
+ scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
scheduler = scheduler_class(**scheduler_config)
num_inference_steps = 10
@@ -297,5 +271,4 @@ def test_unique_timesteps(self, **config):
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(scheduler.config.num_train_timesteps)
- assert len(scheduler.timesteps.unique(
- )) == scheduler.num_inference_steps
+ assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps
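
test_thresholding and test_full_loop_no_noise_thres above exercise dynamic thresholding: the predicted x0 is clipped to a per-sample quantile of its absolute values and rescaled. A simplified NumPy sketch of the idea; the exact clamping bounds ppdiffusers applies to the quantile may differ:

import numpy as np

def dynamic_threshold(x0, ratio=0.87, sample_max_value=0.5):
    flat = np.abs(x0).reshape(x0.shape[0], -1)
    s = np.quantile(flat, ratio, axis=1)          # per-sample scale
    s = np.maximum(s, sample_max_value)           # floor the scale (the library's exact clamp differs)
    s = s.reshape(-1, 1, 1, 1)
    return np.clip(x0, -s, s) / s

x0 = np.random.default_rng(0).normal(size=(1, 3, 8, 8))
print(np.abs(dynamic_threshold(x0)).max())  # always <= 1 after rescaling
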
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py
index bb702887ed40f..ce229323bc363 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py
@@ -16,15 +16,19 @@
import paddle
-from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler,
- DPMSolverSinglestepScheduler, UniPCMultistepScheduler)
+from ppdiffusers import (
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ UniPCMultistepScheduler,
+)
from .test_schedulers import SchedulerCommonTest
class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (DPMSolverSinglestepScheduler, )
- forward_default_kwargs = (("num_inference_steps", 25), )
+ scheduler_classes = (DPMSolverSinglestepScheduler,)
+ forward_default_kwargs = (("num_inference_steps", 25),)
def get_scheduler_config(self, **kwargs):
config = {
@@ -48,38 +52,28 @@ def check_over_configs(self, time_step=0, **config):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config(**config)
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
output, new_output = sample, sample
- for t in range(time_step,
- time_step + scheduler.config.solver_order + 1):
- output = scheduler.step(residual, t, output,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, t, new_output,
- **kwargs).prev_sample
+ for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+ output = scheduler.step(residual, t, output, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_from_save_pretrained(self):
pass
@@ -89,9 +83,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config()
@@ -99,8 +91,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals (must be after setting timesteps)
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
@@ -109,18 +100,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residual (must be after setting timesteps)
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def full_loop(self, scheduler=None, **config):
if scheduler is None:
@@ -178,7 +163,8 @@ def test_thresholding(self):
sample_max_value=threshold,
algorithm_type="dpmsolver++",
solver_order=order,
- solver_type=solver_type, )
+ solver_type=solver_type,
+ )
def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
@@ -193,14 +179,15 @@ def test_solver_order_and_type(self):
solver_order=order,
solver_type=solver_type,
prediction_type=prediction_type,
- algorithm_type=algorithm_type, )
+ algorithm_type=algorithm_type,
+ )
sample = self.full_loop(
solver_order=order,
solver_type=solver_type,
prediction_type=prediction_type,
- algorithm_type=algorithm_type, )
- assert not paddle.isnan(sample).any(
- ), "Samples have nan numbers"
+ algorithm_type=algorithm_type,
+ )
+ assert not paddle.isnan(sample).any(), "Samples have nan numbers"
def test_lower_order_final(self):
self.check_over_configs(lower_order_final=True)
@@ -208,8 +195,7 @@ def test_lower_order_final(self):
def test_inference_steps(self):
for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
- self.check_over_forward(
- num_inference_steps=num_inference_steps, time_step=0)
+ self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
def test_full_loop_no_noise(self):
sample = self.full_loop()
@@ -225,8 +211,7 @@ def test_full_loop_with_v_prediction(self):
def test_fp16_support(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- thresholding=True, dynamic_thresholding_ratio=0)
+ scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
scheduler = scheduler_class(**scheduler_config)
num_inference_steps = 10
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler.py b/ppdiffusers/tests/schedulers/test_scheduler_euler.py
index bdca25bba1cb3..d6cfc9fe4474b 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_euler.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_euler.py
@@ -20,7 +20,7 @@
class EulerDiscreteSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (EulerDiscreteScheduler, )
+ scheduler_classes = (EulerDiscreteScheduler,)
num_inference_steps = 10
def get_scheduler_config(self, **kwargs):
@@ -39,8 +39,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.00001, 0.0001, 0.001],
- [0.0002, 0.002, 0.02]):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -68,8 +67,7 @@ def test_full_loop_no_noise(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -80,8 +78,7 @@ def test_full_loop_no_noise(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(self.num_inference_steps)
@@ -96,8 +93,7 @@ def test_full_loop_with_v_prediction(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -123,8 +119,7 @@ def test_full_loop_device(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -150,8 +145,7 @@ def test_full_loop_device_karras_sigmas(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
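
test_full_loop_device_karras_sigmas above enables use_karras_sigmas=True, which replaces the default sigma spacing with the rho-7 schedule from Karras et al. (2022). A short sketch of that spacing (the sigma_min/sigma_max values below are placeholders):

import numpy as np

def karras_sigmas(sigma_min, sigma_max, n, rho=7.0):
    ramp = np.linspace(0, 1, n)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

print(karras_sigmas(0.1, 10.0, 5))  # densely spaced near sigma_min, sparse near sigma_max
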
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py
index cb2d308947d3b..fdc7f2a34f30f 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py
@@ -20,7 +20,7 @@
class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (EulerAncestralDiscreteScheduler, )
+ scheduler_classes = (EulerAncestralDiscreteScheduler,)
num_inference_steps = 10
def get_scheduler_config(self, **kwargs):
@@ -39,8 +39,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.00001, 0.0001, 0.001],
- [0.0002, 0.002, 0.02]):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -68,8 +67,7 @@ def test_full_loop_no_noise(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -80,8 +78,7 @@ def test_full_loop_no_noise(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(self.num_inference_steps)
@@ -96,8 +93,7 @@ def test_full_loop_with_v_prediction(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -122,8 +118,7 @@ def test_full_loop_device(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_heun.py b/ppdiffusers/tests/schedulers/test_scheduler_heun.py
index b8223700592bb..0f62ae519f4e0 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_heun.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_heun.py
@@ -20,7 +20,7 @@
class HeunDiscreteSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (HeunDiscreteScheduler, )
+ scheduler_classes = (HeunDiscreteScheduler,)
num_inference_steps = 10
def get_scheduler_config(self, **kwargs):
@@ -39,8 +39,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.00001, 0.0001, 0.001],
- [0.0002, 0.002, 0.02]):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -78,8 +77,7 @@ def test_full_loop_no_noise(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(self.num_inference_steps)
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py
index 39558436871af..c282c6a61079b 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py
@@ -22,8 +22,8 @@
class IPNDMSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (IPNDMScheduler, )
- forward_default_kwargs = (("num_inference_steps", 50), )
+ scheduler_classes = (IPNDMScheduler,)
+ forward_default_kwargs = (("num_inference_steps", 50),)
def get_scheduler_config(self, **kwargs):
config = {"num_train_timesteps": 1000}
@@ -59,21 +59,15 @@ def check_over_configs(self, time_step=0, **config):
# copy over dummy past residuals
new_scheduler.ets = dummy_past_residuals[:]
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_from_save_pretrained(self):
pass
@@ -110,21 +104,15 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
# copy over dummy past residual (must be after setting timesteps)
new_scheduler.ets = dummy_past_residuals[:]
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def full_loop(self, **config):
scheduler_class = self.scheduler_classes[0]
@@ -158,11 +146,9 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# copy over dummy past residuals (must be done after set_timesteps)
@@ -177,31 +163,25 @@ def test_step_shape(self):
time_step_0 = scheduler.timesteps[5]
time_step_1 = scheduler.timesteps[6]
- output_0 = scheduler.step(residual, time_step_0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step(residual, time_step_1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
- output_0 = scheduler.step(residual, time_step_0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step(residual, time_step_1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
def test_timesteps(self):
for timesteps in [100, 1000]:
- self.check_over_configs(
- num_train_timesteps=timesteps, time_step=None)
+ self.check_over_configs(num_train_timesteps=timesteps, time_step=None)
def test_inference_steps(self):
for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]):
- self.check_over_forward(
- num_inference_steps=num_inference_steps, time_step=None)
+ self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None)
def test_full_loop_no_noise(self):
sample = self.full_loop()
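
The hunks above all reformat the same consistency check: save a scheduler's config, reload it with from_pretrained, step both instances, and require the outputs to agree. A minimal, self-contained sketch of that pattern follows; it is not part of the patch, and DDIMScheduler plus the random sample are stand-ins for the real test fixtures.

import tempfile

import paddle
from ppdiffusers import DDIMScheduler

scheduler = DDIMScheduler()
scheduler.set_timesteps(10)

# Dummy model output and sample in place of the test's fixtures.
sample = paddle.rand([1, 3, 8, 8])
residual = 0.1 * sample

with tempfile.TemporaryDirectory() as tmpdirname:
    scheduler.save_config(tmpdirname)
    new_scheduler = DDIMScheduler.from_pretrained(tmpdirname)
    new_scheduler.set_timesteps(10)

t = scheduler.timesteps[0]
output = scheduler.step(residual, t, sample).prev_sample
new_output = new_scheduler.step(residual, t, sample).prev_sample

# Reloading from the saved config must not change the scheduler's behaviour.
assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
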
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py
index 4081289cebb20..770b4f226ba5c 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py
@@ -20,7 +20,7 @@
class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (KDPM2AncestralDiscreteScheduler, )
+ scheduler_classes = (KDPM2AncestralDiscreteScheduler,)
num_inference_steps = 10
def get_scheduler_config(self, **kwargs):
@@ -39,8 +39,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.00001, 0.0001, 0.001],
- [0.0002, 0.002, 0.02]):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -65,8 +64,7 @@ def test_full_loop_no_noise(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -82,8 +80,7 @@ def test_prediction_type(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(self.num_inference_steps)
@@ -98,8 +95,7 @@ def test_full_loop_with_v_prediction(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
@@ -125,8 +121,7 @@ def test_full_loop_device(self):
model_output = model(sample, t)
- output = scheduler.step(
- model_output, t, sample, generator=generator)
+ output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample
result_sum = paddle.sum(paddle.abs(sample))
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py
index ee87c662588d7..3da7b7e75fd44 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py
@@ -20,7 +20,7 @@
class KDPM2DiscreteSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (KDPM2DiscreteScheduler, )
+ scheduler_classes = (KDPM2DiscreteScheduler,)
num_inference_steps = 10
def get_scheduler_config(self, **kwargs):
@@ -39,8 +39,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.00001, 0.0001, 0.001],
- [0.0002, 0.002, 0.02]):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -53,8 +52,7 @@ def test_prediction_type(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(self.num_inference_steps)
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_lms.py b/ppdiffusers/tests/schedulers/test_scheduler_lms.py
index 0be32200e94c8..8ee87bbddf624 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_lms.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_lms.py
@@ -20,7 +20,7 @@
class LMSDiscreteSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (LMSDiscreteScheduler, )
+ scheduler_classes = (LMSDiscreteScheduler,)
num_inference_steps = 10
def get_scheduler_config(self, **kwargs):
@@ -39,8 +39,7 @@ def test_timesteps(self):
self.check_over_configs(num_train_timesteps=timesteps)
def test_betas(self):
- for beta_start, beta_end in zip([0.00001, 0.0001, 0.001],
- [0.0002, 0.002, 0.02]):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
def test_schedules(self):
@@ -81,8 +80,7 @@ def test_full_loop_no_noise(self):
def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- prediction_type="v_prediction")
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(self.num_inference_steps)
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py
index ab94b8ffca3f3..ad2998c26bfd9 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py
@@ -22,8 +22,8 @@
class PNDMSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (PNDMScheduler, )
- forward_default_kwargs = (("num_inference_steps", 50), )
+ scheduler_classes = (PNDMScheduler,)
+ forward_default_kwargs = (("num_inference_steps", 50),)
def get_scheduler_config(self, **kwargs):
config = {
@@ -62,21 +62,15 @@ def check_over_configs(self, time_step=0, **config):
# copy over dummy past residuals
new_scheduler.ets = dummy_past_residuals[:]
- output = scheduler.step_prk(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step_prk(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
- output = scheduler.step_plms(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step_plms(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_from_save_pretrained(self):
pass
@@ -110,21 +104,15 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
# copy over dummy past residual (must be after setting timesteps)
new_scheduler.ets = dummy_past_residuals[:]
- output = scheduler.step_prk(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step_prk(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
- output = scheduler.step_plms(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step_plms(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def full_loop(self, **config):
scheduler_class = self.scheduler_classes[0]
@@ -158,11 +146,9 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# copy over dummy past residuals (must be done after set_timesteps)
@@ -174,18 +160,14 @@ def test_step_shape(self):
]
scheduler.ets = dummy_past_residuals[:]
- output_0 = scheduler.step_prk(residual, 0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step_prk(residual, 1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
- output_0 = scheduler.step_plms(residual, 0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step_plms(residual, 1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
@@ -204,27 +186,30 @@ def test_steps_offset(self):
scheduler.set_timesteps(10)
assert paddle.equal_all(
scheduler.timesteps,
- paddle.to_tensor([
- 901,
- 851,
- 851,
- 801,
- 801,
- 751,
- 751,
- 701,
- 701,
- 651,
- 651,
- 601,
- 601,
- 501,
- 401,
- 301,
- 201,
- 101,
- 1,
- ]), )
+ paddle.to_tensor(
+ [
+ 901,
+ 851,
+ 851,
+ 801,
+ 801,
+ 751,
+ 751,
+ 701,
+ 701,
+ 651,
+ 651,
+ 601,
+ 601,
+ 501,
+ 401,
+ 301,
+ 201,
+ 101,
+ 1,
+ ]
+ ),
+ )
def test_betas(self):
for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]):
@@ -269,8 +254,7 @@ def test_inference_plms_no_past_residuals(self):
scheduler_config = self.get_scheduler_config()
scheduler = scheduler_class(**scheduler_config)
- scheduler.step_plms(self.dummy_sample, 1,
- self.dummy_sample).prev_sample
+ scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample
def test_full_loop_no_noise(self):
sample = self.full_loop()
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py
index 3c2c1cd8ac641..ac15c502eda8d 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py
@@ -23,7 +23,7 @@
class ScoreSdeVeSchedulerTest(unittest.TestCase):
# TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration)
- scheduler_classes = (ScoreSdeVeScheduler, )
+ scheduler_classes = (ScoreSdeVeScheduler,)
forward_default_kwargs = ()
@property
@@ -85,34 +85,22 @@ def check_over_configs(self, time_step=0, **config):
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
output = scheduler.step_pred(
- residual,
- time_step,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
new_output = new_scheduler.step_pred(
- residual,
- time_step,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
output = scheduler.step_correct(
- residual,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
new_output = new_scheduler.step_correct(
- residual,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler correction are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical"
def check_over_forward(self, time_step=0, **forward_kwargs):
kwargs = dict(self.forward_default_kwargs)
@@ -130,34 +118,22 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
output = scheduler.step_pred(
- residual,
- time_step,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
new_output = new_scheduler.step_pred(
- residual,
- time_step,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
output = scheduler.step_correct(
- residual,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
new_output = new_scheduler.step_correct(
- residual,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler correction are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical"
def test_timesteps(self):
for timesteps in [10, 100, 1000]:
@@ -193,15 +169,12 @@ def test_full_loop_no_noise(self):
for _ in range(scheduler.config.correct_steps):
with paddle.no_grad():
model_output = model(sample, sigma_t)
- sample = scheduler.step_correct(
- model_output, sample, generator=generator,
- **kwargs).prev_sample
+ sample = scheduler.step_correct(model_output, sample, generator=generator, **kwargs).prev_sample
with paddle.no_grad():
model_output = model(sample, sigma_t)
- output = scheduler.step_pred(
- model_output, t, sample, generator=generator, **kwargs)
+ output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs)
sample, _ = output.prev_sample, output.prev_sample_mean
result_sum = paddle.sum(paddle.abs(sample))
@@ -222,25 +195,17 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
output_0 = scheduler.step_pred(
- residual,
- 0,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, 0, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
output_1 = scheduler.step_pred(
- residual,
- 1,
- sample,
- generator=paddle.Generator().manual_seed(0),
- **kwargs).prev_sample
+ residual, 1, sample, generator=paddle.Generator().manual_seed(0), **kwargs
+ ).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py
index 5ac931e6abef5..b37fa2c513271 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py
@@ -21,7 +21,7 @@
# UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration.
class UnCLIPSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (UnCLIPScheduler, )
+ scheduler_classes = (UnCLIPScheduler,)
def get_scheduler_config(self, **kwargs):
config = {
@@ -61,36 +61,27 @@ def test_time_indices(self):
if prev_timestep is not None and prev_timestep >= time_step:
continue
- self.check_over_forward(
- time_step=time_step, prev_timestep=prev_timestep)
+ self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep)
def test_variance_fixed_small_log(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- variance_type="fixed_small_log")
+ scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log")
scheduler = scheduler_class(**scheduler_config)
- assert paddle.sum(paddle.abs(scheduler._get_variance(0) -
- 1.0000e-10)) < 1e-5
- assert paddle.sum(
- paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5
- assert paddle.sum(
- paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5
+ assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5
def test_variance_learned_range(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- variance_type="learned_range")
+ scheduler_config = self.get_scheduler_config(variance_type="learned_range")
scheduler = scheduler_class(**scheduler_config)
predicted_variance = 0.5
- assert (scheduler._get_variance(
- 1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5)
- assert (scheduler._get_variance(
- 487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5)
- assert (scheduler._get_variance(
- 999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5)
+ assert scheduler._get_variance(1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5
+ assert scheduler._get_variance(487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5
+ assert scheduler._get_variance(999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5
def test_full_loop(self):
scheduler_class = self.scheduler_classes[0]
@@ -108,8 +99,7 @@ def test_full_loop(self):
residual = model(sample, t)
# 2. predict previous mean of sample x_t-1
- pred_prev_sample = scheduler.step(
- residual, t, sample, generator=generator).prev_sample
+ pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample
sample = pred_prev_sample
@@ -143,11 +133,8 @@ def test_full_loop_skip_timesteps(self):
# 2. predict previous mean of sample x_t-1
pred_prev_sample = scheduler.step(
- residual,
- t,
- sample,
- prev_timestep=prev_timestep,
- generator=generator).prev_sample
+ residual, t, sample, prev_timestep=prev_timestep, generator=generator
+ ).prev_sample
sample = pred_prev_sample
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py
index 7d28f06cd5fb7..0c19a3bb8387a 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py
@@ -16,15 +16,19 @@
import paddle
-from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler,
- DPMSolverSinglestepScheduler, UniPCMultistepScheduler)
+from ppdiffusers import (
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ UniPCMultistepScheduler,
+)
from .test_schedulers import SchedulerCommonTest
class UniPCMultistepSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (UniPCMultistepScheduler, )
- forward_default_kwargs = (("num_inference_steps", 25), )
+ scheduler_classes = (UniPCMultistepScheduler,)
+ forward_default_kwargs = (("num_inference_steps", 25),)
def get_scheduler_config(self, **kwargs):
config = {
@@ -44,47 +48,35 @@ def check_over_configs(self, time_step=0, **config):
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config(**config)
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
output, new_output = sample, sample
- for t in range(time_step,
- time_step + scheduler.config.solver_order + 1):
- output = scheduler.step(residual, t, output,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, t, new_output,
- **kwargs).prev_sample
+ for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+ output = scheduler.step(residual, t, output, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def check_over_forward(self, time_step=0, **forward_kwargs):
kwargs = dict(self.forward_default_kwargs)
num_inference_steps = kwargs.pop("num_inference_steps", None)
sample = self.dummy_sample
residual = 0.1 * sample
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
for scheduler_class in self.scheduler_classes:
scheduler_config = self.get_scheduler_config()
@@ -92,8 +84,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residuals (must be after setting timesteps)
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_config(tmpdirname)
@@ -102,18 +93,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
new_scheduler.set_timesteps(num_inference_steps)
# copy over dummy past residual (must be after setting timesteps)
- new_scheduler.model_outputs = dummy_past_residuals[:
- new_scheduler.
- config.
- solver_order]
+ new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def full_loop(self, scheduler=None, **config):
if scheduler is None:
@@ -148,27 +133,20 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# copy over dummy past residuals (must be done after set_timesteps)
- dummy_past_residuals = [
- residual + 0.2, residual + 0.15, residual + 0.10
- ]
- scheduler.model_outputs = dummy_past_residuals[:scheduler.config.
- solver_order]
+ dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+ scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
time_step_0 = scheduler.timesteps[5]
time_step_1 = scheduler.timesteps[6]
- output_0 = scheduler.step(residual, time_step_0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step(residual, time_step_1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
@@ -207,7 +185,8 @@ def test_thresholding(self):
prediction_type=prediction_type,
sample_max_value=threshold,
solver_order=order,
- solver_type=solver_type, )
+ solver_type=solver_type,
+ )
def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
@@ -220,13 +199,14 @@ def test_solver_order_and_type(self):
self.check_over_configs(
solver_order=order,
solver_type=solver_type,
- prediction_type=prediction_type, )
+ prediction_type=prediction_type,
+ )
sample = self.full_loop(
solver_order=order,
solver_type=solver_type,
- prediction_type=prediction_type, )
- assert not paddle.isnan(sample).any(
- ), "Samples have nan numbers"
+ prediction_type=prediction_type,
+ )
+ assert not paddle.isnan(sample).any(), "Samples have nan numbers"
def test_lower_order_final(self):
self.check_over_configs(lower_order_final=True)
@@ -234,8 +214,7 @@ def test_lower_order_final(self):
def test_inference_steps(self):
for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
- self.check_over_forward(
- num_inference_steps=num_inference_steps, time_step=0)
+ self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
def test_full_loop_no_noise(self):
sample = self.full_loop()
@@ -251,8 +230,7 @@ def test_full_loop_with_v_prediction(self):
def test_fp16_support(self):
scheduler_class = self.scheduler_classes[0]
- scheduler_config = self.get_scheduler_config(
- thresholding=True, dynamic_thresholding_ratio=0)
+ scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
scheduler = scheduler_class(**scheduler_config)
num_inference_steps = 10
@@ -272,5 +250,4 @@ def test_unique_timesteps(self, **config):
scheduler = scheduler_class(**scheduler_config)
scheduler.set_timesteps(scheduler.config.num_train_timesteps)
- assert len(scheduler.timesteps.unique(
- )) == scheduler.num_inference_steps
+ assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py
index 81ed3de4a1062..c40e7834d682f 100644
--- a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py
+++ b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py
@@ -21,7 +21,7 @@
class VQDiffusionSchedulerTest(SchedulerCommonTest):
- scheduler_classes = (VQDiffusionScheduler, )
+ scheduler_classes = (VQDiffusionScheduler,)
def get_scheduler_config(self, **kwargs):
config = {
@@ -37,8 +37,7 @@ def dummy_sample(self, num_vec_classes):
height = 8
width = 8
- sample = paddle.randint(0, num_vec_classes,
- (batch_size, height * width))
+ sample = paddle.randint(0, num_vec_classes, (batch_size, height * width))
return sample
@@ -49,10 +48,8 @@ def dummy_sample_deter(self):
def dummy_model(self, num_vec_classes):
def model(sample, t, *args):
batch_size, num_latent_pixels = sample.shape
- logits = paddle.rand(
- (batch_size, num_vec_classes - 1, num_latent_pixels))
- return_value = F.log_softmax(
- logits.cast("float64"), axis=1).cast("float32")
+ logits = paddle.rand((batch_size, num_vec_classes - 1, num_latent_pixels))
+ return_value = F.log_softmax(logits.cast("float64"), axis=1).cast("float32")
return return_value
return model
diff --git a/ppdiffusers/tests/schedulers/test_schedulers.py b/ppdiffusers/tests/schedulers/test_schedulers.py
index f01069d246e6a..92b11a679f661 100755
--- a/ppdiffusers/tests/schedulers/test_schedulers.py
+++ b/ppdiffusers/tests/schedulers/test_schedulers.py
@@ -24,9 +24,14 @@
import paddle
import ppdiffusers
-from ppdiffusers import (EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler, IPNDMScheduler,
- LMSDiscreteScheduler, VQDiffusionScheduler, logging)
+from ppdiffusers import (
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ IPNDMScheduler,
+ LMSDiscreteScheduler,
+ VQDiffusionScheduler,
+ logging,
+)
from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
from ppdiffusers.schedulers.scheduling_utils import SchedulerMixin
from ppdiffusers.utils.testing_utils import CaptureLogger
@@ -37,12 +42,13 @@ class SchedulerObject(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- a=2,
- b=5,
- c=(2, 5),
- d="for diffusion",
- e=[1, 3], ):
+ self,
+ a=2,
+ b=5,
+ c=(2, 5),
+ d="for diffusion",
+ e=[1, 3],
+ ):
pass
@@ -51,12 +57,13 @@ class SchedulerObject2(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- a=2,
- b=5,
- c=(2, 5),
- d="for diffusion",
- f=[1, 3], ):
+ self,
+ a=2,
+ b=5,
+ c=(2, 5),
+ d="for diffusion",
+ f=[1, 3],
+ ):
pass
@@ -65,13 +72,14 @@ class SchedulerObject3(SchedulerMixin, ConfigMixin):
@register_to_config
def __init__(
- self,
- a=2,
- b=5,
- c=(2, 5),
- d="for diffusion",
- e=[1, 3],
- f=[1, 3], ):
+ self,
+ a=2,
+ b=5,
+ c=(2, 5),
+ d="for diffusion",
+ e=[1, 3],
+ f=[1, 3],
+ ):
pass
@@ -90,15 +98,11 @@ def test_save_load_from_different_config(self):
new_obj_1 = SchedulerObject2.from_config(config)
# now save a config parameter that is not expected
- with open(
- os.path.join(tmpdirname, SchedulerObject.config_name),
- "r") as f:
+ with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f:
data = json.load(f)
data["unexpected"] = True
- with open(
- os.path.join(tmpdirname, SchedulerObject.config_name),
- "w") as f:
+ with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f:
json.dump(data, f)
with CaptureLogger(logger) as cap_logger_2:
@@ -115,12 +119,12 @@ def test_save_load_from_different_config(self):
assert cap_logger_1.out == ""
assert (
- cap_logger_2.out ==
- "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
+ cap_logger_2.out
+ == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
" will"
- " be ignored. Please verify your config.json configuration file.\n")
- assert (cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2")
- == cap_logger_3.out)
+ " be ignored. Please verify your config.json configuration file.\n"
+ )
+ assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out
def test_save_load_compatible_schedulers(self):
SchedulerObject2._compatibles = ["SchedulerObject"]
@@ -137,16 +141,12 @@ def test_save_load_compatible_schedulers(self):
obj.save_config(tmpdirname)
# now save a config parameter that is expected by another class, but not origin class
- with open(
- os.path.join(tmpdirname, SchedulerObject.config_name),
- "r") as f:
+ with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f:
data = json.load(f)
data["f"] = [0, 0]
data["unexpected"] = True
- with open(
- os.path.join(tmpdirname, SchedulerObject.config_name),
- "w") as f:
+ with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f:
json.dump(data, f)
with CaptureLogger(logger) as cap_logger:
@@ -156,10 +156,11 @@ def test_save_load_compatible_schedulers(self):
assert new_obj.__class__ == SchedulerObject
assert (
- cap_logger.out ==
- "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
+ cap_logger.out
+ == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
" will"
- " be ignored. Please verify your config.json configuration file.\n")
+ " be ignored. Please verify your config.json configuration file.\n"
+ )
def test_save_load_from_different_config_comp_schedulers(self):
SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"]
@@ -195,14 +196,8 @@ def test_save_load_from_different_config_comp_schedulers(self):
assert new_obj_3.__class__ == SchedulerObject3
assert cap_logger_1.out == ""
- assert (
- cap_logger_2.out ==
- "{'f'} was not found in config. Values will be initialized to default values.\n"
- )
- assert (
- cap_logger_3.out ==
- "{'f'} was not found in config. Values will be initialized to default values.\n"
- )
+ assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+ assert cap_logger_3.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
class SchedulerCommonTest(unittest.TestCase):
@@ -252,9 +247,10 @@ def check_over_configs(self, time_step=0, **config):
for scheduler_class in self.scheduler_classes:
# TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default
if scheduler_class in (
- EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler,
- LMSDiscreteScheduler, ):
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ ):
time_step = float(time_step)
scheduler_config = self.get_scheduler_config(**config)
@@ -273,12 +269,10 @@ def check_over_configs(self, time_step=0, **config):
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
new_scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# Make sure `scale_model_input` is invoked to prevent a warning
@@ -287,20 +281,15 @@ def check_over_configs(self, time_step=0, **config):
_ = new_scheduler.scale_model_input(sample, 0)
# Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def check_over_forward(self, time_step=0, **forward_kwargs):
kwargs = dict(self.forward_default_kwargs)
@@ -310,9 +299,10 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
for scheduler_class in self.scheduler_classes:
if scheduler_class in (
- EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler,
- LMSDiscreteScheduler, ):
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ ):
time_step = float(time_step)
scheduler_config = self.get_scheduler_config()
@@ -331,28 +321,21 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
new_scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- output = scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- new_output = new_scheduler.step(residual, time_step, sample,
- **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_from_save_pretrained(self):
kwargs = dict(self.forward_default_kwargs)
@@ -362,9 +345,10 @@ def test_from_save_pretrained(self):
for scheduler_class in self.scheduler_classes:
timestep = 1
if scheduler_class in (
- EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler,
- LMSDiscreteScheduler, ):
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ ):
timestep = float(timestep)
scheduler_config = self.get_scheduler_config()
@@ -383,28 +367,21 @@ def test_from_save_pretrained(self):
scheduler.save_config(tmpdirname)
new_scheduler = scheduler_class.from_pretrained(tmpdirname)
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
new_scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- output = scheduler.step(residual, timestep, sample,
- **kwargs).prev_sample
+ output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- new_output = new_scheduler.step(residual, timestep, sample,
- **kwargs).prev_sample
+ new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample
- assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5
- ), "Scheduler outputs are not identical"
+ assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
def test_compatibles(self):
for scheduler_class in self.scheduler_classes:
@@ -415,31 +392,20 @@ def test_compatibles(self):
assert all(c is not None for c in scheduler.compatibles)
for comp_scheduler_cls in scheduler.compatibles:
- comp_scheduler = comp_scheduler_cls.from_config(
- scheduler.config)
+ comp_scheduler = comp_scheduler_cls.from_config(scheduler.config)
assert comp_scheduler is not None
new_scheduler = scheduler_class.from_config(comp_scheduler.config)
- new_scheduler_config = {
- k: v
- for k, v in new_scheduler.config.items()
- if k in scheduler.config
- }
- scheduler_diff = {
- k: v
- for k, v in new_scheduler.config.items()
- if k not in scheduler.config
- }
+ new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config}
+ scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config}
# make sure that configs are essentially identical
assert new_scheduler_config == dict(scheduler.config)
# make sure that only differences are for configs that are not in init
- init_keys = inspect.signature(
- scheduler_class.__init__).parameters.keys()
- assert set(scheduler_diff.keys()).intersection(set(
- init_keys)) == set()
+ init_keys = inspect.signature(scheduler_class.__init__).parameters.keys()
+ assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set()
def test_from_pretrained(self):
for scheduler_class in self.scheduler_classes:
@@ -463,9 +429,10 @@ def test_step_shape(self):
for scheduler_class in self.scheduler_classes:
if scheduler_class in (
- EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler,
- LMSDiscreteScheduler, ):
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ ):
timestep_0 = float(timestep_0)
timestep_1 = float(timestep_1)
@@ -481,17 +448,13 @@ def test_step_shape(self):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
- output_0 = scheduler.step(residual, timestep_0, sample,
- **kwargs).prev_sample
- output_1 = scheduler.step(residual, timestep_1, sample,
- **kwargs).prev_sample
+ output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample
+ output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample
self.assertEqual(output_0.shape, sample.shape)
self.assertEqual(output_0.shape, output_1.shape)
@@ -504,12 +467,10 @@ def set_nan_tensor_to_zero(t):
def recursive_check(tuple_object, dict_object):
if isinstance(tuple_object, (List, Tuple)):
- for tuple_iterable_value, dict_iterable_value in zip(
- tuple_object, dict_object.values()):
+ for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
recursive_check(tuple_iterable_value, dict_iterable_value)
elif isinstance(tuple_object, Dict):
- for tuple_iterable_value, dict_iterable_value in zip(
- tuple_object.values(), dict_object.values()):
+ for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
recursive_check(tuple_iterable_value, dict_iterable_value)
elif tuple_object is None:
return
@@ -518,27 +479,29 @@ def recursive_check(tuple_object, dict_object):
paddle.allclose(
set_nan_tensor_to_zero(tuple_object).cast("float32"),
set_nan_tensor_to_zero(dict_object).cast("float32"),
- atol=1e-5, ),
+ atol=1e-5,
+ ),
msg=(
"Tuple and dict output are not equal. Difference:"
f" {paddle.max(paddle.abs(tuple_object - dict_object))}. Tuple has `nan`:"
f" {paddle.isnan(tuple_object).any()} and `inf`: {paddle.isinf(tuple_object)}. Dict has"
f" `nan`: {paddle.isnan(dict_object).any()} and `inf`: {paddle.isinf(dict_object)}."
- ), )
+ ),
+ )
kwargs = dict(self.forward_default_kwargs)
num_inference_steps = kwargs.pop("num_inference_steps", 50)
timestep = 0
- if (len(self.scheduler_classes) > 0 and
- self.scheduler_classes[0] == IPNDMScheduler):
+ if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler:
timestep = 1
for scheduler_class in self.scheduler_classes:
if scheduler_class in (
- EulerAncestralDiscreteScheduler,
- EulerDiscreteScheduler,
- LMSDiscreteScheduler, ):
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ ):
timestep = float(timestep)
scheduler_config = self.get_scheduler_config()
@@ -553,32 +516,25 @@ def recursive_check(tuple_object, dict_object):
sample = self.dummy_sample
residual = 0.1 * sample
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
outputs_dict = scheduler.step(residual, timestep, sample, **kwargs)
- if num_inference_steps is not None and hasattr(scheduler,
- "set_timesteps"):
+ if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
scheduler.set_timesteps(num_inference_steps)
- elif num_inference_steps is not None and not hasattr(
- scheduler, "set_timesteps"):
+ elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
kwargs["num_inference_steps"] = num_inference_steps
# Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
- if "generator" in set(
- inspect.signature(scheduler.step).parameters.keys()):
+ if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
kwargs["generator"] = paddle.Generator().manual_seed(0)
- outputs_tuple = scheduler.step(
- residual, timestep, sample, return_dict=False, **kwargs)
+ outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs)
recursive_check(outputs_tuple, outputs_dict)
@@ -594,8 +550,11 @@ def test_scheduler_public_api(self):
)
self.assertTrue(
hasattr(scheduler, "scale_model_input"),
- (f"{scheduler_class} does not implement a required class method `scale_model_input(sample,"
- " timestep)`"), )
+ (
+ f"{scheduler_class} does not implement a required class method `scale_model_input(sample,"
+ " timestep)`"
+ ),
+ )
self.assertTrue(
hasattr(scheduler, "step"),
f"{scheduler_class} does not implement a required class method `step(...)`",
@@ -625,9 +584,7 @@ def test_add_noise_device(self):
def test_deprecated_kwargs(self):
for scheduler_class in self.scheduler_classes:
- has_kwarg_in_model_class = (
- "kwargs" in
- inspect.signature(scheduler_class.__init__).parameters)
+ has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
if has_kwarg_in_model_class and not has_deprecated_kwarg:
@@ -635,7 +592,8 @@ def test_deprecated_kwargs(self):
f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
" kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
" there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
- " []`")
+ " []`"
+ )
if not has_kwarg_in_model_class and has_deprecated_kwarg:
raise ValueError(
@@ -651,8 +609,7 @@ def test_trained_betas(self):
continue
scheduler_config = self.get_scheduler_config()
- scheduler = scheduler_class(
- **scheduler_config, trained_betas=np.array([0.1, 0.3]))
+ scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3]))
with tempfile.TemporaryDirectory() as tmpdirname:
scheduler.save_pretrained(tmpdirname)
@@ -680,8 +637,7 @@ def test_getattr_is_correct(self):
# no warning should be thrown
assert cap_logger.out == ""
- logger = logging.get_logger(
- "ppdiffusers.schedulers.schedulering_utils")
+ logger = logging.get_logger("ppdiffusers.schedulers.schedulering_utils")
# 30 for warning
logger.setLevel(30)
with CaptureLogger(logger) as cap_logger:
@@ -703,7 +659,4 @@ def test_getattr_is_correct(self):
with self.assertRaises(AttributeError) as error:
scheduler.does_not_exist
- assert (
- str(error.exception) ==
- f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'"
- )
+ assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000..4b868b99b22f9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,23 @@
+[tool.isort]
+profile = 'black'
+known_third_party = ["paddle"]
+
+[tool.black]
+line-length = 119
+target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310']
+exclude = ['.flake8']
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+pythonpath = ["."]
+testpaths = [
+ # "tests/models",
+]
+python_files = [
+ "test.py",
+ "test_*.py"
+]
+filterwarnings = [
+ "ignore::UserWarning",
+ 'ignore::DeprecationWarning',
+]
\ No newline at end of file
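
The new pyproject.toml above is what drives the formatting changes throughout this diff: isort runs with the black profile and black wraps at 119 columns. The sketch below is illustrative only; it assumes the standard isort>=5 and black Python APIs and uses a sample import string modelled on the hunks, to show how those settings map onto the style of the "+" lines.

import black
import isort

src = (
    "from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler,\n"
    "                         DPMSolverSinglestepScheduler, UniPCMultistepScheduler)\n"
)

# profile='black' makes isort emit the parenthesized, one-import-per-line layout.
sorted_src = isort.code(src, profile="black", line_length=119)

# black at line-length 119 then normalizes wrapping and trailing commas.
formatted = black.format_str(sorted_src, mode=black.Mode(line_length=119))
print(formatted)
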
diff --git a/setup.py b/setup.py
index 2578b4bad4f96..0074ba09ce033 100644
--- a/setup.py
+++ b/setup.py
@@ -46,8 +46,7 @@ def read_requirements():
setup(
name="paddlemix",
- packages=(find_packages() + find_packages(
- where="./ppdiffusers", exclude=["tests", "tests.*"])),
+ packages=(find_packages() + find_packages(where="./ppdiffusers", exclude=["tests", "tests.*"])),
package_dir={
"": ".",
"ppdiffusers": "./ppdiffusers/ppdiffusers",
@@ -62,10 +61,7 @@ def read_requirements():
keywords=["paddle", "paddlemix"],
install_requires=REQUIRED_PACKAGES,
python_requires=">=3.6",
- entry_points={
- "console_scripts":
- ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]
- },
+ entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]},
classifiers=[
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
@@ -75,4 +71,5 @@ def read_requirements():
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
],
- license="Apache 2.0", )
+ license="Apache 2.0",
+)
diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py
index 4193ca0f0a0de..d11db96722581 100644
--- a/tests/models/test_blip2.py
+++ b/tests/models/test_blip2.py
@@ -21,49 +21,54 @@
import numpy as np
import paddle
import paddle.nn as nn
-import requests
from paddlenlp.transformers.opt.configuration import OPTConfig
-from PIL import Image
-from paddlemix.models.blip2 import (Blip2Config, Blip2ForConditionalGeneration,
- Blip2QFormerConfig, Blip2VisionConfig)
+from paddlemix.models.blip2 import (
+ Blip2Config,
+ Blip2ForConditionalGeneration,
+ Blip2QFormerConfig,
+ Blip2VisionConfig,
+)
from paddlemix.models.blip2.eva_vit import VisionTransformer
-from paddlemix.models.blip2.modeling import \
- BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST
+from paddlemix.models.blip2.modeling import BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST
from paddlemix.models.blip2.Qformer import BertLMHeadModel
from tests.models.test_configuration_common import ConfigTester
from tests.models.test_modeling_common import (
- ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask)
+ ModelTesterMixin,
+ floats_tensor,
+ ids_tensor,
+ random_attention_mask,
+)
from tests.testing_utils import slow
def _config_zero_init(config):
configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys():
- if ("_range" in key or "_std" in key or "initializer_factor" in key or
- "layer_scale" in key):
+ if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
setattr(configs_no_init, key, 1e-10)
return configs_no_init
class Blip2VisionModelTester:
def __init__(
- self,
- parent,
- batch_size=12,
- image_size=30,
- patch_size=2,
- num_channels=3,
- is_training=True,
- hidden_size=1408,
- projection_dim=32,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=37,
- dropout=0.1,
- attention_dropout=0.1,
- initializer_range=1e-10,
- scope=None, ):
+ self,
+ parent,
+ batch_size=12,
+ image_size=30,
+ patch_size=2,
+ num_channels=3,
+ is_training=True,
+ hidden_size=1408,
+ projection_dim=32,
+ num_hidden_layers=5,
+ num_attention_heads=4,
+ intermediate_size=37,
+ dropout=0.1,
+ attention_dropout=0.1,
+ initializer_range=1e-10,
+ scope=None,
+ ):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
@@ -81,13 +86,11 @@ def __init__(
self.scope = scope
# in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
- num_patches = (image_size // patch_size)**2
+ num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches + 1
def prepare_config_and_inputs(self):
- pixel_values = floats_tensor([
- self.batch_size, self.num_channels, self.image_size, self.image_size
- ])
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
@@ -104,7 +107,8 @@ def get_config(self):
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
- initializer_range=self.initializer_range, )
+ initializer_range=self.initializer_range,
+ )
def create_and_check_model(self, config, pixel_values):
model = VisionTransformer(config=config)
@@ -114,13 +118,12 @@ def create_and_check_model(self, config, pixel_values):
# expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
image_size = (self.image_size, self.image_size)
patch_size = (self.patch_size, self.patch_size)
- num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
- patch_size[0])
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.parent.assertEqual(
result.last_hidden_state.shape,
- [self.batch_size, num_patches + 1, self.hidden_size], )
- self.parent.assertEqual(result.pooler_output.shape,
- [self.batch_size, self.hidden_size])
+ [self.batch_size, num_patches + 1, self.hidden_size],
+ )
+ self.parent.assertEqual(result.pooler_output.shape, [self.batch_size, self.hidden_size])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
@@ -135,7 +138,7 @@ class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase):
attention_mask and seq_length.
"""
- all_model_classes = (VisionTransformer, )
+ all_model_classes = (VisionTransformer,)
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
@@ -148,7 +151,8 @@ def setUp(self):
self,
config_class=Blip2VisionConfig,
has_text_modality=False,
- hidden_size=37, )
+ hidden_size=37,
+ )
def test_config(self):
self.config_tester.run_common_tests()
@@ -191,28 +195,29 @@ def test_model_from_pretrained(self):
class BertLMHeadModelTester:
def __init__(
- self,
- parent,
- batch_size=12,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=768,
- projection_dim=32,
- num_hidden_layers=6,
- num_attention_heads=4,
- intermediate_size=37,
- dropout=0.1,
- attention_dropout=0.1,
- max_position_embeddings=512,
- initializer_range=0.02,
- bos_token_id=0,
- scope=None,
- num_patches=257,
- encoder_hidden_size=1408,
- encoder_width=1408, ):
+ self,
+ parent,
+ batch_size=12,
+ seq_length=7,
+ is_training=True,
+ use_input_mask=True,
+ use_labels=True,
+ vocab_size=99,
+ hidden_size=768,
+ projection_dim=32,
+ num_hidden_layers=6,
+ num_attention_heads=4,
+ intermediate_size=37,
+ dropout=0.1,
+ attention_dropout=0.1,
+ max_position_embeddings=512,
+ initializer_range=0.02,
+ bos_token_id=0,
+ scope=None,
+ num_patches=257,
+ encoder_hidden_size=1408,
+ encoder_width=1408,
+ ):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
@@ -236,12 +241,9 @@ def __init__(
self.encoder_width = encoder_width
def prepare_config_and_inputs(self):
- query_embeds = floats_tensor(
- [self.batch_size, self.seq_length, self.hidden_size])
- encoder_hidden_states = floats_tensor(
- [self.batch_size, self.num_patches, self.encoder_hidden_size])
- encoder_attention_mask = random_attention_mask(
- [self.batch_size, self.num_patches])
+ query_embeds = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
+ encoder_hidden_states = floats_tensor([self.batch_size, self.num_patches, self.encoder_hidden_size])
+ encoder_attention_mask = random_attention_mask([self.batch_size, self.num_patches])
config = self.get_config()
return config, query_embeds, encoder_hidden_states, encoder_attention_mask
@@ -259,19 +261,21 @@ def get_config(self):
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
bos_token_id=self.bos_token_id,
- encoder_hidden_size=self.encoder_hidden_size, )
+ encoder_hidden_size=self.encoder_hidden_size,
+ )
- def create_and_check_model(self, config, query_embeds,
- encoder_hidden_states, encoder_attention_mask):
+ def create_and_check_model(self, config, query_embeds, encoder_hidden_states, encoder_attention_mask):
model = BertLMHeadModel(config=config, encoder_width=self.encoder_width)
model.eval()
result = model(
query_embeds=query_embeds,
encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask, )
+ encoder_attention_mask=encoder_attention_mask,
+ )
self.parent.assertEqual(
result.last_hidden_state.shape,
- [self.batch_size, self.seq_length, self.hidden_size], )
+ [self.batch_size, self.seq_length, self.hidden_size],
+ )
model = BertLMHeadModel(config=config)
model.eval()
@@ -279,11 +283,13 @@ def create_and_check_model(self, config, query_embeds,
result = model(
query_embeds,
encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask, )
+ encoder_attention_mask=encoder_attention_mask,
+ )
self.parent.assertEqual(
result.last_hidden_state.shape,
- [self.batch_size, self.seq_length, self.hidden_size], )
+ [self.batch_size, self.seq_length, self.hidden_size],
+ )
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
@@ -291,7 +297,8 @@ def prepare_config_and_inputs_for_common(self):
config,
query_embeds,
encoder_hidden_states,
- encoder_attention_mask, ) = config_and_inputs
+ encoder_attention_mask,
+ ) = config_and_inputs
inputs_dict = {
"query_embeds": query_embeds,
"encoder_hidden_states": encoder_hidden_states,
@@ -301,7 +308,7 @@ def prepare_config_and_inputs_for_common(self):
class BertLMHeadModelTest(ModelTesterMixin, unittest.TestCase):
- all_model_classes = (BertLMHeadModel, )
+ all_model_classes = (BertLMHeadModel,)
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
@@ -314,7 +321,8 @@ def setUp(self):
self,
config_class=Blip2QFormerConfig,
has_text_modality=False,
- hidden_size=37, )
+ hidden_size=37,
+ )
def test_config(self):
self.config_tester.run_common_tests()
@@ -337,28 +345,29 @@ def test_save_load(self):
class Blip2TextModelTester:
def __init__(
- self,
- parent,
- batch_size=12,
- seq_length=7,
- is_training=True,
- use_labels=False,
- vocab_size=99,
- hidden_size=16,
- num_hidden_layers=5,
- num_attention_heads=4,
- intermediate_size=4,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=20,
- eos_token_id=2,
- pad_token_id=1,
- bos_token_id=0,
- embed_dim=16,
- num_labels=3,
- word_embed_proj_dim=16,
- type_sequence_label_size=2, ):
+ self,
+ parent,
+ batch_size=12,
+ seq_length=7,
+ is_training=True,
+ use_labels=False,
+ vocab_size=99,
+ hidden_size=16,
+ num_hidden_layers=5,
+ num_attention_heads=4,
+ intermediate_size=4,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=20,
+ eos_token_id=2,
+ pad_token_id=1,
+ bos_token_id=0,
+ embed_dim=16,
+ num_labels=3,
+ word_embed_proj_dim=16,
+ type_sequence_label_size=2,
+ ):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
@@ -385,14 +394,12 @@ def __init__(
def prepare_config_and_inputs(self):
config = self.get_config()
- input_ids = ids_tensor(
- [self.batch_size, self.seq_length], self.vocab_size,
- dtype="int64").clip(3, )
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64").clip(
+ 3,
+ )
input_ids[:, -1] = self.eos_token_id # Eos Token
- attention_mask = input_ids.not_equal(
- paddle.to_tensor(
- [self.pad_token_id], dtype="int64")).cast("int64")
+ attention_mask = input_ids.not_equal(paddle.to_tensor([self.pad_token_id], dtype="int64")).cast("int64")
return config, input_ids, attention_mask
@@ -411,18 +418,20 @@ def get_config(self):
pad_token_id=self.pad_token_id,
embed_dim=self.embed_dim,
is_encoder_decoder=False,
- word_embed_proj_dim=self.word_embed_proj_dim, )
+ word_embed_proj_dim=self.word_embed_proj_dim,
+ )
class Blip2ModelTester:
def __init__(
- self,
- parent,
- vision_kwargs=None,
- qformer_kwargs=None,
- text_kwargs=None,
- is_training=True,
- num_query_tokens=10, ):
+ self,
+ parent,
+ vision_kwargs=None,
+ qformer_kwargs=None,
+ text_kwargs=None,
+ is_training=True,
+ num_query_tokens=10,
+ ):
if vision_kwargs is None:
vision_kwargs = {}
if qformer_kwargs is None:
@@ -431,10 +440,8 @@ def __init__(
text_kwargs = {}
self.parent = parent
- self.vision_model_tester = Blip2VisionModelTester(parent,
- **vision_kwargs)
- self.qformer_model_tester = BertLMHeadModelTester(parent,
- **qformer_kwargs)
+ self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
+ self.qformer_model_tester = BertLMHeadModelTester(parent, **qformer_kwargs)
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
self.is_training = is_training
self.num_query_tokens = num_query_tokens
@@ -456,16 +463,15 @@ def get_config(self):
vision_config=self.vision_model_tester.get_config(),
qformer_config=self.qformer_model_tester.get_config(),
text_config=self.text_model_tester.get_config(),
- num_query_tokens=self.num_query_tokens, )
+ num_query_tokens=self.num_query_tokens,
+ )
    @unittest.skip(reason="BLIP-2's output needs to be unified")
- def create_and_check_for_conditional_generation(
- self, config, input_ids, attention_mask, pixel_values):
+ def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values):
model = Blip2ForConditionalGeneration(config)
model.eval()
with paddle.no_grad():
- result = model(
- pixel_values, input_ids, attention_mask, return_dict=True)
+ result = model(pixel_values, input_ids, attention_mask, return_dict=True)
self.parent.assertEqual(
result.logits.shape,
@@ -473,7 +479,8 @@ def create_and_check_for_conditional_generation(
self.vision_model_tester.batch_size,
self.text_model_tester.seq_length + self.num_query_tokens,
self.text_model_tester.vocab_size,
- ], )
+ ],
+ )
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
@@ -481,7 +488,8 @@ def prepare_config_and_inputs_for_common(self):
config,
input_ids,
attention_mask,
- pixel_values, ) = config_and_inputs
+ pixel_values,
+ ) = config_and_inputs
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
@@ -491,7 +499,7 @@ def prepare_config_and_inputs_for_common(self):
class Blip2ModelTest(ModelTesterMixin, unittest.TestCase):
- all_model_classes = (Blip2ForConditionalGeneration, )
+ all_model_classes = (Blip2ForConditionalGeneration,)
fx_compatible = False
test_head_masking = False
test_pruning = False
@@ -505,16 +513,14 @@ def setUp(self):
def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_conditional_generation(
- *config_and_inputs)
+ self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
def test_determinism(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
def check_determinism(first, second):
out_1 = first.numpy()
@@ -551,22 +557,19 @@ def test_forward_signature(self):
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_load_vision_qformer_text_config(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Save Blip2Config and check if we can load Blip2VisionConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name)
- self.assertDictEqual(config.vision_config.to_dict(),
- vision_config.to_dict())
+ self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
# Save Blip2Config and check if we can load Blip2QFormerConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name)
- self.assertDictEqual(config.qformer_config.to_dict(),
- qformer_config.to_dict())
+ self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict())
@slow
def test_model_from_pretrained(self):
diff --git a/tests/models/test_configuration_common.py b/tests/models/test_configuration_common.py
index b014bbfe522ea..839941f706385 100644
--- a/tests/models/test_configuration_common.py
+++ b/tests/models/test_configuration_common.py
@@ -12,22 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import copy
import json
import os
import tempfile
-import unittest.mock as mock
-
-from paddlenlp.transformers.configuration_utils import PretrainedConfig
-from requests.exceptions import HTTPError
class ConfigTester(object):
- def __init__(self,
- parent,
- config_class=None,
- has_text_modality=True,
- **kwargs):
+ def __init__(self, parent, config_class=None, has_text_modality=True, **kwargs):
self.parent = parent
self.config_class = config_class
self.has_text_modality = has_text_modality
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 43ae283d9149d..226caf803f84a 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -18,9 +18,7 @@
import os
import random
import shutil
-import subprocess
import tempfile
-import time
import unittest
from typing import Optional, Tuple, Type
@@ -36,8 +34,7 @@
def _config_zero_init(config):
configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys():
- if ("_range" in key or "_std" in key or "initializer_factor" in key or
- "layer_scale" in key):
+ if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
setattr(configs_no_init, key, 1e-10)
return configs_no_init
@@ -64,11 +61,8 @@ def floats_tensor(shape, scale=1.0):
return scale * paddle.randn(shape, dtype="float32")
-def check_two_model_parameter(first_model: PretrainedModel,
- second_model: PretrainedModel):
- assert (len(
- set(first_model.state_dict().keys()) - set(second_model.state_dict()
- .keys())) == 0)
+def check_two_model_parameter(first_model: PretrainedModel, second_model: PretrainedModel):
+ assert len(set(first_model.state_dict().keys()) - set(second_model.state_dict().keys())) == 0
    # randomly choose a key to compare
key = random.choice(list(first_model.state_dict().keys()))
@@ -106,8 +100,7 @@ def _make_model_instance(self, config, model_class):
return model_class(self.base_model_class(**config))
def test_save_load(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
def check_save_load(out1, out2):
# make sure we don't have nans
@@ -123,16 +116,14 @@ def check_save_load(out1, out2):
model = self._make_model_instance(config, model_class)
model.eval()
with paddle.no_grad():
- first = model(**self._prepare_for_class(inputs_dict,
- model_class))[0]
+ first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
model.eval()
with paddle.no_grad():
- second = model(**self._prepare_for_class(inputs_dict,
- model_class))[0]
+ second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
# support tuple of tensor
if isinstance(first, tuple) and isinstance(second, tuple):
@@ -142,8 +133,7 @@ def check_save_load(out1, out2):
check_save_load(first, second)
def test_determinism(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
def check_determinism(first, second):
out_1 = first.numpy()
@@ -157,10 +147,8 @@ def check_determinism(first, second):
model = self._make_model_instance(config, model_class)
model.eval()
with paddle.no_grad():
- first = model(**self._prepare_for_class(inputs_dict,
- model_class))[0]
- second = model(**self._prepare_for_class(inputs_dict,
- model_class))[0]
+ first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+ second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
if isinstance(first, tuple) and isinstance(second, tuple):
for tensor1, tensor2 in zip(first, second):
@@ -190,30 +178,21 @@ def test_training_gradient_checkpointing(self):
def test_attention_outputs(self):
if not self.has_attentions:
return
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
seq_len = getattr(self.model_tester, "seq_length", None)
- decoder_seq_length = getattr(self.model_tester, "decoder_seq_length",
- seq_len)
- encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
- seq_len)
- decoder_key_length = getattr(self.model_tester, "decoder_key_length",
- decoder_seq_length)
- encoder_key_length = getattr(self.model_tester, "key_length",
- encoder_seq_length)
+ decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+ encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+ decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+ encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
chunk_length = getattr(self.model_tester, "chunk_length", None)
- if chunk_length is not None and hasattr(self.model_tester,
- "num_hashes"):
+ if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
for model_class in self.all_model_classes:
signature = inspect.signature(model_class.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
- if not all(
- name in arg_names
- for name in
- ["output_attentions", "output_hidden_states", "return_dict"]):
+ if not all(name in arg_names for name in ["output_attentions", "output_hidden_states", "return_dict"]):
continue
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
@@ -221,12 +200,9 @@ def test_attention_outputs(self):
model = self._make_model_instance(config, model_class)
model.eval()
with paddle.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict,
- model_class))
- attentions = (outputs.encoder_attentions
- if self.is_encoder_decoder else outputs.attentions)
- self.assertEqual(
- len(attentions), self.model_tester.num_hidden_layers)
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+ attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
            # TODO(guosheng): check that output_attentions also works when set via config
@@ -238,7 +214,8 @@ def test_attention_outputs(self):
encoder_seq_length,
chunk_length,
encoder_key_length,
- ], )
+ ],
+ )
else:
self.assertListEqual(
list(attentions[0].shape[-3:]),
@@ -246,7 +223,8 @@ def test_attention_outputs(self):
self.model_tester.num_attention_heads,
encoder_seq_length,
encoder_key_length,
- ], )
+ ],
+ )
out_len = len(outputs)
if self.is_encoder_decoder:
@@ -257,9 +235,7 @@ def test_attention_outputs(self):
                correct_outlen += 1  # loss is added to the beginning
# Question Answering model returns start_logits and end_logits
if model_class.__name__.endswith("ForQuestionAnswering"):
- correct_outlen += (
- 1 # start_logits and end_logits instead of only 1 output
- )
+ correct_outlen += 1 # start_logits and end_logits instead of only 1 output
if "past_key_values" in outputs:
correct_outlen += 1 # past_key_values have been returned
@@ -268,29 +244,28 @@ def test_attention_outputs(self):
# decoder attentions
decoder_attentions = outputs.decoder_attentions
self.assertIsInstance(decoder_attentions, (list, tuple))
- self.assertEqual(
- len(decoder_attentions),
- self.model_tester.num_hidden_layers)
+ self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(decoder_attentions[0].shape[-3:]),
[
self.model_tester.num_attention_heads,
decoder_seq_length,
decoder_key_length,
- ], )
+ ],
+ )
# cross attentions
cross_attentions = outputs.cross_attentions
self.assertIsInstance(cross_attentions, (list, tuple))
- self.assertEqual(
- len(cross_attentions), self.model_tester.num_hidden_layers)
+ self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(cross_attentions[0].shape[-3:]),
[
self.model_tester.num_attention_heads,
decoder_seq_length,
encoder_key_length,
- ], )
+ ],
+ )
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
@@ -298,8 +273,7 @@ def test_attention_outputs(self):
model = self._make_model_instance(config, model_class)
model.eval()
with paddle.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict,
- model_class))
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
if hasattr(self.model_tester, "num_hidden_states_types"):
added_hidden_states = self.model_tester.num_hidden_states_types
@@ -309,11 +283,9 @@ def test_attention_outputs(self):
added_hidden_states = 1
self.assertEqual(out_len + added_hidden_states, len(outputs))
- self_attentions = (outputs.encoder_attentions if
- self.is_encoder_decoder else outputs.attentions)
+ self_attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions
- self.assertEqual(
- len(self_attentions), self.model_tester.num_hidden_layers)
+ self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
if chunk_length is not None:
self.assertListEqual(
list(self_attentions[0].shape[-4:]),
@@ -322,7 +294,8 @@ def test_attention_outputs(self):
encoder_seq_length,
chunk_length,
encoder_key_length,
- ], )
+ ],
+ )
else:
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
@@ -330,7 +303,8 @@ def test_attention_outputs(self):
self.model_tester.num_attention_heads,
encoder_seq_length,
encoder_key_length,
- ], )
+ ],
+ )
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
@@ -338,29 +312,28 @@ def check_hidden_states_output(inputs_dict, config, model_class):
model.eval()
with paddle.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict,
- model_class))
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- hidden_states = (outputs.encoder_hidden_states if
- self.is_encoder_decoder else outputs.hidden_states)
+ hidden_states = outputs.encoder_hidden_states if self.is_encoder_decoder else outputs.hidden_states
expected_num_layers = getattr(
self.model_tester,
"expected_num_hidden_layers",
- self.model_tester.num_hidden_layers + 1, )
+ self.model_tester.num_hidden_layers + 1,
+ )
self.assertEqual(len(hidden_states), expected_num_layers)
if hasattr(self.model_tester, "encoder_seq_length"):
seq_length = self.model_tester.encoder_seq_length
- if (hasattr(self.model_tester, "chunk_length") and
- self.model_tester.chunk_length > 1):
+ if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
seq_length = seq_length * self.model_tester.chunk_length
else:
seq_length = self.model_tester.seq_length
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
- [seq_length, self.model_tester.hidden_size], )
+ [seq_length, self.model_tester.hidden_size],
+ )
if self.is_encoder_decoder:
hidden_states = outputs.decoder_hidden_states
@@ -368,24 +341,20 @@ def check_hidden_states_output(inputs_dict, config, model_class):
self.assertIsInstance(hidden_states, (list, tuple))
self.assertEqual(len(hidden_states), expected_num_layers)
seq_len = getattr(self.model_tester, "seq_length", None)
- decoder_seq_length = getattr(self.model_tester,
- "decoder_seq_length", seq_len)
+ decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
- [decoder_seq_length, self.model_tester.hidden_size], )
+ [decoder_seq_length, self.model_tester.hidden_size],
+ )
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict["return_dict"] = True
for model_class in self.all_model_classes:
signature = inspect.signature(model_class.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
- if not all(
- name in arg_names
- for name in
- ["output_attentions", "output_hidden_states", "return_dict"]):
+ if not all(name in arg_names for name in ["output_attentions", "output_hidden_states", "return_dict"]):
continue
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
@@ -417,7 +386,8 @@ def test_resize_position_vector_embeddings(self):
if self.is_encoder_decoder:
(
encoder_model_embed,
- decoder_model_embed, ) = model.get_position_embeddings()
+ decoder_model_embed,
+ ) = model.get_position_embeddings()
encoder_cloned_embeddings = encoder_model_embed.weight.clone()
decoder_cloned_embeddings = decoder_model_embed.weight.clone()
else:
@@ -427,24 +397,25 @@ def test_resize_position_vector_embeddings(self):
# Check that resizing the position embeddings with a larger max_position_embeddings increases
            # the model's position embeddings size
model.resize_position_embeddings(max_position_embeddings + 10)
- self.assertEqual(model.config.max_position_embeddings,
- max_position_embeddings + 10)
+ self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10)
# Check that it actually resizes the embeddings matrix
if model.config.is_encoder_decoder:
(
encoder_model_embed,
- decoder_model_embed, ) = model.get_position_embeddings()
+ decoder_model_embed,
+ ) = model.get_position_embeddings()
self.assertEqual(
encoder_model_embed.weight.shape[0],
- encoder_cloned_embeddings.shape[0] + 10, )
+ encoder_cloned_embeddings.shape[0] + 10,
+ )
self.assertEqual(
decoder_model_embed.weight.shape[0],
- decoder_cloned_embeddings.shape[0] + 10, )
+ decoder_cloned_embeddings.shape[0] + 10,
+ )
else:
model_embed = model.get_position_embeddings()
- self.assertEqual(model_embed.weight.shape[0],
- cloned_embeddings.shape[0] + 10)
+ self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@@ -454,23 +425,26 @@ def test_resize_position_vector_embeddings(self):
model.resize_position_embeddings(max_position_embeddings - 5)
self.assertEqual(
model.base_model.config["max_position_embeddings"],
- max_position_embeddings - 5, )
+ max_position_embeddings - 5,
+ )
# Check that it actually resizes the embeddings matrix
if self.is_encoder_decoder:
(
encoder_model_embed,
- decoder_model_embed, ) = model.get_position_embeddings()
+ decoder_model_embed,
+ ) = model.get_position_embeddings()
self.assertEqual(
encoder_model_embed.weight.shape[0],
- encoder_cloned_embeddings.shape[0] - 5, )
+ encoder_cloned_embeddings.shape[0] - 5,
+ )
self.assertEqual(
decoder_model_embed.weight.shape[0],
- decoder_cloned_embeddings.shape[0] - 5, )
+ decoder_cloned_embeddings.shape[0] - 5,
+ )
else:
model_embed = model.get_position_embeddings()
- self.assertEqual(model_embed.weight.shape[0],
- cloned_embeddings.shape[0] - 5)
+ self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@@ -479,12 +453,10 @@ def test_resize_position_vector_embeddings(self):
models_equal = True
if model.config.is_encoder_decoder:
- for p1, p2 in zip(encoder_cloned_embeddings,
- encoder_model_embed.weight):
+ for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
- for p1, p2 in zip(decoder_cloned_embeddings,
- decoder_model_embed.weight):
+ for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
else:
@@ -515,32 +487,27 @@ def test_resize_tokens_embeddings(self):
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
- self.assertEqual(model.base_model.config.vocab_size,
- model_vocab_size + 10)
+ self.assertEqual(model.base_model.config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
- self.assertEqual(model_embed.weight.shape[0],
- cloned_embeddings.shape[0] + 10)
+ self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
- self.assertEqual(model.base_model.config.vocab_size,
- model_vocab_size - 15)
+ self.assertEqual(model.base_model.config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
- self.assertEqual(model_embed.weight.shape[0],
- cloned_embeddings.shape[0] - 15)
+ self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary
- inputs_dict["input_ids"] = paddle.clip(
- inputs_dict["input_ids"], max=model_vocab_size - 15 - 1)
+ inputs_dict["input_ids"] = paddle.clip(inputs_dict["input_ids"], max=model_vocab_size - 15 - 1)
# make sure that decoder_input_ids are resized as well
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"] = paddle.clip(
- inputs_dict["decoder_input_ids"],
- max=model_vocab_size - 15 - 1)
+ inputs_dict["decoder_input_ids"], max=model_vocab_size - 15 - 1
+ )
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
@@ -566,15 +533,13 @@ def test_inputs_embeds(self):
if not self.use_test_inputs_embeds:
return
# get config for model and inputs_dict for model forward
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# test all model classes
for model_class in self.all_model_classes:
model = self._make_model_instance(config, model_class)
model.eval()
- inputs = copy.deepcopy(
- self._prepare_for_class(inputs_dict, model_class))
+ inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
with paddle.no_grad():
ids_output = model(**inputs)
@@ -584,8 +549,7 @@ def test_inputs_embeds(self):
del inputs["input_ids"]
else:
encoder_input_ids = inputs["input_ids"]
- decoder_input_ids = inputs.get("decoder_input_ids",
- encoder_input_ids)
+ decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
del inputs["input_ids"]
inputs.pop("decoder_input_ids", None)
@@ -616,8 +580,7 @@ def test_model_name_list(self):
self.assertTrue(len(model.model_name_list) != 0)
def test_pretrained_config_save_load(self):
- if (self.base_model_class is None or
- not self.base_model_class.constructed_from_pretrained_config()):
+ if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config():
return
config_class = self.base_model_class.config_class
@@ -627,23 +590,21 @@ def test_pretrained_config_save_load(self):
config.save_pretrained(tempdir)
# check the file exist
- self.assertFalse(
- os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME)))
+ self.assertFalse(os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME)))
self.assertTrue(os.path.exists(os.path.join(tempdir, CONFIG_NAME)))
# rename the CONFIG_NAME
shutil.move(
os.path.join(tempdir, CONFIG_NAME),
- os.path.join(tempdir, LEGACY_CONFIG_NAME), )
+ os.path.join(tempdir, LEGACY_CONFIG_NAME),
+ )
loaded_config = config.__class__.from_pretrained(tempdir)
for key in config.__dict__.keys():
- self.assertEqual(
- getattr(config, key), getattr(loaded_config, key))
+ self.assertEqual(getattr(config, key), getattr(loaded_config, key))
def random_choice_pretrained_config_field(self) -> Optional[str]:
- if (self.base_model_class is None or
- not self.base_model_class.constructed_from_pretrained_config()):
+ if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config():
return None
config = self.base_model_class.config_class()
@@ -652,21 +613,17 @@ def random_choice_pretrained_config_field(self) -> Optional[str]:
def test_for_missed_attribute(self):
if not self.test_model_compatibility_keys:
- self.skipTest(
- f"Do not test model_compatibility_keys on {self.base_model_class}"
- )
+ self.skipTest(f"Do not test model_compatibility_keys on {self.base_model_class}")
return
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if not model_class.constructed_from_pretrained_config():
continue
model = self._make_model_instance(config, model_class)
- all_maps: dict = copy.deepcopy(
- model_class.config_class.attribute_map)
+ all_maps: dict = copy.deepcopy(model_class.config_class.attribute_map)
for old_attribute, new_attribute in all_maps.items():
old_value = getattr(model.config, old_attribute)
@@ -683,11 +640,9 @@ def test_tie_weight(self):
if not self.test_tie_weights:
return
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
- )
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
- if ("CausalLM" not in model_class.__name__ and
- "MaskedLM" not in model_class.__name__):
+ if "CausalLM" not in model_class.__name__ and "MaskedLM" not in model_class.__name__:
continue
model = self._make_model_instance(config, model_class)
@@ -695,8 +650,7 @@ def test_tie_weight(self):
if not model.config.tie_word_embeddings:
continue
- if hasattr(model, "get_input_embeddings") and hasattr(
- model, "get_output_embeddings"):
+ if hasattr(model, "get_input_embeddings") and hasattr(model, "get_output_embeddings"):
try:
input_embeddings = model.get_input_embeddings()
except NotImplementedError:
@@ -719,14 +673,16 @@ def test_tie_weight(self):
input_embeddings_weight = input_embeddings
print(
input_embeddings_weight,
- output_embeddings_weight, )
- print("model name :{},id is{},{}".format(
- model_class,
- id(output_embeddings_weight),
- id(input_embeddings_weight), ))
- self.assertEqual(
- id(output_embeddings_weight),
- id(input_embeddings_weight))
+ output_embeddings_weight,
+ )
+ print(
+ "model name :{},id is{},{}".format(
+ model_class,
+ id(output_embeddings_weight),
+ id(input_embeddings_weight),
+ )
+ )
+ self.assertEqual(id(output_embeddings_weight), id(input_embeddings_weight))
class ModelTesterPretrainedMixin:
@@ -739,48 +695,42 @@ class ModelTesterPretrainedMixin:
def test_model_from_pretrained_hf_hub(self):
if self.hf_remote_test_model_path is None or self.base_model_class is None:
return
- model = self.base_model_class.from_pretrained(
- self.hf_remote_test_model_path, from_hf_hub=True)
+ model = self.base_model_class.from_pretrained(self.hf_remote_test_model_path, from_hf_hub=True)
self.assertIsNotNone(model)
def test_model_from_pretrained_paddle_hub(self):
- if (self.paddlehub_remote_test_model_path is None or
- self.base_model_class is None):
+ if self.paddlehub_remote_test_model_path is None or self.base_model_class is None:
return
- model = self.base_model_class.from_pretrained(
- self.paddlehub_remote_test_model_path)
+ model = self.base_model_class.from_pretrained(self.paddlehub_remote_test_model_path)
self.assertIsNotNone(model)
def test_model_from_config_paddle_hub(self):
- if (self.paddlehub_remote_test_model_path is None or
- self.base_model_class is None):
+ if self.paddlehub_remote_test_model_path is None or self.base_model_class is None:
return
- config = self.base_model_class.config_class.from_pretrained(
- self.paddlehub_remote_test_model_path)
+ config = self.base_model_class.config_class.from_pretrained(self.paddlehub_remote_test_model_path)
model = self.base_model_class._from_config(config)
self.assertIsNotNone(model)
@slow
def test_model_from_pretrained_with_cache_dir(self):
- for model_name in list(
- self.base_model_class.pretrained_init_configuration)[:1]:
+ for model_name in list(self.base_model_class.pretrained_init_configuration)[:1]:
with tempfile.TemporaryDirectory() as tempdir:
tempdir = str(tempdir)
- model = self.base_model_class.from_pretrained(
- model_name, cache_dir=tempdir)
+ model = self.base_model_class.from_pretrained(model_name, cache_dir=tempdir)
self.assertIsNotNone(model)
self.assertTrue(
os.path.isfile(
os.path.join(
tempdir,
model_name,
- self.base_model_class.resource_files_names[
- "model_state"], )))
+ self.base_model_class.resource_files_names["model_state"],
+ )
+ )
+ )
self.assertTrue(
- os.path.isfile(
- os.path.join(tempdir, model_name,
- self.base_model_class.model_config_file)))
+ os.path.isfile(os.path.join(tempdir, model_name, self.base_model_class.model_config_file))
+ )
@slow
def test_pretrained_save_and_load(self):
@@ -788,8 +738,7 @@ def test_pretrained_save_and_load(self):
eg: `bert-base-uncased.pdparams` and `model_state.pdparams`
"""
- for model_name in list(
- self.base_model_class.pretrained_init_configuration)[:1]:
+ for model_name in list(self.base_model_class.pretrained_init_configuration)[:1]:
model = self.base_model_class.from_pretrained(model_name)
self.assertIsNotNone(model)
@@ -798,8 +747,7 @@ def test_pretrained_save_and_load(self):
tempdirname = str(tempdir)
model.save_pretrained(tempdirname)
- loaded_model = self.base_model_class.from_pretrained(
- tempdirname)
+ loaded_model = self.base_model_class.from_pretrained(tempdirname)
check_two_model_parameter(model, loaded_model)
@@ -809,20 +757,20 @@ def test_pretrained_save_and_load(self):
shutil.copytree(
os.path.join(MODEL_HOME, model_name),
- tempdirname, )
+ tempdirname,
+ )
saved_model_state_file = os.path.join(
tempdirname,
- self.base_model_class.resource_files_names["model_state"], )
+ self.base_model_class.resource_files_names["model_state"],
+ )
self.assertTrue(os.path.isfile(saved_model_state_file))
# rename it to the old style: name of url, eg: model_state.pdparams -> bert-base-uncased.pdparams
- url = self.base_model_class.pretrained_resource_files_map[
- "model_state"][model_name]
+ url = self.base_model_class.pretrained_resource_files_map["model_state"][model_name]
pretrained_resource_file_name = os.path.split(url)[-1]
- target_file_path = os.path.join(tempdirname,
- pretrained_resource_file_name)
+ target_file_path = os.path.join(tempdirname, pretrained_resource_file_name)
shutil.copyfile(saved_model_state_file, target_file_path)
os.remove(saved_model_state_file)
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 27448810955d7..dbf16a00360e8 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -14,18 +14,10 @@
from __future__ import annotations
-import copy
-import gc
-import inspect
import os
-import sys
import unittest
from argparse import ArgumentTypeError
-import numpy as np
-import paddle
-import yaml
-
def strtobool(v):
if isinstance(v, bool):
@@ -47,9 +39,7 @@ def get_bool_from_env(key, default_value=False):
try:
value = strtobool(value)
except ValueError:
- raise ValueError(
- f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive)."
- )
+ raise ValueError(f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive).")
return value