SD3 Image-to-Image and Inpainting (#7295)

## Summary Add support for SD3 image-to-image and inpainting. Similar to FLUX, the implementation supports fractional `denoising_start`/`denoising_end` for more fine-grained denoise strength control, and a gradient mask adjustment schedule for smoother inpainting seams. ## Example Workflow <img width="1016" alt="image" src="https://github.com/user-attachments/assets/ee598d77-be80-4ca7-9355-c3cbefa2ef43"> Result ![image](https://github.com/user-attachments/assets/43953fa7-0e4e-42b5-84e8-85cfeeeee00b) ## QA Instructions - [x] Regression test of text-to-image - [x] Test image-to-image without mask - [x] Test that adjusting `denoising_start` allows fine-grained control of the amount of change in image-to-image - [x] Test inpainting with mask - [x] Smoke test SD1, SDXL, FLUX image-to-image to make sure there was no regression with the frontend changes. ## Merge Plan  ## Checklist - [x] _The PR has a short but descriptive title, suitable for a changelog_ - [x] _Tests added / updated (if applicable)_ - [x] _Documentation added / updated (if applicable)_ - [ ] _Updated `What's New` copy (if doing a release after this PR)_
invoke-ai · Nov 14, 2024 · 0ba11e8 · 0ba11e8
2 parents 4f9d12b + 1cf7600
commit 0ba11e8
Show file tree

Hide file tree

Showing 36 changed files with 783 additions and 109 deletions.
diff --git a/invokeai/app/invocations/metadata.py b/invokeai/app/invocations/metadata.py
@@ -147,6 +147,10 @@ def invoke(self, context: InvocationContext) -> MetadataOutput:
     "flux_img2img",
     "flux_inpaint",
     "flux_outpaint",
+    "sd3_txt2img",
+    "sd3_img2img",
+    "sd3_inpaint",
+    "sd3_outpaint",
 ]
 
 

diff --git a/invokeai/app/invocations/sd3_denoise.py b/invokeai/app/invocations/sd3_denoise.py
@@ -1,16 +1,19 @@
-from typing import Callable, Tuple
+from typing import Callable, Optional, Tuple
 
 import torch
+import torchvision.transforms as tv_transforms
 from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
-from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
+from torchvision.transforms.functional import resize as tv_resize
 from tqdm import tqdm
 
 from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
 from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
 from invokeai.app.invocations.fields import (
+    DenoiseMaskField,
     FieldDescriptions,
     Input,
     InputField,
+    LatentsField,
     SD3ConditioningField,
     WithBoard,
     WithMetadata,
@@ -19,7 +22,9 @@
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.invocations.sd3_text_encoder import SD3_T5_MAX_SEQ_LEN
 from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.flux.sampling_utils import clip_timestep_schedule_fractional
 from invokeai.backend.model_manager.config import BaseModelType
+from invokeai.backend.sd3.extensions.inpaint_extension import InpaintExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import SD3ConditioningInfo
 from invokeai.backend.util.devices import TorchDevice
@@ -30,16 +35,24 @@
     title="SD3 Denoise",
     tags=["image", "sd3"],
     category="image",
-    version="1.0.0",
+    version="1.1.0",
     classification=Classification.Prototype,
 )
 class SD3DenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
     """Run denoising process with a SD3 model."""
 
+    # If latents is provided, this means we are doing image-to-image.
+    latents: Optional[LatentsField] = InputField(
+        default=None, description=FieldDescriptions.latents, input=Input.Connection
+    )
+    # denoise_mask is used for image-to-image inpainting. Only the masked region is modified.
+    denoise_mask: Optional[DenoiseMaskField] = InputField(
+        default=None, description=FieldDescriptions.denoise_mask, input=Input.Connection
+    )
+    denoising_start: float = InputField(default=0.0, ge=0, le=1, description=FieldDescriptions.denoising_start)
+    denoising_end: float = InputField(default=1.0, ge=0, le=1, description=FieldDescriptions.denoising_end)
     transformer: TransformerField = InputField(
-        description=FieldDescriptions.sd3_model,
-        input=Input.Connection,
-        title="Transformer",
+        description=FieldDescriptions.sd3_model, input=Input.Connection, title="Transformer"
     )
     positive_conditioning: SD3ConditioningField = InputField(
         description=FieldDescriptions.positive_cond, input=Input.Connection
@@ -61,6 +74,41 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
         name = context.tensors.save(tensor=latents)
         return LatentsOutput.build(latents_name=name, latents=latents, seed=None)
 
+    def _prep_inpaint_mask(self, context: InvocationContext, latents: torch.Tensor) -> torch.Tensor | None:
+        """Prepare the inpaint mask.
+        - Loads the mask and inverts it (so that 1.0 = denoise, 0.0 = preserve)
+        - Resizes to the latent height/width (bilinear, no antialiasing)
+        - Casts to same device/dtype as latents
+
+        Args:
+            context (InvocationContext): The invocation context, for loading the inpaint mask.
+            latents (torch.Tensor): A latent image tensor. Used to determine the target shape, device, and dtype for the
+                inpaint mask.
+
+        Returns:
+            torch.Tensor | None: Inpaint mask, or None if no denoise_mask is set. Values of 1.0 represent the regions
+                to be fully denoised, and 0.0 represent the regions to be preserved (the inverse of the input mask).
+        """
+        if self.denoise_mask is None:
+            return None
+        mask = context.tensors.load(self.denoise_mask.mask_name)
+
+        # The input denoise_mask contains values in [0, 1], where 0.0 represents the regions to be fully denoised, and
+        # 1.0 represents the regions to be preserved.
+        # We invert the mask so that the regions to be preserved are 0.0 and the regions to be denoised are 1.0.
+        mask = 1.0 - mask
+
+        _, _, latent_height, latent_width = latents.shape
+        mask = tv_resize(
+            img=mask,
+            size=[latent_height, latent_width],
+            interpolation=tv_transforms.InterpolationMode.BILINEAR,
+            antialias=False,
+        )
+
+        mask = mask.to(device=latents.device, dtype=latents.dtype)
+        return mask
+
     def _load_text_conditioning(
         self,
         context: InvocationContext,
@@ -170,14 +218,20 @@ def _run_diffusion(
         prompt_embeds = torch.cat([neg_prompt_embeds, pos_prompt_embeds], dim=0)
         pooled_prompt_embeds = torch.cat([neg_pooled_prompt_embeds, pos_pooled_prompt_embeds], dim=0)
 
-        # Prepare the scheduler.
-        scheduler = FlowMatchEulerDiscreteScheduler()
-        scheduler.set_timesteps(num_inference_steps=self.steps, device=device)
-        timesteps = scheduler.timesteps
-        assert isinstance(timesteps, torch.Tensor)
+        # Prepare the timestep schedule.
+        # We add an extra step to the end to account for the final timestep of 0.0.
+        timesteps: list[float] = torch.linspace(1, 0, self.steps + 1).tolist()
+        # Clip the timesteps schedule based on denoising_start and denoising_end.
+        timesteps = clip_timestep_schedule_fractional(timesteps, self.denoising_start, self.denoising_end)
+        total_steps = len(timesteps) - 1
 
         # Prepare the CFG scale list.
-        cfg_scale = self._prepare_cfg_scale(len(timesteps))
+        cfg_scale = self._prepare_cfg_scale(total_steps)
+
+        # Load the input latents, if provided.
+        init_latents = context.tensors.load(self.latents.latents_name) if self.latents else None
+        if init_latents is not None:
+            init_latents = init_latents.to(device=device, dtype=inference_dtype)
 
         # Generate initial latent noise.
         num_channels_latents = transformer_info.model.config.in_channels
@@ -191,9 +245,34 @@ def _run_diffusion(
             device=device,
             seed=self.seed,
         )
-        latents: torch.Tensor = noise
 
-        total_steps = len(timesteps)
+        # Prepare input latent image.
+        if init_latents is not None:
+            # Noise the init_latents by the appropriate amount for the first timestep.
+            t_0 = timesteps[0]
+            latents = t_0 * noise + (1.0 - t_0) * init_latents
+        else:
+            # init_latents are not provided, so we are not doing image-to-image (i.e. we are starting from pure noise).
+            if self.denoising_start > 1e-5:
+                raise ValueError("denoising_start should be 0 when initial latents are not provided.")
+            latents = noise
+
+        # If len(timesteps) == 1, then short-circuit. We are just noising the input latents, but not taking any
+        # denoising steps.
+        if len(timesteps) <= 1:
+            return latents
+
+        # Prepare inpaint extension.
+        inpaint_mask = self._prep_inpaint_mask(context, latents)
+        inpaint_extension: InpaintExtension | None = None
+        if inpaint_mask is not None:
+            assert init_latents is not None
+            inpaint_extension = InpaintExtension(
+                init_latents=init_latents,
+                inpaint_mask=inpaint_mask,
+                noise=noise,
+            )
+
         step_callback = self._build_step_callback(context)
 
         step_callback(
@@ -210,11 +289,12 @@ def _run_diffusion(
             assert isinstance(transformer, SD3Transformer2DModel)
 
             # 6. Denoising loop
-            for step_idx, t in tqdm(list(enumerate(timesteps))):
+            for step_idx, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
                 # Expand the latents if we are doing CFG.
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 # Expand the timestep to match the latent model input.
-                timestep = t.expand(latent_model_input.shape[0])
+                # Multiply by 1000 to match the default FlowMatchEulerDiscreteScheduler num_train_timesteps.
+                timestep = torch.tensor([t_curr * 1000], device=device).expand(latent_model_input.shape[0])
 
                 noise_pred = transformer(
                     hidden_states=latent_model_input,
@@ -232,21 +312,19 @@ def _run_diffusion(
 
                 # Compute the previous noisy sample x_t -> x_t-1.
                 latents_dtype = latents.dtype
-                latents = scheduler.step(model_output=noise_pred, timestep=t, sample=latents, return_dict=False)[0]
+                latents = latents.to(dtype=torch.float32)
+                latents = latents + (t_prev - t_curr) * noise_pred
+                latents = latents.to(dtype=latents_dtype)
 
-                # TODO(ryand): This MPS dtype handling was copied from diffusers, I haven't tested to see if it's
-                # needed.
-                if latents.dtype != latents_dtype:
-                    if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
+                if inpaint_extension is not None:
+                    latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, t_prev)
 
                 step_callback(
                     PipelineIntermediateState(
                         step=step_idx + 1,
                         order=1,
                         total_steps=total_steps,
-                        timestep=int(t),
+                        timestep=int(t_curr),
                         latents=latents,
                     ),
                 )

diff --git a/invokeai/app/invocations/sd3_image_to_latents.py b/invokeai/app/invocations/sd3_image_to_latents.py
@@ -0,0 +1,65 @@
+import einops
+import torch
+from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+    FieldDescriptions,
+    ImageField,
+    Input,
+    InputField,
+    WithBoard,
+    WithMetadata,
+)
+from invokeai.app.invocations.model import VAEField
+from invokeai.app.invocations.primitives import LatentsOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.load.load_base import LoadedModel
+from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
+
+
+@invocation(
+    "sd3_i2l",
+    title="SD3 Image to Latents",
+    tags=["image", "latents", "vae", "i2l", "sd3"],
+    category="image",
+    version="1.0.0",
+    classification=Classification.Prototype,
+)
+class SD3ImageToLatentsInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Encodes an image into latents with the connected VAE; the result is moved to CPU and saved as a tensor."""
+
+    image: ImageField = InputField(description="The image to encode")
+    vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection)
+
+    @staticmethod
+    def vae_encode(vae_info: LoadedModel, image_tensor: torch.Tensor) -> torch.Tensor:
+        with vae_info as vae:
+            assert isinstance(vae, AutoencoderKL)
+
+            vae.disable_tiling()
+
+            image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
+            with torch.inference_mode():
+                image_tensor_dist = vae.encode(image_tensor).latent_dist
+                # TODO: Use seed to make sampling reproducible (sample() is called without a generator here).
+                latents: torch.Tensor = image_tensor_dist.sample().to(dtype=vae.dtype)
+
+            latents = vae.config.scaling_factor * latents  # NOTE(review): no shift_factor applied; confirm vs. decode
+
+        return latents
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        image = context.images.get_pil(self.image.image_name)
+
+        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+        if image_tensor.dim() == 3:
+            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
+
+        vae_info = context.models.load(self.vae.vae)
+        latents = self.vae_encode(vae_info=vae_info, image_tensor=image_tensor)
+
+        latents = latents.to("cpu")
+        name = context.tensors.save(tensor=latents)
+        return LatentsOutput.build(latents_name=name, latents=latents, seed=None)
diff --git a/invokeai/backend/sd3/__init__.py b/invokeai/backend/sd3/__init__.py
diff --git a/invokeai/backend/sd3/extensions/__init__.py b/invokeai/backend/sd3/extensions/__init__.py
diff --git a/invokeai/backend/sd3/extensions/inpaint_extension.py b/invokeai/backend/sd3/extensions/inpaint_extension.py
@@ -0,0 +1,58 @@
+import torch
+
+
+class InpaintExtension:
+    """Merges SD3 intermediate latents with re-noised init latents so that un-masked regions are preserved."""
+
+    def __init__(self, init_latents: torch.Tensor, inpaint_mask: torch.Tensor, noise: torch.Tensor):
+        """Initialize InpaintExtension.
+
+        Args:
+            init_latents (torch.Tensor): The initial latents (i.e. un-noised at timestep 0).
+            inpaint_mask (torch.Tensor): A mask specifying which elements to inpaint. Range [0, 1]. Values of 1 will be
+                re-generated. Values of 0 will remain unchanged. Values between 0 and 1 can be used to blend the
+                inpainted region with the background.
+            noise (torch.Tensor): The noise tensor used to noise the init_latents.
+        """
+        assert init_latents.dim() == inpaint_mask.dim() == noise.dim() == 4
+        assert init_latents.shape[-2:] == inpaint_mask.shape[-2:] == noise.shape[-2:]
+
+        self._init_latents = init_latents
+        self._inpaint_mask = inpaint_mask
+        self._noise = noise
+
+    def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
+        """Returns the mask to use at t_prev; once t_prev <= 0.5, all mask values above eps are promoted to 1.0."""
+        # As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
+        # 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
+        # number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
+        # threshold worked well.
+        # We use a small epsilon to avoid any potential issues with floating point precision.
+        eps = 1e-4
+        mask_gradient_t_cutoff = 0.5
+        if t_prev > mask_gradient_t_cutoff:
+            # Early in the denoising process, use the inpaint mask as-is.
+            return self._inpaint_mask
+        else:
+            # After the cut-off, keep values <= eps unchanged and promote every other mask value to 1.0.
+            mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
+
+        return mask
+
+    def merge_intermediate_latents_with_init_latents(
+        self, intermediate_latents: torch.Tensor, t_prev: float
+    ) -> torch.Tensor:
+        """Merge the intermediate latents with the initial latents for the current timestep using the inpaint mask. I.e.
+        update the intermediate latents to keep the regions that are not being inpainted on the correct noise
+        trajectory.
+
+        This function should be called after each denoising step.
+        """
+
+        mask = self._apply_mask_gradient_adjustment(t_prev)
+
+        # Noise the init latents to the t_prev noise level (linear interpolation between init latents and noise).
+        noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
+
+        # Merge the intermediate latents with the noised_init_latents using the inpaint_mask.
+        return intermediate_latents * mask + noised_init_latents * (1.0 - mask)
diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json
@@ -704,6 +704,8 @@
         "baseModel": "Base Model",
         "cancel": "Cancel",
         "clipEmbed": "CLIP Embed",
+        "clipLEmbed": "CLIP-L Embed",
+        "clipGEmbed": "CLIP-G Embed",
         "config": "Config",
         "convert": "Convert",
         "convertingModelBegin": "Converting Model. Please wait.",

diff --git a/...ntend/web/src/app/store/middleware/listenerMiddleware/listeners/enqueueRequestedLinear.ts b/...ntend/web/src/app/store/middleware/listenerMiddleware/listeners/enqueueRequestedLinear.ts
@@ -8,6 +8,7 @@ import { $canvasManager } from 'features/controlLayers/store/ephemeral';
 import { prepareLinearUIBatch } from 'features/nodes/util/graph/buildLinearBatchConfig';
 import { buildFLUXGraph } from 'features/nodes/util/graph/generation/buildFLUXGraph';
 import { buildSD1Graph } from 'features/nodes/util/graph/generation/buildSD1Graph';
+import { buildSD3Graph } from 'features/nodes/util/graph/generation/buildSD3Graph';
 import { buildSDXLGraph } from 'features/nodes/util/graph/generation/buildSDXLGraph';
 import type { Graph } from 'features/nodes/util/graph/generation/Graph';
 import { toast } from 'features/toast/toast';
@@ -34,8 +35,8 @@ export const addEnqueueRequestedLinear = (startAppListening: AppStartListening)
       let buildGraphResult: Result<
         {
           g: Graph;
-          noise: Invocation<'noise' | 'flux_denoise'>;
-          posCond: Invocation<'compel' | 'sdxl_compel_prompt' | 'flux_text_encoder'>;
+          noise: Invocation<'noise' | 'flux_denoise' | 'sd3_denoise'>;
+          posCond: Invocation<'compel' | 'sdxl_compel_prompt' | 'flux_text_encoder' | 'sd3_text_encoder'>;
         },
         Error
       >;
@@ -51,6 +52,9 @@ export const addEnqueueRequestedLinear = (startAppListening: AppStartListening)
         case `sd-2`:
           buildGraphResult = await withResultAsync(() => buildSD1Graph(state, manager));
           break;
+        case `sd-3`:
+          buildGraphResult = await withResultAsync(() => buildSD3Graph(state, manager));
+          break;
         case `flux`:
           buildGraphResult = await withResultAsync(() => buildFLUXGraph(state, manager));
           break;