Pins torch to >= 2.0 and applies recommendation for faster model loading (#279)

This applies a [newly introduced context manager](huggingface/transformers#21913 (comment)) that skips the overhead of first loading models into CPU memory by loading them directly onto the GPU.
modal-labs · Apr 18, 2023 · ed47b21 · ed47b21
1 parent 96f336b
commit ed47b21
Showing 1 changed file with 16 additions and 14 deletions.
diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_cli.py b/06_gpu_and_ml/stable_diffusion/stable_diffusion_cli.py
@@ -98,6 +98,7 @@ def download_models():
         "transformers",
         "triton",
         "safetensors",
+        "torch>=2.0",
     )
     .pip_install("xformers", pre=True)
     .run_function(
@@ -133,20 +134,21 @@ def __enter__(self):
 
         torch.backends.cuda.matmul.allow_tf32 = True
 
-        scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
-            cache_path,
-            subfolder="scheduler",
-            solver_order=2,
-            prediction_type="epsilon",
-            thresholding=False,
-            algorithm_type="dpmsolver++",
-            solver_type="midpoint",
-            denoise_final=True,  # important if steps are <= 10
-        )
-        self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(
-            cache_path, scheduler=scheduler
-        ).to("cuda")
-        self.pipe.enable_xformers_memory_efficient_attention()
+        with torch.device("cuda"):
+            scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
+                cache_path,
+                subfolder="scheduler",
+                solver_order=2,
+                prediction_type="epsilon",
+                thresholding=False,
+                algorithm_type="dpmsolver++",
+                solver_type="midpoint",
+                denoise_final=True,  # important if steps are <= 10
+            )
+            self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(
+                cache_path, scheduler=scheduler
+            ).to("cuda")
+            self.pipe.enable_xformers_memory_efficient_attention()
 
     @stub.function(gpu="A10G")
     def run_inference(