diff --git a/README.md b/README.md
index dac3b3598aaf..4d1fd40ed0f2 100644
--- a/README.md
+++ b/README.md
@@ -71,10 +71,13 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from
```python
from diffusers import DiffusionPipeline
+from diffusers.utils.torch_utils import get_device
import torch
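+# get_device() returns the device-type string of the best available accelerator
+# (for example "cuda", "xpu", or "mps") and falls back to "cpu" when none is found.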
+device = get_device()
+
pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
-pipeline.to("cuda")
+pipeline.to(device)
pipeline("An image of a squirrel in Picasso style").images[0]
```
@@ -82,15 +85,18 @@ You can also dig into the models and schedulers toolbox to build your own diffus
```python
from diffusers import DDPMScheduler, UNet2DModel
+from diffusers.utils.torch_utils import get_device
from PIL import Image
import torch
+device = get_device()
+
scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
-model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
+model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to(device)
scheduler.set_timesteps(50)
sample_size = model.config.sample_size
-noise = torch.randn((1, 3, sample_size, sample_size), device="cuda")
+noise = torch.randn((1, 3, sample_size, sample_size), device=device)
input = noise
for t in scheduler.timesteps:
diff --git a/docs/source/en/api/models/autoencoderkl_cogvideox.md b/docs/source/en/api/models/autoencoderkl_cogvideox.md
index 2c5411a0647c..7a3ee76c91b8 100644
--- a/docs/source/en/api/models/autoencoderkl_cogvideox.md
+++ b/docs/source/en/api/models/autoencoderkl_cogvideox.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import AutoencoderKLCogVideoX
+from diffusers.utils.torch_utils import get_device
+import torch
-vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda")
+device = get_device()
+
+vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to(device)
```
## AutoencoderKLCogVideoX
diff --git a/docs/source/en/api/models/autoencoderkl_ltx_video.md b/docs/source/en/api/models/autoencoderkl_ltx_video.md
index 9c2384ca53a1..61edd75539a4 100644
--- a/docs/source/en/api/models/autoencoderkl_ltx_video.md
+++ b/docs/source/en/api/models/autoencoderkl_ltx_video.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import AutoencoderKLLTXVideo
+from diffusers.utils.torch_utils import get_device
+import torch
-vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to("cuda")
+device = get_device()
+
+vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to(device)
```
## AutoencoderKLLTXVideo
diff --git a/docs/source/en/api/models/autoencoderkl_mochi.md b/docs/source/en/api/models/autoencoderkl_mochi.md
index fef6645a18fa..25b8b64cfdc6 100644
--- a/docs/source/en/api/models/autoencoderkl_mochi.md
+++ b/docs/source/en/api/models/autoencoderkl_mochi.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import AutoencoderKLMochi
+from diffusers.utils.torch_utils import get_device
+import torch
-vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda")
+device = get_device()
+
+vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to(device)
```
## AutoencoderKLMochi
diff --git a/docs/source/en/api/models/cogvideox_transformer3d.md b/docs/source/en/api/models/cogvideox_transformer3d.md
index 5d50e5dca651..bb0e57788691 100644
--- a/docs/source/en/api/models/cogvideox_transformer3d.md
+++ b/docs/source/en/api/models/cogvideox_transformer3d.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import CogVideoXTransformer3DModel
+from diffusers.utils.torch_utils import get_device
+import torch
-transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
+device = get_device()
+
+transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to(device)
```
## CogVideoXTransformer3DModel
diff --git a/docs/source/en/api/models/cogview4_transformer2d.md b/docs/source/en/api/models/cogview4_transformer2d.md
index e87fbc680968..b2bd127ebb1f 100644
--- a/docs/source/en/api/models/cogview4_transformer2d.md
+++ b/docs/source/en/api/models/cogview4_transformer2d.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import CogView4Transformer2DModel
+from diffusers.utils.torch_utils import get_device
+import torch
-transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
+device = get_device()
+
+transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to(device)
```
## CogView4Transformer2DModel
diff --git a/docs/source/en/api/models/consisid_transformer3d.md b/docs/source/en/api/models/consisid_transformer3d.md
index 0531d475d2fb..ba2fc0bd3ece 100644
--- a/docs/source/en/api/models/consisid_transformer3d.md
+++ b/docs/source/en/api/models/consisid_transformer3d.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import ConsisIDTransformer3DModel
+from diffusers.utils.torch_utils import get_device
+import torch
-transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
+device = get_device()
+
+transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to(device)
```
## ConsisIDTransformer3DModel
diff --git a/docs/source/en/api/models/ltx_video_transformer3d.md b/docs/source/en/api/models/ltx_video_transformer3d.md
index 5a2a1af9d821..f5fb27e71640 100644
--- a/docs/source/en/api/models/ltx_video_transformer3d.md
+++ b/docs/source/en/api/models/ltx_video_transformer3d.md
@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.
```python
from diffusers import LTXVideoTransformer3DModel
+from diffusers.utils.torch_utils import get_device
+import torch
-transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
+device = get_device()
+
+transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to(device)
```
## LTXVideoTransformer3DModel
diff --git a/docs/source/en/api/pipelines/consistency_models.md b/docs/source/en/api/pipelines/consistency_models.md
index 4f7b2f0fb501..64ab252c0e13 100644
--- a/docs/source/en/api/pipelines/consistency_models.md
+++ b/docs/source/en/api/pipelines/consistency_models.md
@@ -29,8 +29,9 @@ For an additional speed-up, use `torch.compile` to generate multiple images in <
```diff
import torch
from diffusers import ConsistencyModelPipeline
+ from diffusers.utils.torch_utils import get_device
- device = "cuda"
+ device = get_device()
# Load the cd_bedroom256_lpips checkpoint.
model_id_or_path = "openai/diffusers-cd_bedroom256_lpips"
pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md
index 64341ca4b918..578a6eb7641b 100644
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -102,12 +102,15 @@ out.save("image.png")
import torch
from diffusers import FluxFillPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup.png")
mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup_mask.png")
repo_id = "black-forest-labs/FLUX.1-Fill-dev"
-pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to("cuda")
+pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to(device)
image = pipe(
prompt="a white paper cup",
@@ -131,8 +134,11 @@ import torch
from controlnet_aux import CannyDetector
from diffusers import FluxControlPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to("cuda")
+pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to(device)
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
@@ -159,8 +165,11 @@ import torch
from controlnet_aux import CannyDetector
from diffusers import FluxControlPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
+pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device)
pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
@@ -189,9 +198,12 @@ image.save("output.png")
import torch
from diffusers import FluxControlPipeline, FluxTransformer2DModel
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
from image_gen_aux import DepthPreprocessor
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to("cuda")
+device = get_device()
+
+pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to(device)
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
@@ -218,9 +230,12 @@ Depth Control is also possible with a LoRA variant of this condition. The usage
import torch
from diffusers import FluxControlPipeline, FluxTransformer2DModel
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
from image_gen_aux import DepthPreprocessor
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
+device = get_device()
+
+pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device)
pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora")
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
@@ -251,9 +266,10 @@ image.save("output.png")
import torch
from diffusers import FluxPriorReduxPipeline, FluxPipeline
from diffusers.utils import load_image
-device = "cuda"
-dtype = torch.bfloat16
+from diffusers.utils.torch_utils import get_device
+device = get_device()
+dtype = torch.bfloat16
repo_redux = "black-forest-labs/FLUX.1-Redux-dev"
repo_base = "black-forest-labs/FLUX.1-dev"
@@ -284,11 +300,14 @@ Flux Kontext is a model that allows in-context control of the image generation p
import torch
from diffusers import FluxKontextPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = FluxKontextPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
)
-pipe.to("cuda")
+pipe.to(device)
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png").convert("RGB")
prompt = "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors"
@@ -305,13 +324,16 @@ Flux Kontext comes with an integrity safety checker, which should be run after t
```python
from flux.content_filters import PixtralContentFilter
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
# ... pipeline invocation to generate images
-integrity_checker = PixtralContentFilter(torch.device("cuda"))
+integrity_checker = PixtralContentFilter(torch.device(device))
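+# rescale pixel values from [0, 255] to [-1, 1] before running the integrity check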
image_ = np.array(image) / 255.0
image_ = 2 * image_ - 1
-image_ = torch.from_numpy(image_).to("cuda", dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
+image_ = torch.from_numpy(image_).to(device, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
if integrity_checker.test_image(image_):
raise ValueError("Your image has been flagged. Choose another prompt/image or try again.")
```
@@ -371,10 +393,13 @@ An IP-Adapter lets you prompt Flux with images, in addition to the text prompt.
import torch
from diffusers import FluxPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
-).to("cuda")
+).to(device)
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flux_ip_adapter_input.jpg").resize((1024, 1024))
@@ -411,7 +436,7 @@ Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeli
[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level.
-On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
+On accelerator devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
> [!TIP]
> It is possible to mix block and leaf-level offloading for different components in a pipeline.
@@ -420,6 +445,9 @@ On CUDA devices that support asynchronous data streaming, set `use_stream=True`
import torch
from diffusers import FluxPipeline
from diffusers.hooks import apply_group_offloading
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
model_id = "black-forest-labs/FLUX.1-dev"
dtype = torch.bfloat16
@@ -432,27 +460,27 @@ apply_group_offloading(
pipe.transformer,
offload_type="leaf_level",
offload_device=torch.device("cpu"),
- onload_device=torch.device("cuda"),
+ onload_device=torch.device(device),
use_stream=True,
)
apply_group_offloading(
pipe.text_encoder,
offload_device=torch.device("cpu"),
- onload_device=torch.device("cuda"),
+ onload_device=torch.device(device),
offload_type="leaf_level",
use_stream=True,
)
apply_group_offloading(
pipe.text_encoder_2,
offload_device=torch.device("cpu"),
- onload_device=torch.device("cuda"),
+ onload_device=torch.device(device),
offload_type="leaf_level",
use_stream=True,
)
apply_group_offloading(
pipe.vae,
offload_device=torch.device("cpu"),
- onload_device=torch.device("cuda"),
+ onload_device=torch.device(device),
offload_type="leaf_level",
use_stream=True,
)
diff --git a/docs/source/en/api/pipelines/hunyuandit.md b/docs/source/en/api/pipelines/hunyuandit.md
index 07e869ba95ae..d7d86ce81897 100644
--- a/docs/source/en/api/pipelines/hunyuandit.md
+++ b/docs/source/en/api/pipelines/hunyuandit.md
@@ -52,11 +52,14 @@ First, load the pipeline:
```python
from diffusers import HunyuanDiTPipeline
+from diffusers.utils.torch_utils import get_device
import torch
+device = get_device()
+
pipeline = HunyuanDiTPipeline.from_pretrained(
"Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
-).to("cuda")
+).to(device)
```
-Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`:
+Then change the memory layout of the pipeline's `transformer` and `vae` components to `torch.channels_last`:
diff --git a/docs/source/en/api/pipelines/kolors.md b/docs/source/en/api/pipelines/kolors.md
index 048f6c1de980..5a01f332cd20 100644
--- a/docs/source/en/api/pipelines/kolors.md
+++ b/docs/source/en/api/pipelines/kolors.md
@@ -31,9 +31,12 @@ The abstract from the technical report is:
import torch
from diffusers import DPMSolverMultistepScheduler, KolorsPipeline
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = KolorsPipeline.from_pretrained("Kwai-Kolors/Kolors-diffusers", torch_dtype=torch.float16, variant="fp16")
-pipe.to("cuda")
+pipe.to(device)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
image = pipe(
diff --git a/docs/source/en/api/pipelines/latte.md b/docs/source/en/api/pipelines/latte.md
index 9d4d12dd4e02..a2e11a0a1d53 100644
--- a/docs/source/en/api/pipelines/latte.md
+++ b/docs/source/en/api/pipelines/latte.md
@@ -41,10 +41,13 @@ First, load the pipeline:
```python
import torch
from diffusers import LattePipeline
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipeline = LattePipeline.from_pretrained(
"maxin-cn/Latte-1", torch_dtype=torch.float16
-).to("cuda")
+).to(device)
```
-Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`:
+Then change the memory layout of the pipeline's `transformer` and `vae` components to `torch.channels_last`:
diff --git a/docs/source/en/api/pipelines/mochi.md b/docs/source/en/api/pipelines/mochi.md
index f1260b07b077..897f70ad5ea7 100644
--- a/docs/source/en/api/pipelines/mochi.md
+++ b/docs/source/en/api/pipelines/mochi.md
@@ -81,6 +81,9 @@ The following example will download the full precision `mochi-1-preview` weights
import torch
from diffusers import MochiPipeline
from diffusers.utils import export_to_video
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview")
@@ -90,7 +93,7 @@ pipe.enable_vae_tiling()
prompt = "Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k."
-with torch.autocast("cuda", torch.bfloat16, cache_enabled=False):
+with torch.autocast(device, torch.bfloat16, cache_enabled=False):
frames = pipe(prompt, num_frames=85).frames[0]
export_to_video(frames, "mochi.mp4", fps=30)
@@ -137,8 +140,11 @@ from torch.nn.attention import SDPBackend, sdpa_kernel
from diffusers import MochiPipeline
from diffusers.utils import export_to_video
+from diffusers.utils.torch_utils import get_device
from diffusers.video_processor import VideoProcessor
+device = get_device()
+
pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", force_zeros_for_empty_prompt=True)
pipe.enable_vae_tiling()
pipe.enable_model_cpu_offload()
@@ -150,7 +156,7 @@ with torch.no_grad():
pipe.encode_prompt(prompt=prompt)
)
-with torch.autocast("cuda", torch.bfloat16):
+with torch.autocast(device, torch.bfloat16):
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
frames = pipe(
prompt_embeds=prompt_embeds,
@@ -162,7 +168,7 @@ with torch.autocast("cuda", torch.bfloat16):
height=480,
width=848,
num_frames=163,
- generator=torch.Generator("cuda").manual_seed(0),
+ generator=torch.Generator(device).manual_seed(0),
output_type="latent",
return_dict=False,
)[0]
@@ -188,14 +194,17 @@ video = video_processor.postprocess_video(video)[0]
export_to_video(video, "mochi.mp4", fps=30)
```
-## Running inference with multiple GPUs
+## Running inference with multiple accelerators
-It is possible to split the large Mochi transformer across multiple GPUs using the `device_map` and `max_memory` options in `from_pretrained`. In the following example we split the model across two GPUs, each with 24GB of VRAM.
+It is possible to split the large Mochi transformer across multiple accelerators using the `device_map` and `max_memory` options in `from_pretrained`. In the following example we split the model across two accelerators, each with 24GB of VRAM.
```python
import torch
from diffusers import MochiPipeline, MochiTransformer3DModel
from diffusers.utils import export_to_video
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
model_id = "genmo/mochi-1-preview"
transformer = MochiTransformer3DModel.from_pretrained(
@@ -209,7 +218,7 @@ pipe = MochiPipeline.from_pretrained(model_id, transformer=transformer)
pipe.enable_model_cpu_offload()
pipe.enable_vae_tiling()
-with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False):
+with torch.autocast(device_type=device, dtype=torch.bfloat16, cache_enabled=False):
frames = pipe(
prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.",
negative_prompt="",
@@ -219,7 +228,7 @@ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=Fals
num_inference_steps=50,
guidance_scale=4.5,
num_videos_per_prompt=1,
- generator=torch.Generator(device="cuda").manual_seed(0),
+ generator=torch.Generator(device=device).manual_seed(0),
max_sequence_length=256,
output_type="pil",
).frames[0]
@@ -239,6 +248,9 @@ Diffusers currently doesn't support using the FP8 scaled versions of the Mochi s
import torch
from diffusers import MochiPipeline, MochiTransformer3DModel
from diffusers.utils import export_to_video
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
model_id = "genmo/mochi-1-preview"
@@ -250,7 +262,7 @@ pipe = MochiPipeline.from_pretrained(model_id, transformer=transformer)
pipe.enable_model_cpu_offload()
pipe.enable_vae_tiling()
-with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False):
+with torch.autocast(device_type=device, dtype=torch.bfloat16, cache_enabled=False):
frames = pipe(
prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.",
negative_prompt="",
@@ -260,7 +272,7 @@ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=Fals
num_inference_steps=50,
guidance_scale=4.5,
num_videos_per_prompt=1,
- generator=torch.Generator(device="cuda").manual_seed(0),
+ generator=torch.Generator(device=device).manual_seed(0),
max_sequence_length=256,
output_type="pil",
).frames[0]
diff --git a/docs/source/en/api/pipelines/omnigen.md b/docs/source/en/api/pipelines/omnigen.md
index 074e7b8f0115..8ca32866a422 100644
--- a/docs/source/en/api/pipelines/omnigen.md
+++ b/docs/source/en/api/pipelines/omnigen.md
@@ -36,9 +36,12 @@ First, load the pipeline:
```python
import torch
from diffusers import OmniGenPipeline
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = OmniGenPipeline.from_pretrained("Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16)
-pipe.to("cuda")
+pipe.to(device)
```
For text-to-image, pass a text prompt. By default, OmniGen generates a 1024x1024 image.
diff --git a/docs/source/en/api/pipelines/pixart_sigma.md b/docs/source/en/api/pipelines/pixart_sigma.md
index dded4ea2d771..ec513189b243 100644
--- a/docs/source/en/api/pipelines/pixart_sigma.md
+++ b/docs/source/en/api/pipelines/pixart_sigma.md
@@ -86,10 +86,13 @@ Since text embeddings have been computed, remove the `text_encoder` and `pipe` f
```python
import gc
+from diffusers.utils.torch_utils import get_device
def flush():
gc.collect()
- torch.cuda.empty_cache()
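+    # look up the torch backend module that matches the detected device
+    # (torch.cuda, torch.xpu, ...) and clear that accelerator's memory cache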
+    device = get_device()
+ device_module = getattr(torch, device, torch.cuda)
+ device_module.empty_cache()
del text_encoder
del pipe
@@ -99,11 +102,13 @@ flush()
Then compute the latents with the prompt embeddings as inputs:
```python
+device = get_device()
+
pipe = PixArtSigmaPipeline.from_pretrained(
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
text_encoder=None,
torch_dtype=torch.float16,
-).to("cuda")
+).to(device)
latents = pipe(
negative_prompt=None,
diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md
index afdb3de5f447..3b7ac428450c 100644
--- a/docs/source/en/api/pipelines/qwenimage.md
+++ b/docs/source/en/api/pipelines/qwenimage.md
@@ -39,9 +39,12 @@ number of steps. Refer to the code snippet below:
```py
from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+from diffusers.utils.torch_utils import get_device
import torch
import math
+device = get_device()
+
ckpt_id = "Qwen/Qwen-Image"
# From
@@ -65,7 +68,7 @@ scheduler_config = {
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
pipe = DiffusionPipeline.from_pretrained(
ckpt_id, scheduler=scheduler, torch_dtype=torch.bfloat16
-).to("cuda")
+).to(device)
pipe.load_lora_weights(
"lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors"
)
diff --git a/docs/source/en/api/pipelines/stable_unclip.md b/docs/source/en/api/pipelines/stable_unclip.md
index 5abb6028c4cb..4fcc97f26c5a 100644
--- a/docs/source/en/api/pipelines/stable_unclip.md
+++ b/docs/source/en/api/pipelines/stable_unclip.md
@@ -35,8 +35,11 @@ Stable unCLIP can be leveraged for text-to-image generation by pipelining it wit
import torch
from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline
from diffusers.models import PriorTransformer
+from diffusers.utils.torch_utils import get_device
from transformers import CLIPTokenizer, CLIPTextModelWithProjection
+device = get_device()
+
prior_model_id = "kakaobrain/karlo-v1-alpha"
data_type = torch.float16
prior = PriorTransformer.from_pretrained(prior_model_id, subfolder="prior", torch_dtype=data_type)
@@ -59,7 +62,7 @@ pipe = StableUnCLIPPipeline.from_pretrained(
prior_scheduler=prior_scheduler,
)
-pipe = pipe.to("cuda")
+pipe = pipe.to(device)
wave_prompt = "dramatic wave, the Oceans roar, Strong wave spiral across the oceans as the waves unfurl into roaring crests; perfect wave form; perfect wave shape; dramatic wave shape; wave shape unbelievable; wave; wave shape spectacular"
image = pipe(prompt=wave_prompt).images[0]
@@ -76,12 +79,15 @@ For text-to-image we use `stabilityai/stable-diffusion-2-1-unclip-small` as it w
```python
from diffusers import StableUnCLIPImg2ImgPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
import torch
+device = get_device()
+
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16"
)
-pipe = pipe.to("cuda")
+pipe = pipe.to(device)
url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/tarsila_do_amaral.png"
init_image = load_image(url)
diff --git a/docs/source/en/api/pipelines/text_to_video_zero.md b/docs/source/en/api/pipelines/text_to_video_zero.md
index 5fe3789d8287..6169f62bc350 100644
--- a/docs/source/en/api/pipelines/text_to_video_zero.md
+++ b/docs/source/en/api/pipelines/text_to_video_zero.md
@@ -47,10 +47,13 @@ To generate a video from prompt, run the following Python code:
```python
import torch
from diffusers import TextToVideoZeroPipeline
+from diffusers.utils.torch_utils import get_device
import imageio
+device = get_device()
+
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
prompt = "A panda is playing guitar on times square"
result = pipe(prompt=prompt).images
@@ -69,10 +72,13 @@ We can also generate longer videos by doing the processing in a chunk-by-chunk m
```python
import torch
from diffusers import TextToVideoZeroPipeline
+from diffusers.utils.torch_utils import get_device
import numpy as np
+device = get_device()
+
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
seed = 0
video_length = 24 #24 ÷ 4fps = 6 seconds
chunk_size = 8
@@ -81,7 +87,7 @@ prompt = "A panda is playing guitar on times square"
# Generate the video chunk-by-chunk
result = []
chunk_ids = np.arange(0, video_length, chunk_size - 1)
-generator = torch.Generator(device="cuda")
+generator = torch.Generator(device=device)
for i in range(len(chunk_ids)):
print(f"Processing chunk {i + 1} / {len(chunk_ids)}")
ch_start = chunk_ids[i]
@@ -106,11 +112,14 @@ In order to use the SDXL model when generating a video from prompt, use the `Tex
```python
import torch
from diffusers import TextToVideoZeroSDXLPipeline
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
+).to(device)
```
### Text-To-Video with Pose Control
@@ -144,19 +153,22 @@ To generate a video from prompt with additional pose control
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
+ from diffusers.utils.torch_utils import get_device
+
+ device = get_device()
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
model_id, controlnet=controlnet, torch_dtype=torch.float16
- ).to("cuda")
+ ).to(device)
# Set the attention processor
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
# fix latents for all frames
- latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
+ latents = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
prompt = "Darth Vader dancing in a desert"
result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
@@ -169,6 +181,9 @@ To generate a video from prompt with additional pose control
import torch
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
+ from diffusers.utils.torch_utils import get_device
+
+ device = get_device()
controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0'
model_id = 'stabilityai/stable-diffusion-xl-base-1.0'
@@ -176,14 +191,14 @@ To generate a video from prompt with additional pose control
controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
model_id, controlnet=controlnet, torch_dtype=torch.float16
- ).to('cuda')
+ ).to(device)
# Set the attention processor
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
# fix latents for all frames
- latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
+ latents = torch.randn((1, 4, 128, 128), device=device, dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
prompt = "Darth Vader dancing in a desert"
result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
@@ -224,9 +239,12 @@ To perform text-guided video editing (with [InstructPix2Pix](pix2pix)):
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
+ from diffusers.utils.torch_utils import get_device
+
+ device = get_device()
model_id = "timbrooks/instruct-pix2pix"
- pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+ pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=3))
prompt = "make it Van Gogh Starry Night style"
@@ -267,20 +285,23 @@ can run with custom [DreamBooth](../../training/dreambooth) models, as shown bel
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
+ from diffusers.utils.torch_utils import get_device
+
+ device = get_device()
# set model id to custom model
model_id = "PAIR/text2video-zero-controlnet-canny-avatar"
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
model_id, controlnet=controlnet, torch_dtype=torch.float16
- ).to("cuda")
+ ).to(device)
# Set the attention processor
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
# fix latents for all frames
- latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1)
+ latents = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1)
prompt = "oil painting of a beautiful girl avatar style"
result = pipe(prompt=[prompt] * len(canny_edges), image=canny_edges, latents=latents).images
diff --git a/docs/source/en/api/pipelines/unidiffuser.md b/docs/source/en/api/pipelines/unidiffuser.md
index 7d767f2db530..738b4e166f71 100644
--- a/docs/source/en/api/pipelines/unidiffuser.md
+++ b/docs/source/en/api/pipelines/unidiffuser.md
@@ -47,8 +47,9 @@ Unconditional generation (where we start from only latents sampled from a standa
import torch
from diffusers import UniDiffuserPipeline
+from diffusers.utils.torch_utils import get_device
-device = "cuda"
+device = get_device()
model_id_or_path = "thu-ml/unidiffuser-v1"
pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe.to(device)
@@ -96,8 +97,9 @@ Here is an example of sampling from the conditional image distribution (text-to-
import torch
from diffusers import UniDiffuserPipeline
+from diffusers.utils.torch_utils import get_device
-device = "cuda"
+device = get_device()
model_id_or_path = "thu-ml/unidiffuser-v1"
pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe.to(device)
@@ -121,8 +123,9 @@ import torch
from diffusers import UniDiffuserPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
-device = "cuda"
+device = get_device()
model_id_or_path = "thu-ml/unidiffuser-v1"
pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe.to(device)
@@ -148,8 +151,9 @@ import torch
from diffusers import UniDiffuserPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
-device = "cuda"
+device = get_device()
model_id_or_path = "thu-ml/unidiffuser-v1"
pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe.to(device)
@@ -177,8 +181,9 @@ Similarly, text variation can be performed on an input prompt with a text-to-ima
import torch
from diffusers import UniDiffuserPipeline
+from diffusers.utils.torch_utils import get_device
-device = "cuda"
+device = get_device()
model_id_or_path = "thu-ml/unidiffuser-v1"
pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe.to(device)
diff --git a/docs/source/en/api/pipelines/visualcloze.md b/docs/source/en/api/pipelines/visualcloze.md
index 1a4f96a50d63..a3c55b193ea4 100644
--- a/docs/source/en/api/pipelines/visualcloze.md
+++ b/docs/source/en/api/pipelines/visualcloze.md
@@ -68,9 +68,12 @@ For comprehensive examples covering a wide range of tasks, please refer to the [
import torch
from diffusers import VisualClozePipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
+pipe.to(device)
# Load in-context images (make sure the paths are correct and accessible)
image_paths = [
@@ -120,9 +123,12 @@ image_result.save("visualcloze.png")
import torch
from diffusers import VisualClozePipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
+pipe.to(device)
# Load in-context images (make sure the paths are correct and accessible)
image_paths = [
@@ -170,9 +176,12 @@ image_result.save("visualcloze.png")
import torch
from diffusers import VisualClozePipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
+pipe.to(device)
# Load in-context images (make sure the paths are correct and accessible)
image_paths = [
@@ -226,12 +235,15 @@ image_result.save("visualcloze.png")
import torch
from diffusers import VisualClozeGenerationPipeline, FluxFillPipeline as VisualClozeUpsamplingPipeline
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
from PIL import Image
+device = get_device()
+
pipe = VisualClozeGenerationPipeline.from_pretrained(
"VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16
)
-pipe.to("cuda")
+pipe.to(device)
image_paths = [
# in-context examples
@@ -267,7 +279,7 @@ image = pipe(
# Stage 2 (optional): Upsample the generated image
pipe_upsample = VisualClozeUpsamplingPipeline.from_pipe(pipe)
-pipe_upsample.to("cuda")
+pipe_upsample.to(device)
mask_image = Image.new("RGB", image.size, (255, 255, 255))
diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md
index e46aa55ad82a..ba912d55ab08 100644
--- a/docs/source/en/api/pipelines/wan.md
+++ b/docs/source/en/api/pipelines/wan.md
@@ -63,14 +63,17 @@ from diffusers import AutoModel, WanPipeline
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers.hooks.group_offloading import apply_group_offloading
from diffusers.utils import export_to_video, load_image
+from diffusers.utils.torch_utils import get_device
from transformers import UMT5EncoderModel
+device = get_device()
+
text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
# group-offloading
-onload_device = torch.device("cuda")
+onload_device = torch.device(device)
offload_device = torch.device("cpu")
apply_group_offloading(text_encoder,
onload_device=onload_device,
@@ -92,7 +95,7 @@ pipeline = WanPipeline.from_pretrained(
text_encoder=text_encoder,
torch_dtype=torch.bfloat16
)
-pipeline.to("cuda")
+pipeline.to(device)
prompt = """
The camera rushes from far to near in a low-angle shot,
@@ -128,8 +131,11 @@ import numpy as np
from diffusers import AutoModel, WanPipeline
from diffusers.hooks.group_offloading import apply_group_offloading
from diffusers.utils import export_to_video, load_image
+from diffusers.utils.torch_utils import get_device
from transformers import UMT5EncoderModel
+device = get_device()
+
text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
@@ -141,7 +147,7 @@ pipeline = WanPipeline.from_pretrained(
text_encoder=text_encoder,
torch_dtype=torch.bfloat16
)
-pipeline.to("cuda")
+pipeline.to(device)
# torch.compile
pipeline.transformer.to(memory_format=torch.channels_last)
@@ -187,8 +193,10 @@ import torch
import torchvision.transforms.functional as TF
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
+from diffusers.utils.torch_utils import get_device
from transformers import CLIPVisionModel
+device = get_device()
model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
@@ -196,7 +204,7 @@ vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=to
pipe = WanImageToVideoPipeline.from_pretrained(
model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
)
-pipe.to("cuda")
+pipe.to(device)
first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
@@ -262,6 +270,9 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
+ from diffusers.utils.torch_utils import get_device
+
+ device = get_device()
vae = AutoModel.from_pretrained(
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
@@ -272,7 +283,7 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
pipeline.scheduler = UniPCMultistepScheduler.from_config(
pipeline.scheduler.config, flow_shift=5.0
)
- pipeline.to("cuda")
+ pipeline.to(device)
pipeline.load_lora_weights("benjamin-paine/steamboat-willie-1.3b", adapter_name="steamboat-willie")
pipeline.set_adapters("steamboat-willie")
@@ -359,4 +370,4 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
## WanPipelineOutput
-[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
\ No newline at end of file
+[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md
index d322d76be267..564c05131f5d 100644
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -31,6 +31,9 @@ Now you can quantize a model by passing the `QuantoConfig` object to the `from_p
```python
import torch
-from diffusers import FluxTransformer2DModel, QuantoConfig
+from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
model_id = "black-forest-labs/FLUX.1-dev"
quantization_config = QuantoConfig(weights_dtype="float8")
@@ -42,7 +45,7 @@ transformer = FluxTransformer2DModel.from_pretrained(
)
pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype)
-pipe.to("cuda")
+pipe.to(device)
prompt = "A cat holding a sign that says hello world"
image = pipe(
@@ -117,6 +120,9 @@ Currently the Quanto backend supports `torch.compile` for the following quantiza
```python
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
model_id = "black-forest-labs/FLUX.1-dev"
quantization_config = QuantoConfig(weights_dtype="int8")
@@ -131,7 +137,7 @@ transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
pipe = FluxPipeline.from_pretrained(
model_id, transformer=transformer, torch_dtype=torch_dtype
)
-pipe.to("cuda")
+pipe.to(device)
images = pipe("A cat holding a sign that says hello").images[0]
images.save("flux-quanto-compile.png")
```
diff --git a/docs/source/en/using-diffusers/controlnet.md b/docs/source/en/using-diffusers/controlnet.md
index 4aa5abd04f3b..f376b56a48d2 100644
--- a/docs/source/en/using-diffusers/controlnet.md
+++ b/docs/source/en/using-diffusers/controlnet.md
@@ -52,14 +52,17 @@ Pass the canny image to the pipeline. Use the `controlnet_conditioning_scale` pa
```py
import torch
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
from diffusers import FluxControlNetPipeline, FluxControlNetModel
+device = get_device()
+
controlnet = FluxControlNetModel.from_pretrained(
"InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16
)
pipeline = FluxControlNetPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
-).to("cuda")
+).to(device)
prompt = """
A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita.
@@ -103,14 +106,16 @@ from PIL import Image
from transformers import DPTImageProcessor, DPTForDepthEstimation
from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
from diffusers.utils import load_image
+from diffusers.utils.torch_utils import get_device
+device = get_device()
-depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
+depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
def get_depth_map(image):
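+    # run DPT depth estimation on the active device and resize the predicted depth map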
- image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
- with torch.no_grad(), torch.autocast("cuda"):
+ image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
+ with torch.no_grad(), torch.autocast(device):
depth_map = depth_estimator(image).predicted_depth
depth_map = torch.nn.functional.interpolate(
@@ -143,7 +148,7 @@ pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
controlnet=controlnet,
vae=vae,
torch_dtype=torch.float16,
-).to("cuda")
+).to(device)
prompt = """
A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita.
@@ -260,6 +265,9 @@ Pass the ControlNets as a list to the pipeline and resize the images to the expe
```py
import torch
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
controlnets = [
ControlNetModel.from_pretrained(
@@ -273,7 +281,7 @@ controlnets = [
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16
-).to("cuda")
+).to(device)
prompt = """
a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby,
@@ -316,6 +324,9 @@ pipeline(
import torch
-from diffusers.utils import load_iamge
+from diffusers.utils import load_image
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
+from diffusers.utils.torch_utils import get_device
+
+device = get_device()
controlnet = ControlNetModel.from_pretrained(
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
@@ -324,7 +335,7 @@ pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
controlnet=controlnet,
torch_dtype=torch.float16
-).to("cuda")
+).to(device)
canny_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png")
pipeline(
@@ -343,4 +354,4 @@ pipeline(