diff --git a/README.md b/README.md index dac3b3598aaf..4d1fd40ed0f2 100644 --- a/README.md +++ b/README.md @@ -71,10 +71,13 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from ```python from diffusers import DiffusionPipeline +from diffusers.utils.torch_utils import get_device import torch +device = get_device() + pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16) -pipeline.to("cuda") +pipeline.to(device) pipeline("An image of a squirrel in Picasso style").images[0] ``` @@ -82,15 +85,18 @@ You can also dig into the models and schedulers toolbox to build your own diffus ```python from diffusers import DDPMScheduler, UNet2DModel +from diffusers.utils.torch_utils import get_device from PIL import Image import torch +device = get_device() + scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256") -model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda") +model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to(device) scheduler.set_timesteps(50) sample_size = model.config.sample_size -noise = torch.randn((1, 3, sample_size, sample_size), device="cuda") +noise = torch.randn((1, 3, sample_size, sample_size), device=device) input = noise for t in scheduler.timesteps: diff --git a/docs/source/en/api/models/autoencoderkl_cogvideox.md b/docs/source/en/api/models/autoencoderkl_cogvideox.md index 2c5411a0647c..7a3ee76c91b8 100644 --- a/docs/source/en/api/models/autoencoderkl_cogvideox.md +++ b/docs/source/en/api/models/autoencoderkl_cogvideox.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import AutoencoderKLCogVideoX +from diffusers.utils.torch_utils import get_device -vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda") +device = get_device() + +vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to(device) ``` ## AutoencoderKLCogVideoX diff --git a/docs/source/en/api/models/autoencoderkl_ltx_video.md b/docs/source/en/api/models/autoencoderkl_ltx_video.md index 9c2384ca53a1..61edd75539a4 100644 --- a/docs/source/en/api/models/autoencoderkl_ltx_video.md +++ b/docs/source/en/api/models/autoencoderkl_ltx_video.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import AutoencoderKLLTXVideo +from diffusers.utils.torch_utils import get_device -vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to("cuda") +device = get_device() + +vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to(device) ``` ## AutoencoderKLLTXVideo diff --git a/docs/source/en/api/models/autoencoderkl_mochi.md b/docs/source/en/api/models/autoencoderkl_mochi.md index fef6645a18fa..25b8b64cfdc6 100644 --- a/docs/source/en/api/models/autoencoderkl_mochi.md +++ b/docs/source/en/api/models/autoencoderkl_mochi.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. 
```python from diffusers import AutoencoderKLMochi +from diffusers.utils.torch_utils import get_device -vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda") +device = get_device() + +vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to(device) ``` ## AutoencoderKLMochi diff --git a/docs/source/en/api/models/cogvideox_transformer3d.md b/docs/source/en/api/models/cogvideox_transformer3d.md index 5d50e5dca651..bb0e57788691 100644 --- a/docs/source/en/api/models/cogvideox_transformer3d.md +++ b/docs/source/en/api/models/cogvideox_transformer3d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import CogVideoXTransformer3DModel +from diffusers.utils.torch_utils import get_device -transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda") +device = get_device() + +transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to(device) ``` ## CogVideoXTransformer3DModel diff --git a/docs/source/en/api/models/cogview4_transformer2d.md b/docs/source/en/api/models/cogview4_transformer2d.md index e87fbc680968..b2bd127ebb1f 100644 --- a/docs/source/en/api/models/cogview4_transformer2d.md +++ b/docs/source/en/api/models/cogview4_transformer2d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import CogView4Transformer2DModel +from diffusers.utils.torch_utils import get_device -transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to(device) ``` ## CogView4Transformer2DModel diff --git a/docs/source/en/api/models/consisid_transformer3d.md b/docs/source/en/api/models/consisid_transformer3d.md index 0531d475d2fb..ba2fc0bd3ece 100644 --- a/docs/source/en/api/models/consisid_transformer3d.md +++ b/docs/source/en/api/models/consisid_transformer3d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import ConsisIDTransformer3DModel +from diffusers.utils.torch_utils import get_device -transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to(device) ``` ## ConsisIDTransformer3DModel diff --git a/docs/source/en/api/models/ltx_video_transformer3d.md b/docs/source/en/api/models/ltx_video_transformer3d.md index 5a2a1af9d821..f5fb27e71640 100644 --- a/docs/source/en/api/models/ltx_video_transformer3d.md +++ b/docs/source/en/api/models/ltx_video_transformer3d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. 
```python from diffusers import LTXVideoTransformer3DModel +from diffusers.utils.torch_utils import get_device -transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to(device) ``` ## LTXVideoTransformer3DModel diff --git a/docs/source/en/api/pipelines/consistency_models.md b/docs/source/en/api/pipelines/consistency_models.md index 4f7b2f0fb501..64ab252c0e13 100644 --- a/docs/source/en/api/pipelines/consistency_models.md +++ b/docs/source/en/api/pipelines/consistency_models.md @@ -29,8 +29,9 @@ For an additional speed-up, use `torch.compile` to generate multiple images in < ```diff import torch from diffusers import ConsistencyModelPipeline + from diffusers.utils.torch_utils import get_device - device = "cuda" + device = get_device() # Load the cd_bedroom256_lpips checkpoint. model_id_or_path = "openai/diffusers-cd_bedroom256_lpips" pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md index 64341ca4b918..578a6eb7641b 100644 --- a/docs/source/en/api/pipelines/flux.md +++ b/docs/source/en/api/pipelines/flux.md @@ -102,12 +102,15 @@ out.save("image.png") import torch from diffusers import FluxFillPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup.png") mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup_mask.png") repo_id = "black-forest-labs/FLUX.1-Fill-dev" -pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to("cuda") +pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to(device) image = pipe( prompt="a white paper cup", @@ -131,8 +134,11 @@ import torch from controlnet_aux import CannyDetector from diffusers import FluxControlPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to("cuda") +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to(device) prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png") @@ -159,8 +165,11 @@ import torch from controlnet_aux import CannyDetector from diffusers import FluxControlPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda") +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device) pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora") prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." 
@@ -189,9 +198,12 @@ image.save("output.png") import torch from diffusers import FluxControlPipeline, FluxTransformer2DModel from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from image_gen_aux import DepthPreprocessor -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to(device) prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png") @@ -218,9 +230,12 @@ Depth Control is also possible with a LoRA variant of this condition. The usage import torch from diffusers import FluxControlPipeline, FluxTransformer2DModel from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from image_gen_aux import DepthPreprocessor -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device) pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora") prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." @@ -251,9 +266,10 @@ image.save("output.png") import torch from diffusers import FluxPriorReduxPipeline, FluxPipeline from diffusers.utils import load_image -device = "cuda" -dtype = torch.bfloat16 +from diffusers.utils.torch_utils import get_device +device = get_device() +dtype = torch.bfloat16 repo_redux = "black-forest-labs/FLUX.1-Redux-dev" repo_base = "black-forest-labs/FLUX.1-dev" @@ -284,11 +300,14 @@ Flux Kontext is a model that allows in-context control of the image generation p import torch from diffusers import FluxKontextPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = FluxKontextPipeline.from_pretrained( "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png").convert("RGB") prompt = "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors" @@ -305,13 +324,16 @@ Flux Kontext comes with an integrity safety checker, which should be run after t ```python from flux.content_filters import PixtralContentFilter +from diffusers.utils.torch_utils import get_device + +device = get_device() # ... pipeline invocation to generate images -integrity_checker = PixtralContentFilter(torch.device("cuda")) +integrity_checker = PixtralContentFilter(torch.device(device)) image_ = np.array(image) / 255.0 image_ = 2 * image_ - 1 -image_ = torch.from_numpy(image_).to("cuda", dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2) +image_ = torch.from_numpy(image_).to(device, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2) if integrity_checker.test_image(image_): raise ValueError("Your image has been flagged. Choose another prompt/image or try again.") ``` @@ -371,10 +393,13 @@ An IP-Adapter lets you prompt Flux with images, in addition to the text prompt. 
import torch from diffusers import FluxPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = FluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flux_ip_adapter_input.jpg").resize((1024, 1024)) @@ -411,7 +436,7 @@ Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeli [Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level. -On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference. +On accelerator devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference. > [!TIP] > It is possible to mix block and leaf-level offloading for different components in a pipeline. @@ -420,6 +445,9 @@ On CUDA devices that support asynchronous data streaming, set `use_stream=True` import torch from diffusers import FluxPipeline from diffusers.hooks import apply_group_offloading +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "black-forest-labs/FLUX.1-dev" dtype = torch.bfloat16 @@ -432,27 +460,27 @@ apply_group_offloading( pipe.transformer, offload_type="leaf_level", offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), use_stream=True, ) apply_group_offloading( pipe.text_encoder, offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), offload_type="leaf_level", use_stream=True, ) apply_group_offloading( pipe.text_encoder_2, offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), offload_type="leaf_level", use_stream=True, ) apply_group_offloading( pipe.vae, offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), offload_type="leaf_level", use_stream=True, ) diff --git a/docs/source/en/api/pipelines/hunyuandit.md b/docs/source/en/api/pipelines/hunyuandit.md index 07e869ba95ae..d7d86ce81897 100644 --- a/docs/source/en/api/pipelines/hunyuandit.md +++ b/docs/source/en/api/pipelines/hunyuandit.md @@ -52,11 +52,14 @@ First, load the pipeline: ```python from diffusers import HunyuanDiTPipeline +from diffusers.utils.torch_utils import get_device import torch +device = get_device() + pipeline = HunyuanDiTPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16 -).to("cuda") +).to(device) ``` Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`: diff --git a/docs/source/en/api/pipelines/kolors.md b/docs/source/en/api/pipelines/kolors.md index 048f6c1de980..5a01f332cd20 100644 --- a/docs/source/en/api/pipelines/kolors.md +++ b/docs/source/en/api/pipelines/kolors.md @@ -31,9 +31,12 @@ The abstract from the technical report is: import 
torch from diffusers import DPMSolverMultistepScheduler, KolorsPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = KolorsPipeline.from_pretrained("Kwai-Kolors/Kolors-diffusers", torch_dtype=torch.float16, variant="fp16") -pipe.to("cuda") +pipe.to(device) pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True) image = pipe( diff --git a/docs/source/en/api/pipelines/latte.md b/docs/source/en/api/pipelines/latte.md index 9d4d12dd4e02..a2e11a0a1d53 100644 --- a/docs/source/en/api/pipelines/latte.md +++ b/docs/source/en/api/pipelines/latte.md @@ -41,10 +41,13 @@ First, load the pipeline: ```python import torch from diffusers import LattePipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = LattePipeline.from_pretrained( "maxin-cn/Latte-1", torch_dtype=torch.float16 -).to("cuda") +).to(device) ``` Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`: diff --git a/docs/source/en/api/pipelines/mochi.md b/docs/source/en/api/pipelines/mochi.md index f1260b07b077..897f70ad5ea7 100644 --- a/docs/source/en/api/pipelines/mochi.md +++ b/docs/source/en/api/pipelines/mochi.md @@ -81,6 +81,9 @@ The following example will download the full precision `mochi-1-preview` weights import torch from diffusers import MochiPipeline from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview") @@ -90,7 +93,7 @@ pipe.enable_vae_tiling() prompt = "Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k." -with torch.autocast("cuda", torch.bfloat16, cache_enabled=False): +with torch.autocast(device, torch.bfloat16, cache_enabled=False): frames = pipe(prompt, num_frames=85).frames[0] export_to_video(frames, "mochi.mp4", fps=30) @@ -137,8 +140,11 @@ from torch.nn.attention import SDPBackend, sdpa_kernel from diffusers import MochiPipeline from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device from diffusers.video_processor import VideoProcessor +device = get_device() + pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", force_zeros_for_empty_prompt=True) pipe.enable_vae_tiling() pipe.enable_model_cpu_offload() @@ -150,7 +156,7 @@ with torch.no_grad(): pipe.encode_prompt(prompt=prompt) ) -with torch.autocast("cuda", torch.bfloat16): +with torch.autocast(device, torch.bfloat16): with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): frames = pipe( prompt_embeds=prompt_embeds, @@ -162,7 +168,7 @@ with torch.autocast("cuda", torch.bfloat16): height=480, width=848, num_frames=163, - generator=torch.Generator("cuda").manual_seed(0), + generator=torch.Generator(device).manual_seed(0), output_type="latent", return_dict=False, )[0] @@ -188,14 +194,17 @@ video = video_processor.postprocess_video(video)[0] export_to_video(video, "mochi.mp4", fps=30) ``` -## Running inference with multiple GPUs +## Running inference with multiple accelerators -It is possible to split the large Mochi transformer across multiple GPUs using the `device_map` and `max_memory` options in `from_pretrained`. In the following example we split the model across two GPUs, each with 24GB of VRAM. +It is possible to split the large Mochi transformer across multiple accelerators using the `device_map` and `max_memory` options in `from_pretrained`. 
In the following example we split the model across two accelerators, each with 24GB of VRAM. ```python import torch from diffusers import MochiPipeline, MochiTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "genmo/mochi-1-preview" transformer = MochiTransformer3DModel.from_pretrained( @@ -209,7 +218,7 @@ pipe = MochiPipeline.from_pretrained(model_id, transformer=transformer) pipe.enable_model_cpu_offload() pipe.enable_vae_tiling() -with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False): +with torch.autocast(device_type=device, dtype=torch.bfloat16, cache_enabled=False): frames = pipe( prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.", negative_prompt="", @@ -219,7 +228,7 @@ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=Fals num_inference_steps=50, guidance_scale=4.5, num_videos_per_prompt=1, - generator=torch.Generator(device="cuda").manual_seed(0), + generator=torch.Generator(device=device).manual_seed(0), max_sequence_length=256, output_type="pil", ).frames[0] @@ -239,6 +248,9 @@ Diffusers currently doesn't support using the FP8 scaled versions of the Mochi s import torch from diffusers import MochiPipeline, MochiTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "genmo/mochi-1-preview" @@ -250,7 +262,7 @@ pipe = MochiPipeline.from_pretrained(model_id, transformer=transformer) pipe.enable_model_cpu_offload() pipe.enable_vae_tiling() -with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False): +with torch.autocast(device_type=device, dtype=torch.bfloat16, cache_enabled=False): frames = pipe( prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.", negative_prompt="", @@ -260,7 +272,7 @@ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=Fals num_inference_steps=50, guidance_scale=4.5, num_videos_per_prompt=1, - generator=torch.Generator(device="cuda").manual_seed(0), + generator=torch.Generator(device=device).manual_seed(0), max_sequence_length=256, output_type="pil", ).frames[0] diff --git a/docs/source/en/api/pipelines/omnigen.md b/docs/source/en/api/pipelines/omnigen.md index 074e7b8f0115..8ca32866a422 100644 --- a/docs/source/en/api/pipelines/omnigen.md +++ b/docs/source/en/api/pipelines/omnigen.md @@ -36,9 +36,12 @@ First, load the pipeline: ```python import torch from diffusers import OmniGenPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained("Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) ``` For text-to-image, pass a text prompt. By default, OmniGen generates a 1024x1024 image. 
diff --git a/docs/source/en/api/pipelines/pixart_sigma.md b/docs/source/en/api/pipelines/pixart_sigma.md index dded4ea2d771..ec513189b243 100644 --- a/docs/source/en/api/pipelines/pixart_sigma.md +++ b/docs/source/en/api/pipelines/pixart_sigma.md @@ -86,10 +86,13 @@ Since text embeddings have been computed, remove the `text_encoder` and `pipe` f ```python import gc +from diffusers.utils.torch_utils import get_device def flush(): gc.collect() - torch.cuda.empty_cache() + device = get_device() + device_module = getattr(torch, device, torch.cuda) + device_module.empty_cache() del text_encoder del pipe @@ -99,11 +102,13 @@ flush() Then compute the latents with the prompt embeddings as inputs: ```python +device = get_device() + pipe = PixArtSigmaPipeline.from_pretrained( "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", text_encoder=None, torch_dtype=torch.float16, -).to("cuda") +).to(device) latents = pipe( negative_prompt=None, diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md index afdb3de5f447..3b7ac428450c 100644 --- a/docs/source/en/api/pipelines/qwenimage.md +++ b/docs/source/en/api/pipelines/qwenimage.md @@ -39,9 +39,12 @@ number of steps. Refer to the code snippet below: ```py from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler -import torch +from diffusers.utils.torch_utils import get_device +import torch import math +device = get_device() + ckpt_id = "Qwen/Qwen-Image" # From @@ -65,7 +68,7 @@ scheduler_config = { scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config) pipe = DiffusionPipeline.from_pretrained( ckpt_id, scheduler=scheduler, torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) pipe.load_lora_weights( "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors" ) diff --git a/docs/source/en/api/pipelines/stable_unclip.md b/docs/source/en/api/pipelines/stable_unclip.md index 5abb6028c4cb..4fcc97f26c5a 100644 --- a/docs/source/en/api/pipelines/stable_unclip.md +++ b/docs/source/en/api/pipelines/stable_unclip.md @@ -35,8 +35,11 @@ Stable unCLIP can be leveraged for text-to-image generation by pipelining it wit import torch from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline from diffusers.models import PriorTransformer +from diffusers.utils.torch_utils import get_device from transformers import CLIPTokenizer, CLIPTextModelWithProjection +device = get_device() + prior_model_id = "kakaobrain/karlo-v1-alpha" data_type = torch.float16 prior = PriorTransformer.from_pretrained(prior_model_id, subfolder="prior", torch_dtype=data_type) @@ -59,7 +62,7 @@ pipe = StableUnCLIPPipeline.from_pretrained( prior_scheduler=prior_scheduler, ) -pipe = pipe.to("cuda") +pipe = pipe.to(device) wave_prompt = "dramatic wave, the Oceans roar, Strong wave spiral across the oceans as the waves unfurl into roaring crests; perfect wave form; perfect wave shape; dramatic wave shape; wave shape unbelievable; wave; wave shape spectacular" image = pipe(prompt=wave_prompt).images[0] @@ -76,12 +79,15 @@ For text-to-image we use `stabilityai/stable-diffusion-2-1-unclip-small` as it w ```python from diffusers import StableUnCLIPImg2ImgPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device import torch +device = get_device() + pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16" ) -pipe = pipe.to("cuda") +pipe = pipe.to(device) url =
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/tarsila_do_amaral.png" init_image = load_image(url) diff --git a/docs/source/en/api/pipelines/text_to_video_zero.md b/docs/source/en/api/pipelines/text_to_video_zero.md index 5fe3789d8287..6169f62bc350 100644 --- a/docs/source/en/api/pipelines/text_to_video_zero.md +++ b/docs/source/en/api/pipelines/text_to_video_zero.md @@ -47,10 +47,13 @@ To generate a video from prompt, run the following Python code: ```python import torch from diffusers import TextToVideoZeroPipeline +from diffusers.utils.torch_utils import get_device import imageio +device = get_device() + model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device) prompt = "A panda is playing guitar on times square" result = pipe(prompt=prompt).images @@ -69,10 +72,13 @@ We can also generate longer videos by doing the processing in a chunk-by-chunk m ```python import torch from diffusers import TextToVideoZeroPipeline +from diffusers.utils.torch_utils import get_device import numpy as np +device = get_device() + model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device) seed = 0 video_length = 24 #24 ÷ 4fps = 6 seconds chunk_size = 8 @@ -81,7 +87,7 @@ prompt = "A panda is playing guitar on times square" # Generate the video chunk-by-chunk result = [] chunk_ids = np.arange(0, video_length, chunk_size - 1) -generator = torch.Generator(device="cuda") +generator = torch.Generator(device=device) for i in range(len(chunk_ids)): print(f"Processing chunk {i + 1} / {len(chunk_ids)}") ch_start = chunk_ids[i] @@ -106,11 +112,14 @@ In order to use the SDXL model when generating a video from prompt, use the `Tex ```python import torch from diffusers import TextToVideoZeroSDXLPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "stabilityai/stable-diffusion-xl-base-1.0" pipe = TextToVideoZeroSDXLPipeline.from_pretrained( model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True -).to("cuda") +).to(device) ``` ### Text-To-Video with Pose Control @@ -144,19 +153,22 @@ To generate a video from prompt with additional pose control import torch from diffusers import StableDiffusionControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 - ).to("cuda") + ).to(device) # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) + latents = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) prompt = 
"Darth Vader dancing in a desert" result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images @@ -169,6 +181,9 @@ To generate a video from prompt with additional pose control import torch from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0' model_id = 'stabilityai/stable-diffusion-xl-base-1.0' @@ -176,14 +191,14 @@ To generate a video from prompt with additional pose control controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 - ).to('cuda') + ).to(device) # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) + latents = torch.randn((1, 4, 128, 128), device=device, dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) prompt = "Darth Vader dancing in a desert" result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images @@ -224,9 +239,12 @@ To perform text-guided video editing (with [InstructPix2Pix](pix2pix)): import torch from diffusers import StableDiffusionInstructPix2PixPipeline from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") + pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device) pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=3)) prompt = "make it Van Gogh Starry Night style" @@ -267,20 +285,23 @@ can run with custom [DreamBooth](../../training/dreambooth) models, as shown bel import torch from diffusers import StableDiffusionControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() # set model id to custom model model_id = "PAIR/text2video-zero-controlnet-canny-avatar" controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 - ).to("cuda") + ).to(device) # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1) + latents = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1) prompt = "oil painting of a beautiful girl avatar style" result = pipe(prompt=[prompt] * len(canny_edges), image=canny_edges, latents=latents).images diff --git 
a/docs/source/en/api/pipelines/unidiffuser.md b/docs/source/en/api/pipelines/unidiffuser.md index 7d767f2db530..738b4e166f71 100644 --- a/docs/source/en/api/pipelines/unidiffuser.md +++ b/docs/source/en/api/pipelines/unidiffuser.md @@ -47,8 +47,9 @@ Unconditional generation (where we start from only latents sampled from a standa import torch from diffusers import UniDiffuserPipeline +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -96,8 +97,9 @@ Here is an example of sampling from the conditional image distribution (text-to- import torch from diffusers import UniDiffuserPipeline +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -121,8 +123,9 @@ import torch from diffusers import UniDiffuserPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -148,8 +151,9 @@ import torch from diffusers import UniDiffuserPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -177,8 +181,9 @@ Similarly, text variation can be performed on an input prompt with a text-to-ima import torch from diffusers import UniDiffuserPipeline +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) diff --git a/docs/source/en/api/pipelines/visualcloze.md b/docs/source/en/api/pipelines/visualcloze.md index 1a4f96a50d63..a3c55b193ea4 100644 --- a/docs/source/en/api/pipelines/visualcloze.md +++ b/docs/source/en/api/pipelines/visualcloze.md @@ -68,9 +68,12 @@ For comprehensive examples covering a wide range of tasks, please refer to the [ import torch from diffusers import VisualClozePipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) # Load in-context images (make sure the paths are correct and accessible) image_paths = [ @@ -120,9 +123,12 @@ image_result.save("visualcloze.png") import torch from diffusers import VisualClozePipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) # Load in-context images (make sure the paths are correct and accessible) image_paths = [ @@ -170,9 +176,12 @@ image_result.save("visualcloze.png") import torch from diffusers import VisualClozePipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import 
get_device + +device = get_device() pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) # Load in-context images (make sure the paths are correct and accessible) image_paths = [ @@ -226,12 +235,15 @@ image_result.save("visualcloze.png") import torch from diffusers import VisualClozeGenerationPipeline, FluxFillPipeline as VisualClozeUpsamplingPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from PIL import Image +device = get_device() + pipe = VisualClozeGenerationPipeline.from_pretrained( "VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) image_paths = [ # in-context examples @@ -267,7 +279,7 @@ image = pipe( # Stage 2 (optional): Upsample the generated image pipe_upsample = VisualClozeUpsamplingPipeline.from_pipe(pipe) -pipe_upsample.to("cuda") +pipe_upsample.to(device) mask_image = Image.new("RGB", image.size, (255, 255, 255)) diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md index e46aa55ad82a..ba912d55ab08 100644 --- a/docs/source/en/api/pipelines/wan.md +++ b/docs/source/en/api/pipelines/wan.md @@ -63,14 +63,17 @@ from diffusers import AutoModel, WanPipeline from diffusers.quantizers import PipelineQuantizationConfig from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") apply_group_offloading(text_encoder, onload_device=onload_device, @@ -92,7 +95,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) prompt = """ The camera rushes from far to near in a low-angle shot, @@ -128,8 +131,11 @@ import numpy as np from diffusers import AutoModel, WanPipeline from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) @@ -141,7 +147,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) # torch.compile pipeline.transformer.to(memory_format=torch.channels_last) @@ -187,8 +193,10 @@ import torch import torchvision.transforms.functional as TF from diffusers import AutoencoderKLWan, WanImageToVideoPipeline from diffusers.utils import 
export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import CLIPVisionModel +device = get_device() model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers" image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32) @@ -196,7 +204,7 @@ vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=to pipe = WanImageToVideoPipeline.from_pretrained( model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png") last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png") @@ -262,6 +270,9 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip from diffusers import AutoModel, WanPipeline from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler from diffusers.utils import export_to_video + from diffusers.utils.torch_utils import get_device + + device = get_device() vae = AutoModel.from_pretrained( "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32 @@ -272,7 +283,7 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip pipeline.scheduler = UniPCMultistepScheduler.from_config( pipeline.scheduler.config, flow_shift=5.0 ) - pipeline.to("cuda") + pipeline.to(device) pipeline.load_lora_weights("benjamin-paine/steamboat-willie-1.3b", adapter_name="steamboat-willie") pipeline.set_adapters("steamboat-willie") @@ -359,4 +370,4 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip ## WanPipelineOutput -[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput \ No newline at end of file +[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index d322d76be267..564c05131f5d 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -31,6 +31,9 @@ Now you can quantize a model by passing the `QuantoConfig` object to the `from_p ```python import torch from diffusers import FluxTransformer2DModel, QuantoConfig +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="float8") @@ -42,7 +45,7 @@ transformer = FluxTransformer2DModel.from_pretrained( ) pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) -pipe.to("cuda") +pipe.to(device) prompt = "A cat holding a sign that says hello world" image = pipe( @@ -117,6 +120,9 @@ Currently the Quanto backend supports `torch.compile` for the following quantiza ```python import torch from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="int8") @@ -131,7 +137,7 @@ transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True) pipe = FluxPipeline.from_pretrained( model_id, transformer=transformer, torch_dtype=torch_dtype ) -pipe.to("cuda") +pipe.to(device) images = pipe("A cat holding a sign that says hello").images[0] images.save("flux-quanto-compile.png") ``` diff --git 
a/docs/source/en/using-diffusers/controlnet.md b/docs/source/en/using-diffusers/controlnet.md index 4aa5abd04f3b..f376b56a48d2 100644 --- a/docs/source/en/using-diffusers/controlnet.md +++ b/docs/source/en/using-diffusers/controlnet.md @@ -52,14 +52,17 @@ Pass the canny image to the pipeline. Use the `controlnet_conditioning_scale` pa ```py import torch from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from diffusers import FluxControlNetPipeline, FluxControlNetModel +device = get_device() + controlnet = FluxControlNetModel.from_pretrained( "InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16 ) pipeline = FluxControlNetPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) prompt = """ A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. @@ -103,14 +106,16 @@ from PIL import Image from transformers import DPTImageProcessor, DPTForDepthEstimation from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device +device = get_device() -depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") +depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device) feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") def get_depth_map(image): - image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda") - with torch.no_grad(), torch.autocast("cuda"): + image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device) + with torch.no_grad(), torch.autocast(device): depth_map = depth_estimator(image).predicted_depth depth_map = torch.nn.functional.interpolate( @@ -143,7 +148,7 @@ pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( controlnet=controlnet, vae=vae, torch_dtype=torch.float16, -).to("cuda") +).to(device) prompt = """ A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
@@ -260,6 +265,9 @@ Pass the ControlNets as a list to the pipeline and resize the images to the expe ```py import torch from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL +from diffusers.utils.torch_utils import get_device + +device = get_device() controlnets = [ ControlNetModel.from_pretrained( @@ -273,7 +281,7 @@ controlnets = [ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) pipeline = StableDiffusionXLControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16 -).to("cuda") +).to(device) prompt = """ a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby, @@ -316,6 +324,9 @@ pipeline( import torch from diffusers.utils import load_iamge from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel +from diffusers.utils.torch_utils import get_device + +device = get_device() controlnet = ControlNetModel.from_pretrained( "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16 @@ -324,7 +335,7 @@ pipeline = StableDiffusionXLControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 -).to("cuda") +).to(device) canny_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png") pipeline( @@ -343,4 +354,4 @@ pipeline( Generated image (Guess mode)
generated image
- \ No newline at end of file + diff --git a/docs/source/en/using-diffusers/dreambooth.md b/docs/source/en/using-diffusers/dreambooth.md index 6c37124cb7ff..4c620adbb110 100644 --- a/docs/source/en/using-diffusers/dreambooth.md +++ b/docs/source/en/using-diffusers/dreambooth.md @@ -21,15 +21,18 @@ Load the DreamBooth checkpoint with [`~DiffusionPipeline.from_pretrained`] and i ```py import torch from diffusers import AutoPipelineForText2Image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = AutoPipelineForText2Image.from_pretrained( "sd-dreambooth-library/herge-style", torch_dtype=torch.float16 -).to("cuda") +).to(device) prompt = "A cute sks herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration" pipeline(prompt).images[0] ```
-
\ No newline at end of file + diff --git a/docs/source/en/using-diffusers/omnigen.md b/docs/source/en/using-diffusers/omnigen.md index 2880fedb3392..36de8d791c31 100644 --- a/docs/source/en/using-diffusers/omnigen.md +++ b/docs/source/en/using-diffusers/omnigen.md @@ -37,12 +37,15 @@ You can try setting the `height` and `width` parameters to generate images with ```python import torch from diffusers import OmniGenPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt = "Realistic photo. A young woman sits on a sofa, holding a book and facing the camera. She wears delicate silver hoop earrings adorned with tiny, sparkling diamonds that catch the light, with her long chestnut hair cascading over her shoulders. Her eyes are focused and gentle, framed by long, dark lashes. She is dressed in a cozy cream sweater, which complements her warm, inviting smile. Behind her, there is a table with a cup of water in a sleek, minimalist blue mug. The background is a serene indoor setting with soft natural light filtering through a window, adorned with tasteful art and flowers, creating a cozy and peaceful ambiance. 4K, HD." image = pipe( @@ -68,13 +71,16 @@ It is recommended to enable `use_input_image_size_as_output` to keep the edited ```python import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="<|image_1|> Remove the woman's earrings. Replace the mug with a clear glass filled with sparkling iced cola." input_images=[load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/t2i_woman_with_book.png")] @@ -127,13 +133,16 @@ OmniGen can handle several classic computer vision tasks. As shown below, OmniGe ```python import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="Detect the skeleton of human in this image: <|image_1|>" input_images=[load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/edit.png")] @@ -181,13 +190,16 @@ OmniGen can also directly use relevant information from input images to generate ```python import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="Following the pose of this image <|image_1|>, generate a new photo: A young boy is sitting on a sofa in the library, holding a book. His hair is neatly combed, and a faint smile plays on his lips, with a few freckles scattered across his cheeks. The library is quiet, with rows of shelves filled with books stretching out behind him." 
input_images=[load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/edit.png")] @@ -218,12 +230,15 @@ Additionally, OmniGen can extract desired objects from an image containing multi import torch from diffusers import OmniGenPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="A man and a woman are sitting at a classroom desk. The man is the man with yellow hair in <|image_1|>. The woman is the woman on the left of <|image_2|>" input_image_1 = load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/3.png") @@ -259,13 +274,16 @@ image.save("output.png") ```py import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="A woman is walking down the street, wearing a white long-sleeve blouse with lace details on the sleeves, paired with a blue pleated skirt. The woman is <|image_1|>. The long-sleeve blouse and a pleated skirt are <|image_2|>." input_image_1 = load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/emma.jpeg") diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md index 67d1fd118e4d..322572ccce31 100644 --- a/docs/source/en/using-diffusers/text-img2vid.md +++ b/docs/source/en/using-diffusers/text-img2vid.md @@ -28,14 +28,17 @@ import numpy as np from diffusers import AutoModel, WanPipeline from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") apply_group_offloading(text_encoder, onload_device=onload_device, @@ -57,7 +60,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) prompt = """ The camera rushes from far to near in a low-angle shot, @@ -124,6 +127,9 @@ import torch from diffusers import LTXPipeline, AutoModel from diffusers.hooks import apply_group_offloading from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() # fp8 layerwise weight-casting transformer = AutoModel.from_pretrained( @@ -138,7 +144,7 @@ transformer.enable_layerwise_casting( pipeline = LTXPipeline.from_pretrained("Lightricks/LTX-Video", transformer=transformer, torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") 
pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True) apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2) @@ -184,10 +190,13 @@ Some video models require more specific `num_frames` values for inference. For e import torch from diffusers import LTXPipeline from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = LTXPipeline.from_pretrained( "Lightricks/LTX-Video", torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) prompt = """ A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman @@ -218,11 +227,14 @@ Guidance scale or "cfg" controls how closely the generated frames adhere to the import torch from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = CogVideoXPipeline.from_pretrained( "THUDM/CogVideoX-2b", torch_dtype=torch.float16 -).to("cuda") +).to(device) prompt = """ A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over @@ -251,6 +263,9 @@ import torch from diffusers import WanPipeline from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() vae = AutoencoderKLWan.from_pretrained( "Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32 @@ -261,7 +276,7 @@ pipeline = WanPipeline.from_pretrained( pipeline.scheduler = UniPCMultistepScheduler.from_config( pipeline.scheduler.config, flow_shift=5.0 ) -pipeline.to("cuda") +pipeline.to(device) pipeline.load_lora_weights("benjamin-paine/steamboat-willie-14b", adapter_name="steamboat-willie") pipeline.set_adapters("steamboat-willie") @@ -301,14 +316,17 @@ import numpy as np from diffusers import AutoModel, WanPipeline from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") apply_group_offloading(text_encoder, onload_device=onload_device, @@ -330,7 +348,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) prompt = """ The camera rushes from far to near in a low-angle shot, @@ -368,6 +386,9 @@ from diffusers.quantizers import PipelineQuantizationConfig from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler from transformers import UMT5EncoderModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() # quantize transformer and text encoder weights 
with bitsandbytes pipeline_quant_config = PipelineQuantizationConfig( @@ -385,7 +406,7 @@ pipeline = WanPipeline.from_pretrained( pipeline.scheduler = UniPCMultistepScheduler.from_config( pipeline.scheduler.config, flow_shift=5.0 ) -pipeline.to("cuda") +pipeline.to(device) pipeline.load_lora_weights("benjamin-paine/steamboat-willie-14b", adapter_name="steamboat-willie") pipeline.set_adapters("steamboat-willie") @@ -419,11 +440,14 @@ The example below compiles the transformer in the pipeline and uses the `"max-au import torch from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = CogVideoXPipeline.from_pretrained( "THUDM/CogVideoX-2b", torch_dtype=torch.float16 -).to("cuda") +).to(device) # torch.compile pipeline.transformer.to(memory_format=torch.channels_last) @@ -444,4 +468,4 @@ video = pipeline( num_inference_steps=50 ).frames[0] export_to_video(video, "output.mp4", fps=8) -``` \ No newline at end of file +```
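The `get_device()` helper is imported throughout this patch but its body never appears in the diff. The sketch below is only an assumption about how such a helper resolves the active backend and how the `getattr(torch, device, torch.cuda)` cache-flush pattern from the PixArt-Sigma hunk works; the names `get_device_sketch` and `flush_device_cache` are hypothetical and not part of the library.

```python
# Hedged sketch, not the actual diffusers implementation: approximates what a
# device-resolution helper like `get_device()` is assumed to return
# ("cuda", "xpu", "mps", or "cpu") and how the per-backend cache flush works.
import torch


def get_device_sketch() -> str:
    # Prefer a discrete accelerator, fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def flush_device_cache(device: str) -> None:
    # Resolve the backend module (torch.cuda, torch.xpu, torch.mps, ...) and
    # release cached allocator memory if that backend supports it.
    device_module = getattr(torch, device, torch.cuda)
    if hasattr(device_module, "empty_cache"):
        device_module.empty_cache()


if __name__ == "__main__":
    device = get_device_sketch()
    print(f"resolved device: {device}")
    flush_device_cache(device)
```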