diff --git a/README.md b/README.md index dac3b3598aaf..4d1fd40ed0f2 100644 --- a/README.md +++ b/README.md @@ -71,10 +71,13 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from ```python from diffusers import DiffusionPipeline +from diffusers.utils.torch_utils import get_device import torch +device = get_device() + pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16) -pipeline.to("cuda") +pipeline.to(device) pipeline("An image of a squirrel in Picasso style").images[0] ``` @@ -82,15 +85,18 @@ You can also dig into the models and schedulers toolbox to build your own diffus ```python from diffusers import DDPMScheduler, UNet2DModel +from diffusers.utils.torch_utils import get_device from PIL import Image import torch +device = get_device() + scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256") -model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda") +model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to(device) scheduler.set_timesteps(50) sample_size = model.config.sample_size -noise = torch.randn((1, 3, sample_size, sample_size), device="cuda") +noise = torch.randn((1, 3, sample_size, sample_size), device=device) input = noise for t in scheduler.timesteps: diff --git a/docs/source/en/api/models/autoencoderkl_cogvideox.md b/docs/source/en/api/models/autoencoderkl_cogvideox.md index 2c5411a0647c..7a3ee76c91b8 100644 --- a/docs/source/en/api/models/autoencoderkl_cogvideox.md +++ b/docs/source/en/api/models/autoencoderkl_cogvideox.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import AutoencoderKLCogVideoX +from diffusers.utils.torch_utils import get_device -vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda") +device = get_device() + +vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to(device) ``` ## AutoencoderKLCogVideoX diff --git a/docs/source/en/api/models/autoencoderkl_ltx_video.md b/docs/source/en/api/models/autoencoderkl_ltx_video.md index 9c2384ca53a1..61edd75539a4 100644 --- a/docs/source/en/api/models/autoencoderkl_ltx_video.md +++ b/docs/source/en/api/models/autoencoderkl_ltx_video.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import AutoencoderKLLTXVideo +from diffusers.utils.torch_utils import get_device -vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to("cuda") +device = get_device() + +vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to(device) ``` ## AutoencoderKLLTXVideo diff --git a/docs/source/en/api/models/autoencoderkl_mochi.md b/docs/source/en/api/models/autoencoderkl_mochi.md index fef6645a18fa..25b8b64cfdc6 100644 --- a/docs/source/en/api/models/autoencoderkl_mochi.md +++ b/docs/source/en/api/models/autoencoderkl_mochi.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. 
```python from diffusers import AutoencoderKLMochi +from diffusers.utils.torch_utils import get_device -vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda") +device = get_device() + +vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to(device) ``` ## AutoencoderKLMochi diff --git a/docs/source/en/api/models/cogvideox_transformer3d.md b/docs/source/en/api/models/cogvideox_transformer3d.md index 5d50e5dca651..bb0e57788691 100644 --- a/docs/source/en/api/models/cogvideox_transformer3d.md +++ b/docs/source/en/api/models/cogvideox_transformer3d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import CogVideoXTransformer3DModel +from diffusers.utils.torch_utils import get_device -transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda") +device = get_device() + +transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to(device) ``` ## CogVideoXTransformer3DModel diff --git a/docs/source/en/api/models/cogview4_transformer2d.md b/docs/source/en/api/models/cogview4_transformer2d.md index e87fbc680968..b2bd127ebb1f 100644 --- a/docs/source/en/api/models/cogview4_transformer2d.md +++ b/docs/source/en/api/models/cogview4_transformer2d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import CogView4Transformer2DModel +from diffusers.utils.torch_utils import get_device -transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to(device) ``` ## CogView4Transformer2DModel diff --git a/docs/source/en/api/models/consisid_transformer3d.md b/docs/source/en/api/models/consisid_transformer3d.md index 0531d475d2fb..ba2fc0bd3ece 100644 --- a/docs/source/en/api/models/consisid_transformer3d.md +++ b/docs/source/en/api/models/consisid_transformer3d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. ```python from diffusers import ConsisIDTransformer3DModel +from diffusers.utils.torch_utils import get_device -transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to(device) ``` ## ConsisIDTransformer3DModel diff --git a/docs/source/en/api/models/ltx_video_transformer3d.md b/docs/source/en/api/models/ltx_video_transformer3d.md index 5a2a1af9d821..f5fb27e71640 100644 --- a/docs/source/en/api/models/ltx_video_transformer3d.md +++ b/docs/source/en/api/models/ltx_video_transformer3d.md @@ -17,8 +17,11 @@ The model can be loaded with the following code snippet. 
```python from diffusers import LTXVideoTransformer3DModel +from diffusers.utils.torch_utils import get_device -transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to(device) ``` ## LTXVideoTransformer3DModel diff --git a/docs/source/en/api/pipelines/consistency_models.md b/docs/source/en/api/pipelines/consistency_models.md index 4f7b2f0fb501..64ab252c0e13 100644 --- a/docs/source/en/api/pipelines/consistency_models.md +++ b/docs/source/en/api/pipelines/consistency_models.md @@ -29,8 +29,9 @@ For an additional speed-up, use `torch.compile` to generate multiple images in < ```diff import torch from diffusers import ConsistencyModelPipeline + from diffusers.utils.torch_utils import get_device - device = "cuda" + device = get_device() # Load the cd_bedroom256_lpips checkpoint. model_id_or_path = "openai/diffusers-cd_bedroom256_lpips" pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md index 64341ca4b918..578a6eb7641b 100644 --- a/docs/source/en/api/pipelines/flux.md +++ b/docs/source/en/api/pipelines/flux.md @@ -102,12 +102,15 @@ out.save("image.png") import torch from diffusers import FluxFillPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup.png") mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup_mask.png") repo_id = "black-forest-labs/FLUX.1-Fill-dev" -pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to("cuda") +pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to(device) image = pipe( prompt="a white paper cup", @@ -131,8 +134,11 @@ import torch from controlnet_aux import CannyDetector from diffusers import FluxControlPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to("cuda") +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to(device) prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png") @@ -159,8 +165,11 @@ import torch from controlnet_aux import CannyDetector from diffusers import FluxControlPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda") +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device) pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora") prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." 
@@ -189,9 +198,12 @@ image.save("output.png") import torch from diffusers import FluxControlPipeline, FluxTransformer2DModel from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from image_gen_aux import DepthPreprocessor -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to(device) prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png") @@ -218,9 +230,12 @@ Depth Control is also possible with a LoRA variant of this condition. The usage import torch from diffusers import FluxControlPipeline, FluxTransformer2DModel from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from image_gen_aux import DepthPreprocessor -pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda") +device = get_device() + +pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device) pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora") prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." @@ -251,9 +266,10 @@ image.save("output.png") import torch from diffusers import FluxPriorReduxPipeline, FluxPipeline from diffusers.utils import load_image -device = "cuda" -dtype = torch.bfloat16 +from diffusers.utils.torch_utils import get_device +device = get_device() +dtype = torch.bfloat16 repo_redux = "black-forest-labs/FLUX.1-Redux-dev" repo_base = "black-forest-labs/FLUX.1-dev" @@ -284,11 +300,14 @@ Flux Kontext is a model that allows in-context control of the image generation p import torch from diffusers import FluxKontextPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = FluxKontextPipeline.from_pretrained( "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png").convert("RGB") prompt = "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors" @@ -305,13 +324,16 @@ Flux Kontext comes with an integrity safety checker, which should be run after t ```python from flux.content_filters import PixtralContentFilter +from diffusers.utils.torch_utils import get_device + +device = get_device() # ... pipeline invocation to generate images -integrity_checker = PixtralContentFilter(torch.device("cuda")) +integrity_checker = PixtralContentFilter(torch.device(device)) image_ = np.array(image) / 255.0 image_ = 2 * image_ - 1 -image_ = torch.from_numpy(image_).to("cuda", dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2) +image_ = torch.from_numpy(image_).to(device, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2) if integrity_checker.test_image(image_): raise ValueError("Your image has been flagged. Choose another prompt/image or try again.") ``` @@ -371,10 +393,13 @@ An IP-Adapter lets you prompt Flux with images, in addition to the text prompt. 
import torch from diffusers import FluxPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = FluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flux_ip_adapter_input.jpg").resize((1024, 1024)) @@ -411,7 +436,7 @@ Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeli [Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level. -On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference. +On accelerator devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference. > [!TIP] > It is possible to mix block and leaf-level offloading for different components in a pipeline. @@ -420,6 +445,9 @@ On CUDA devices that support asynchronous data streaming, set `use_stream=True` import torch from diffusers import FluxPipeline from diffusers.hooks import apply_group_offloading +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "black-forest-labs/FLUX.1-dev" dtype = torch.bfloat16 @@ -432,27 +460,27 @@ apply_group_offloading( pipe.transformer, offload_type="leaf_level", offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), use_stream=True, ) apply_group_offloading( pipe.text_encoder, offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), offload_type="leaf_level", use_stream=True, ) apply_group_offloading( pipe.text_encoder_2, offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), offload_type="leaf_level", use_stream=True, ) apply_group_offloading( pipe.vae, offload_device=torch.device("cpu"), - onload_device=torch.device("cuda"), + onload_device=torch.device(device), offload_type="leaf_level", use_stream=True, ) diff --git a/docs/source/en/api/pipelines/hunyuandit.md b/docs/source/en/api/pipelines/hunyuandit.md index 07e869ba95ae..d7d86ce81897 100644 --- a/docs/source/en/api/pipelines/hunyuandit.md +++ b/docs/source/en/api/pipelines/hunyuandit.md @@ -52,11 +52,14 @@ First, load the pipeline: ```python from diffusers import HunyuanDiTPipeline +from diffusers.utils.torch_utils import get_device import torch +device = get_device() + pipeline = HunyuanDiTPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16 -).to("cuda") +).to(device) ``` Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`: diff --git a/docs/source/en/api/pipelines/kolors.md b/docs/source/en/api/pipelines/kolors.md index 048f6c1de980..5a01f332cd20 100644 --- a/docs/source/en/api/pipelines/kolors.md +++ b/docs/source/en/api/pipelines/kolors.md @@ -31,9 +31,12 @@ The abstract from the technical report is: import 
torch from diffusers import DPMSolverMultistepScheduler, KolorsPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = KolorsPipeline.from_pretrained("Kwai-Kolors/Kolors-diffusers", torch_dtype=torch.float16, variant="fp16") -pipe.to("cuda") +pipe.to(device) pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True) image = pipe( diff --git a/docs/source/en/api/pipelines/latte.md b/docs/source/en/api/pipelines/latte.md index 9d4d12dd4e02..a2e11a0a1d53 100644 --- a/docs/source/en/api/pipelines/latte.md +++ b/docs/source/en/api/pipelines/latte.md @@ -41,10 +41,13 @@ First, load the pipeline: ```python import torch from diffusers import LattePipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = LattePipeline.from_pretrained( "maxin-cn/Latte-1", torch_dtype=torch.float16 -).to("cuda") +).to(device) ``` Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`: diff --git a/docs/source/en/api/pipelines/mochi.md b/docs/source/en/api/pipelines/mochi.md index f1260b07b077..897f70ad5ea7 100644 --- a/docs/source/en/api/pipelines/mochi.md +++ b/docs/source/en/api/pipelines/mochi.md @@ -81,6 +81,9 @@ The following example will download the full precision `mochi-1-preview` weights import torch from diffusers import MochiPipeline from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview") @@ -90,7 +93,7 @@ pipe.enable_vae_tiling() prompt = "Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k." -with torch.autocast("cuda", torch.bfloat16, cache_enabled=False): +with torch.autocast(device, torch.bfloat16, cache_enabled=False): frames = pipe(prompt, num_frames=85).frames[0] export_to_video(frames, "mochi.mp4", fps=30) @@ -137,8 +140,11 @@ from torch.nn.attention import SDPBackend, sdpa_kernel from diffusers import MochiPipeline from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device from diffusers.video_processor import VideoProcessor +device = get_device() + pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", force_zeros_for_empty_prompt=True) pipe.enable_vae_tiling() pipe.enable_model_cpu_offload() @@ -150,7 +156,7 @@ with torch.no_grad(): pipe.encode_prompt(prompt=prompt) ) -with torch.autocast("cuda", torch.bfloat16): +with torch.autocast(device, torch.bfloat16): with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): frames = pipe( prompt_embeds=prompt_embeds, @@ -162,7 +168,7 @@ with torch.autocast("cuda", torch.bfloat16): height=480, width=848, num_frames=163, - generator=torch.Generator("cuda").manual_seed(0), + generator=torch.Generator(device).manual_seed(0), output_type="latent", return_dict=False, )[0] @@ -188,14 +194,17 @@ video = video_processor.postprocess_video(video)[0] export_to_video(video, "mochi.mp4", fps=30) ``` -## Running inference with multiple GPUs +## Running inference with multiple accelerators -It is possible to split the large Mochi transformer across multiple GPUs using the `device_map` and `max_memory` options in `from_pretrained`. In the following example we split the model across two GPUs, each with 24GB of VRAM. +It is possible to split the large Mochi transformer across multiple accelerators using the `device_map` and `max_memory` options in `from_pretrained`. 
In the following example we split the model across two accelerators, each with 24GB of VRAM. ```python import torch from diffusers import MochiPipeline, MochiTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "genmo/mochi-1-preview" transformer = MochiTransformer3DModel.from_pretrained( @@ -209,7 +218,7 @@ pipe = MochiPipeline.from_pretrained(model_id, transformer=transformer) pipe.enable_model_cpu_offload() pipe.enable_vae_tiling() -with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False): +with torch.autocast(device_type=device, dtype=torch.bfloat16, cache_enabled=False): frames = pipe( prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.", negative_prompt="", @@ -219,7 +228,7 @@ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=Fals num_inference_steps=50, guidance_scale=4.5, num_videos_per_prompt=1, - generator=torch.Generator(device="cuda").manual_seed(0), + generator=torch.Generator(device=device).manual_seed(0), max_sequence_length=256, output_type="pil", ).frames[0] @@ -239,6 +248,9 @@ Diffusers currently doesn't support using the FP8 scaled versions of the Mochi s import torch from diffusers import MochiPipeline, MochiTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "genmo/mochi-1-preview" @@ -250,7 +262,7 @@ pipe = MochiPipeline.from_pretrained(model_id, transformer=transformer) pipe.enable_model_cpu_offload() pipe.enable_vae_tiling() -with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False): +with torch.autocast(device_type=device, dtype=torch.bfloat16, cache_enabled=False): frames = pipe( prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.", negative_prompt="", @@ -260,7 +272,7 @@ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=Fals num_inference_steps=50, guidance_scale=4.5, num_videos_per_prompt=1, - generator=torch.Generator(device="cuda").manual_seed(0), + generator=torch.Generator(device=device).manual_seed(0), max_sequence_length=256, output_type="pil", ).frames[0] diff --git a/docs/source/en/api/pipelines/omnigen.md b/docs/source/en/api/pipelines/omnigen.md index 074e7b8f0115..8ca32866a422 100644 --- a/docs/source/en/api/pipelines/omnigen.md +++ b/docs/source/en/api/pipelines/omnigen.md @@ -36,9 +36,12 @@ First, load the pipeline: ```python import torch from diffusers import OmniGenPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained("Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) ``` For text-to-image, pass a text prompt. By default, OmniGen generates a 1024x1024 image. 
diff --git a/docs/source/en/api/pipelines/pixart_sigma.md b/docs/source/en/api/pipelines/pixart_sigma.md index dded4ea2d771..ec513189b243 100644 --- a/docs/source/en/api/pipelines/pixart_sigma.md +++ b/docs/source/en/api/pipelines/pixart_sigma.md @@ -86,10 +86,13 @@ Since text embeddings have been computed, remove the `text_encoder` and `pipe` f ```python import gc +from diffusers.utils.torch_utils import get_device def flush(): gc.collect() - torch.cuda.empty_cache() + device = get_device() + device_module = getattr(torch, device, torch.cuda) + device_module.empty_cache() del text_encoder del pipe @@ -99,11 +102,13 @@ flush() Then compute the latents with the prompt embeddings as inputs: ```python +device = get_device() + pipe = PixArtSigmaPipeline.from_pretrained( "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", text_encoder=None, torch_dtype=torch.float16, -).to("cuda") +).to(device) latents = pipe( negative_prompt=None, diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md index afdb3de5f447..3b7ac428450c 100644 --- a/docs/source/en/api/pipelines/qwenimage.md +++ b/docs/source/en/api/pipelines/qwenimage.md @@ -39,9 +39,12 @@ number of steps. Refer to the code snippet below: ```py from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler -import torch +from diffusers.utils.torch_utils import get_device +import torch import math +device = get_device() + ckpt_id = "Qwen/Qwen-Image" # From @@ -65,7 +68,7 @@ scheduler_config = { scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config) pipe = DiffusionPipeline.from_pretrained( ckpt_id, scheduler=scheduler, torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) pipe.load_lora_weights( "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors" ) diff --git a/docs/source/en/api/pipelines/stable_unclip.md b/docs/source/en/api/pipelines/stable_unclip.md index 5abb6028c4cb..4fcc97f26c5a 100644 --- a/docs/source/en/api/pipelines/stable_unclip.md +++ b/docs/source/en/api/pipelines/stable_unclip.md @@ -35,8 +35,11 @@ Stable unCLIP can be leveraged for text-to-image generation by pipelining it wit import torch from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline from diffusers.models import PriorTransformer +from diffusers.utils.torch_utils import get_device from transformers import CLIPTokenizer, CLIPTextModelWithProjection +device = get_device() + prior_model_id = "kakaobrain/karlo-v1-alpha" data_type = torch.float16 prior = PriorTransformer.from_pretrained(prior_model_id, subfolder="prior", torch_dtype=data_type) @@ -59,7 +62,7 @@ pipe = StableUnCLIPPipeline.from_pretrained( prior_scheduler=prior_scheduler, ) -pipe = pipe.to("cuda") +pipe = pipe.to(device) wave_prompt = "dramatic wave, the Oceans roar, Strong wave spiral across the oceans as the waves unfurl into roaring crests; perfect wave form; perfect wave shape; dramatic wave shape; wave shape unbelievable; wave; wave shape spectacular" image = pipe(prompt=wave_prompt).images[0] @@ -76,12 +79,15 @@ For text-to-image we use `stabilityai/stable-diffusion-2-1-unclip-small` as it w ```python from diffusers import StableUnCLIPImg2ImgPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device import torch +device = get_device() + pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16" ) -pipe = pipe.to("cuda") +pipe = pipe.to(device) url =
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/tarsila_do_amaral.png" init_image = load_image(url) diff --git a/docs/source/en/api/pipelines/text_to_video_zero.md b/docs/source/en/api/pipelines/text_to_video_zero.md index 5fe3789d8287..6169f62bc350 100644 --- a/docs/source/en/api/pipelines/text_to_video_zero.md +++ b/docs/source/en/api/pipelines/text_to_video_zero.md @@ -47,10 +47,13 @@ To generate a video from prompt, run the following Python code: ```python import torch from diffusers import TextToVideoZeroPipeline +from diffusers.utils.torch_utils import get_device import imageio +device = get_device() + model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device) prompt = "A panda is playing guitar on times square" result = pipe(prompt=prompt).images @@ -69,10 +72,13 @@ We can also generate longer videos by doing the processing in a chunk-by-chunk m ```python import torch from diffusers import TextToVideoZeroPipeline +from diffusers.utils.torch_utils import get_device import numpy as np +device = get_device() + model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device) seed = 0 video_length = 24 #24 ÷ 4fps = 6 seconds chunk_size = 8 @@ -81,7 +87,7 @@ prompt = "A panda is playing guitar on times square" # Generate the video chunk-by-chunk result = [] chunk_ids = np.arange(0, video_length, chunk_size - 1) -generator = torch.Generator(device="cuda") +generator = torch.Generator(device=device) for i in range(len(chunk_ids)): print(f"Processing chunk {i + 1} / {len(chunk_ids)}") ch_start = chunk_ids[i] @@ -106,11 +112,14 @@ In order to use the SDXL model when generating a video from prompt, use the `Tex ```python import torch from diffusers import TextToVideoZeroSDXLPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "stabilityai/stable-diffusion-xl-base-1.0" pipe = TextToVideoZeroSDXLPipeline.from_pretrained( model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True -).to("cuda") +).to(device) ``` ### Text-To-Video with Pose Control @@ -144,19 +153,22 @@ To generate a video from prompt with additional pose control import torch from diffusers import StableDiffusionControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 - ).to("cuda") + ).to(device) # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) + latents = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) prompt = 
"Darth Vader dancing in a desert" result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images @@ -169,6 +181,9 @@ To generate a video from prompt with additional pose control import torch from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0' model_id = 'stabilityai/stable-diffusion-xl-base-1.0' @@ -176,14 +191,14 @@ To generate a video from prompt with additional pose control controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 - ).to('cuda') + ).to(device) # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) + latents = torch.randn((1, 4, 128, 128), device=device, dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) prompt = "Darth Vader dancing in a desert" result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images @@ -224,9 +239,12 @@ To perform text-guided video editing (with [InstructPix2Pix](pix2pix)): import torch from diffusers import StableDiffusionInstructPix2PixPipeline from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") + pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device) pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=3)) prompt = "make it Van Gogh Starry Night style" @@ -267,20 +285,23 @@ can run with custom [DreamBooth](../../training/dreambooth) models, as shown bel import torch from diffusers import StableDiffusionControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor + from diffusers.utils.torch_utils import get_device + + device = get_device() # set model id to custom model model_id = "PAIR/text2video-zero-controlnet-canny-avatar" controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 - ).to("cuda") + ).to(device) # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1) + latents = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1) prompt = "oil painting of a beautiful girl avatar style" result = pipe(prompt=[prompt] * len(canny_edges), image=canny_edges, latents=latents).images diff --git 
a/docs/source/en/api/pipelines/unidiffuser.md b/docs/source/en/api/pipelines/unidiffuser.md index 7d767f2db530..738b4e166f71 100644 --- a/docs/source/en/api/pipelines/unidiffuser.md +++ b/docs/source/en/api/pipelines/unidiffuser.md @@ -47,8 +47,9 @@ Unconditional generation (where we start from only latents sampled from a standa import torch from diffusers import UniDiffuserPipeline +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -96,8 +97,9 @@ Here is an example of sampling from the conditional image distribution (text-to- import torch from diffusers import UniDiffuserPipeline +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -121,8 +123,9 @@ import torch from diffusers import UniDiffuserPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -148,8 +151,9 @@ import torch from diffusers import UniDiffuserPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) @@ -177,8 +181,9 @@ Similarly, text variation can be performed on an input prompt with a text-to-ima import torch from diffusers import UniDiffuserPipeline +from diffusers.utils.torch_utils import get_device -device = "cuda" +device = get_device() model_id_or_path = "thu-ml/unidiffuser-v1" pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) pipe.to(device) diff --git a/docs/source/en/api/pipelines/visualcloze.md b/docs/source/en/api/pipelines/visualcloze.md index 1a4f96a50d63..a3c55b193ea4 100644 --- a/docs/source/en/api/pipelines/visualcloze.md +++ b/docs/source/en/api/pipelines/visualcloze.md @@ -68,9 +68,12 @@ For comprehensive examples covering a wide range of tasks, please refer to the [ import torch from diffusers import VisualClozePipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) # Load in-context images (make sure the paths are correct and accessible) image_paths = [ @@ -120,9 +123,12 @@ image_result.save("visualcloze.png") import torch from diffusers import VisualClozePipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) # Load in-context images (make sure the paths are correct and accessible) image_paths = [ @@ -170,9 +176,12 @@ image_result.save("visualcloze.png") import torch from diffusers import VisualClozePipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import 
get_device + +device = get_device() pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16) -pipe.to("cuda") +pipe.to(device) # Load in-context images (make sure the paths are correct and accessible) image_paths = [ @@ -226,12 +235,15 @@ image_result.save("visualcloze.png") import torch from diffusers import VisualClozeGenerationPipeline, FluxFillPipeline as VisualClozeUpsamplingPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from PIL import Image +device = get_device() + pipe = VisualClozeGenerationPipeline.from_pretrained( "VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) image_paths = [ # in-context examples @@ -267,7 +279,7 @@ image = pipe( # Stage 2 (optional): Upsample the generated image pipe_upsample = VisualClozeUpsamplingPipeline.from_pipe(pipe) -pipe_upsample.to("cuda") +pipe_upsample.to(device) mask_image = Image.new("RGB", image.size, (255, 255, 255)) diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md index e46aa55ad82a..ba912d55ab08 100644 --- a/docs/source/en/api/pipelines/wan.md +++ b/docs/source/en/api/pipelines/wan.md @@ -63,14 +63,17 @@ from diffusers import AutoModel, WanPipeline from diffusers.quantizers import PipelineQuantizationConfig from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") apply_group_offloading(text_encoder, onload_device=onload_device, @@ -92,7 +95,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) prompt = """ The camera rushes from far to near in a low-angle shot, @@ -128,8 +131,11 @@ import numpy as np from diffusers import AutoModel, WanPipeline from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) @@ -141,7 +147,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) # torch.compile pipeline.transformer.to(memory_format=torch.channels_last) @@ -187,8 +193,10 @@ import torch import torchvision.transforms.functional as TF from diffusers import AutoencoderKLWan, WanImageToVideoPipeline from diffusers.utils import 
export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import CLIPVisionModel +device = get_device() model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers" image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32) @@ -196,7 +204,7 @@ vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=to pipe = WanImageToVideoPipeline.from_pretrained( model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png") last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png") @@ -262,6 +270,9 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip from diffusers import AutoModel, WanPipeline from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler from diffusers.utils import export_to_video + from diffusers.utils.torch_utils import get_device + + device = get_device() vae = AutoModel.from_pretrained( "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32 @@ -272,7 +283,7 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip pipeline.scheduler = UniPCMultistepScheduler.from_config( pipeline.scheduler.config, flow_shift=5.0 ) - pipeline.to("cuda") + pipeline.to(device) pipeline.load_lora_weights("benjamin-paine/steamboat-willie-1.3b", adapter_name="steamboat-willie") pipeline.set_adapters("steamboat-willie") @@ -359,4 +370,4 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip ## WanPipelineOutput -[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput \ No newline at end of file +[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index d322d76be267..564c05131f5d 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -31,6 +31,9 @@ Now you can quantize a model by passing the `QuantoConfig` object to the `from_p ```python import torch from diffusers import FluxTransformer2DModel, QuantoConfig +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="float8") @@ -42,7 +45,7 @@ transformer = FluxTransformer2DModel.from_pretrained( ) pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) -pipe.to("cuda") +pipe.to(device) prompt = "A cat holding a sign that says hello world" image = pipe( @@ -117,6 +120,9 @@ Currently the Quanto backend supports `torch.compile` for the following quantiza ```python import torch from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig +from diffusers.utils.torch_utils import get_device + +device = get_device() model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="int8") @@ -131,7 +137,7 @@ transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True) pipe = FluxPipeline.from_pretrained( model_id, transformer=transformer, torch_dtype=torch_dtype ) -pipe.to("cuda") +pipe.to(device) images = pipe("A cat holding a sign that says hello").images[0] images.save("flux-quanto-compile.png") ``` diff --git 
a/docs/source/en/using-diffusers/controlnet.md b/docs/source/en/using-diffusers/controlnet.md index 4aa5abd04f3b..f376b56a48d2 100644 --- a/docs/source/en/using-diffusers/controlnet.md +++ b/docs/source/en/using-diffusers/controlnet.md @@ -52,14 +52,17 @@ Pass the canny image to the pipeline. Use the `controlnet_conditioning_scale` pa ```py import torch from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device from diffusers import FluxControlNetPipeline, FluxControlNetModel +device = get_device() + controlnet = FluxControlNetModel.from_pretrained( "InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16 ) pipeline = FluxControlNetPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) prompt = """ A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. @@ -103,14 +106,16 @@ from PIL import Image from transformers import DPTImageProcessor, DPTForDepthEstimation from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device +device = get_device() -depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") +depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device) feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") def get_depth_map(image): - image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda") - with torch.no_grad(), torch.autocast("cuda"): + image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device) + with torch.no_grad(), torch.autocast(device): depth_map = depth_estimator(image).predicted_depth depth_map = torch.nn.functional.interpolate( @@ -143,7 +148,7 @@ pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( controlnet=controlnet, vae=vae, torch_dtype=torch.float16, -).to("cuda") +).to(device) prompt = """ A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
@@ -260,6 +265,9 @@ Pass the ControlNets as a list to the pipeline and resize the images to the expe ```py import torch from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL +from diffusers.utils.torch_utils import get_device + +device = get_device() controlnets = [ ControlNetModel.from_pretrained( @@ -273,7 +281,7 @@ controlnets = [ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) pipeline = StableDiffusionXLControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16 -).to("cuda") +).to(device) prompt = """ a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby, @@ -316,6 +324,9 @@ pipeline( import torch from diffusers.utils import load_iamge from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel +from diffusers.utils.torch_utils import get_device + +device = get_device() controlnet = ControlNetModel.from_pretrained( "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16 @@ -324,7 +335,7 @@ pipeline = StableDiffusionXLControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 -).to("cuda") +).to(device) canny_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png") pipeline( @@ -343,4 +354,4 @@ pipeline( Generated image (Guess mode)
generated image
- \ No newline at end of file + diff --git a/docs/source/en/using-diffusers/dreambooth.md b/docs/source/en/using-diffusers/dreambooth.md index 6c37124cb7ff..4c620adbb110 100644 --- a/docs/source/en/using-diffusers/dreambooth.md +++ b/docs/source/en/using-diffusers/dreambooth.md @@ -21,15 +21,18 @@ Load the DreamBooth checkpoint with [`~DiffusionPipeline.from_pretrained`] and i ```py import torch from diffusers import AutoPipelineForText2Image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = AutoPipelineForText2Image.from_pretrained( "sd-dreambooth-library/herge-style", torch_dtype=torch.float16 -).to("cuda") +).to(device) prompt = "A cute sks herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration" pipeline(prompt).images[0] ```
-
\ No newline at end of file + diff --git a/docs/source/en/using-diffusers/omnigen.md b/docs/source/en/using-diffusers/omnigen.md index 2880fedb3392..36de8d791c31 100644 --- a/docs/source/en/using-diffusers/omnigen.md +++ b/docs/source/en/using-diffusers/omnigen.md @@ -37,12 +37,15 @@ You can try setting the `height` and `width` parameters to generate images with ```python import torch from diffusers import OmniGenPipeline +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt = "Realistic photo. A young woman sits on a sofa, holding a book and facing the camera. She wears delicate silver hoop earrings adorned with tiny, sparkling diamonds that catch the light, with her long chestnut hair cascading over her shoulders. Her eyes are focused and gentle, framed by long, dark lashes. She is dressed in a cozy cream sweater, which complements her warm, inviting smile. Behind her, there is a table with a cup of water in a sleek, minimalist blue mug. The background is a serene indoor setting with soft natural light filtering through a window, adorned with tasteful art and flowers, creating a cozy and peaceful ambiance. 4K, HD." image = pipe( @@ -68,13 +71,16 @@ It is recommended to enable `use_input_image_size_as_output` to keep the edited ```python import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="<|image_1|> Remove the woman's earrings. Replace the mug with a clear glass filled with sparkling iced cola." input_images=[load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/t2i_woman_with_book.png")] @@ -127,13 +133,16 @@ OmniGen can handle several classic computer vision tasks. As shown below, OmniGe ```python import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="Detect the skeleton of human in this image: <|image_1|>" input_images=[load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/edit.png")] @@ -181,13 +190,16 @@ OmniGen can also directly use relevant information from input images to generate ```python import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="Following the pose of this image <|image_1|>, generate a new photo: A young boy is sitting on a sofa in the library, holding a book. His hair is neatly combed, and a faint smile plays on his lips, with a few freckles scattered across his cheeks. The library is quiet, with rows of shelves filled with books stretching out behind him." 
input_images=[load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/edit.png")] @@ -218,12 +230,15 @@ Additionally, OmniGen can extract desired objects from an image containing multi import torch from diffusers import OmniGenPipeline from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="A man and a woman are sitting at a classroom desk. The man is the man with yellow hair in <|image_1|>. The woman is the woman on the left of <|image_2|>" input_image_1 = load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/3.png") @@ -259,13 +274,16 @@ image.save("output.png") ```py import torch from diffusers import OmniGenPipeline -from diffusers.utils import load_image +from diffusers.utils import load_image +from diffusers.utils.torch_utils import get_device + +device = get_device() pipe = OmniGenPipeline.from_pretrained( "Shitao/OmniGen-v1-diffusers", torch_dtype=torch.bfloat16 ) -pipe.to("cuda") +pipe.to(device) prompt="A woman is walking down the street, wearing a white long-sleeve blouse with lace details on the sleeves, paired with a blue pleated skirt. The woman is <|image_1|>. The long-sleeve blouse and a pleated skirt are <|image_2|>." input_image_1 = load_image("https://raw.githubusercontent.com/VectorSpaceLab/OmniGen/main/imgs/docs_img/emma.jpeg") diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md index 67d1fd118e4d..322572ccce31 100644 --- a/docs/source/en/using-diffusers/text-img2vid.md +++ b/docs/source/en/using-diffusers/text-img2vid.md @@ -28,14 +28,17 @@ import numpy as np from diffusers import AutoModel, WanPipeline from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") apply_group_offloading(text_encoder, onload_device=onload_device, @@ -57,7 +60,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) prompt = """ The camera rushes from far to near in a low-angle shot, @@ -124,6 +127,9 @@ import torch from diffusers import LTXPipeline, AutoModel from diffusers.hooks import apply_group_offloading from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() # fp8 layerwise weight-casting transformer = AutoModel.from_pretrained( @@ -138,7 +144,7 @@ transformer.enable_layerwise_casting( pipeline = LTXPipeline.from_pretrained("Lightricks/LTX-Video", transformer=transformer, torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") 
pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True) apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2) @@ -184,10 +190,13 @@ Some video models require more specific `num_frames` values for inference. For e import torch from diffusers import LTXPipeline from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = LTXPipeline.from_pretrained( "Lightricks/LTX-Video", torch_dtype=torch.bfloat16 -).to("cuda") +).to(device) prompt = """ A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman @@ -218,11 +227,14 @@ Guidance scale or "cfg" controls how closely the generated frames adhere to the import torch from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = CogVideoXPipeline.from_pretrained( "THUDM/CogVideoX-2b", torch_dtype=torch.float16 -).to("cuda") +).to(device) prompt = """ A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over @@ -251,6 +263,9 @@ import torch from diffusers import WanPipeline from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() vae = AutoencoderKLWan.from_pretrained( "Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32 @@ -261,7 +276,7 @@ pipeline = WanPipeline.from_pretrained( pipeline.scheduler = UniPCMultistepScheduler.from_config( pipeline.scheduler.config, flow_shift=5.0 ) -pipeline.to("cuda") +pipeline.to(device) pipeline.load_lora_weights("benjamin-paine/steamboat-willie-14b", adapter_name="steamboat-willie") pipeline.set_adapters("steamboat-willie") @@ -301,14 +316,17 @@ import numpy as np from diffusers import AutoModel, WanPipeline from diffusers.hooks.group_offloading import apply_group_offloading from diffusers.utils import export_to_video, load_image +from diffusers.utils.torch_utils import get_device from transformers import UMT5EncoderModel +device = get_device() + text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16) vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32) transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16) # group-offloading -onload_device = torch.device("cuda") +onload_device = torch.device(device) offload_device = torch.device("cpu") apply_group_offloading(text_encoder, onload_device=onload_device, @@ -330,7 +348,7 @@ pipeline = WanPipeline.from_pretrained( text_encoder=text_encoder, torch_dtype=torch.bfloat16 ) -pipeline.to("cuda") +pipeline.to(device) prompt = """ The camera rushes from far to near in a low-angle shot, @@ -368,6 +386,9 @@ from diffusers.quantizers import PipelineQuantizationConfig from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler from transformers import UMT5EncoderModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() # quantize transformer and text encoder weights 
with bitsandbytes pipeline_quant_config = PipelineQuantizationConfig( @@ -385,7 +406,7 @@ pipeline = WanPipeline.from_pretrained( pipeline.scheduler = UniPCMultistepScheduler.from_config( pipeline.scheduler.config, flow_shift=5.0 ) -pipeline.to("cuda") +pipeline.to(device) pipeline.load_lora_weights("benjamin-paine/steamboat-willie-14b", adapter_name="steamboat-willie") pipeline.set_adapters("steamboat-willie") @@ -419,11 +440,14 @@ The example below compiles the transformer in the pipeline and uses the `"max-au import torch from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel from diffusers.utils import export_to_video +from diffusers.utils.torch_utils import get_device + +device = get_device() pipeline = CogVideoXPipeline.from_pretrained( "THUDM/CogVideoX-2b", torch_dtype=torch.float16 -).to("cuda") +).to(device) # torch.compile pipeline.transformer.to(memory_format=torch.channels_last) @@ -444,4 +468,4 @@ video = pipeline( num_inference_steps=50 ).frames[0] export_to_video(video, "output.mp4", fps=8) -``` \ No newline at end of file +```
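The `get_device()` helper is imported throughout this patch but its body never appears in the diff. The sketch below is only an assumption about how such a helper resolves the active backend and how the `getattr(torch, device, torch.cuda)` cache-flush pattern from the PixArt-Sigma hunk works; the names `get_device_sketch` and `flush_device_cache` are hypothetical and not part of the library.

```python
# Hedged sketch, not the actual diffusers implementation: approximates what a
# device-resolution helper like `get_device()` is assumed to return
# ("cuda", "xpu", "mps", or "cpu") and how the per-backend cache flush works.
import torch


def get_device_sketch() -> str:
    # Prefer a discrete accelerator, fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def flush_device_cache(device: str) -> None:
    # Resolve the backend module (torch.cuda, torch.xpu, torch.mps, ...) and
    # release cached allocator memory if that backend supports it.
    device_module = getattr(torch, device, torch.cuda)
    if hasattr(device_module, "empty_cache"):
        device_module.empty_cache()


if __name__ == "__main__":
    device = get_device_sketch()
    print(f"resolved device: {device}")
    flush_device_cache(device)
```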