Commit 6633e8e

Merge pull request #3819 from vladmandic/dev

Dev

2 parents: 8513cd8 + fdf4999

8 files changed: +56, -28 lines

CHANGELOG.md (+5, -1)

```diff
@@ -1,16 +1,20 @@
 # Change Log for SD.Next

-## Update for 2025-03-12
+## Update for 2025-03-14

 - fix installer not starting when older version of rich is installed
 - fix circular imports when debug flags are enabled
 - fix cuda errors with directml
 - fix memory stats not displaying the ram usage
+- fix runpod memory limit reporting
+- fix remote vae not being stored in metadata, thanks @iDeNoh
+- add --upgrade to torch_command when using --use-nightly for ipex and rocm
 - **ipex**
   - add xpu to profiler
   - fix untyped_storage, torch.eye and torch.cuda.device ops
   - fix torch 2.7 compatibility
   - fix performance with balanced offload
+  - fix triton and torch.compile

 ## Update for 2025-02-28
```

installer.py (+4, -4)

```diff
@@ -667,11 +667,11 @@ def install_rocm_zluda():

     if args.use_nightly:
         if rocm.version is None or float(rocm.version) >= 6.3: # assume the latest if version check fails
-            torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.3')
+            torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.3')
         elif rocm.version == "6.2": # use rocm 6.2.4 instead of 6.2 as torch+rocm6.2 doesn't exists
-            torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.2.4')
+            torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.2.4')
         else: # oldest rocm version on nightly is 6.1
-            torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.1')
+            torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.1')
     else:
         if rocm.version is None or float(rocm.version) >= 6.2: # assume the latest if version check fails
             # use rocm 6.2.4 instead of 6.2 as torch==2.6.0+rocm6.2 doesn't exists
@@ -735,7 +735,7 @@ def install_ipex(torch_command):
     # os.environ.setdefault('IGC_EnableDPEmulation', '1') # FP64 Emulation

     if args.use_nightly:
-        torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu')
+        torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu')
     else:
         torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.6.0+xpu torchvision==0.21.0+xpu --index-url https://download.pytorch.org/whl/xpu')
```
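
The only functional change here is the added `--upgrade` flag: without it, pip treats an already-installed nightly build as satisfying the requirement, so `--use-nightly` never actually advances to a newer wheel. Below is a minimal sketch, not SD.Next's installer code, of how a `TORCH_COMMAND` string typically turns into a pip invocation; the `pip_install` helper and its dry-run flag are hypothetical.

```python
# Illustrative sketch only; 'pip_install' is a hypothetical helper, not the
# installer's real function. It shows how a TORCH_COMMAND string becomes a
# pip command line and why --upgrade matters for nightly wheels.
import os
import shlex
import subprocess
import sys

def pip_install(torch_command: str, dry_run: bool = True) -> list:
    # Prepend the interpreter's pip and split the command string into args.
    args = [sys.executable, '-m', 'pip', 'install'] + shlex.split(torch_command)
    if not dry_run:
        subprocess.run(args, check=True)  # would actually install the wheels
    return args

# With --upgrade, pip replaces an already-installed nightly build instead of
# reporting "requirement already satisfied" and keeping the old one.
cmd = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu')
print(' '.join(pip_install(cmd)))
```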

modules/call_queue.py (+5, -3)

```diff
@@ -83,12 +83,14 @@ def f(*args, extra_outputs_array=extra_outputs, **kwargs):
             vram = {k: v//1048576 for k, v in mem_mon_read.items()}
             peak = max(vram['active_peak'], vram['reserved_peak'], vram['used'])
             used = round(100.0 * peak / vram['total']) if vram['total'] > 0 else 0
-            if used > 0:
-                gpu += f"| GPU {peak} MB {used}%"
+            if peak > 0:
+                gpu += f"| GPU {peak} MB"
+                gpu += f" {used}%" if used > 0 else ''
             gpu += f" | retries {retries} oom {ooms}" if retries > 0 or ooms > 0 else ''
             ram = shared.ram_stats()
             if ram['used'] > 0:
-                cpu += f"| RAM {ram['used']} GB {round(100.0 * ram['used'] / ram['total'])}%"
+                cpu += f"| RAM {ram['used']} GB"
+                cpu += f" {round(100.0 * ram['used'] / ram['total'])}%" if ram['total'] > 0 else ''
         if isinstance(res, list):
             res[-1] += f"<div class='performance'><p>Time: {elapsed_text} | {summary} {gpu} {cpu}</p></div>"
         return tuple(res)
```
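
The reworked formatting prints the peak VRAM figure whenever it is non-zero and appends percentages only when the corresponding totals are known, so a monitor that reports `total` as 0 no longer hides the absolute numbers. A standalone sketch of the same guard logic, with made-up sample values:

```python
# Standalone sketch of the guarded stats formatting; the sample numbers are
# made up and the dicts mimic the 'vram'/'ram' values used in f() above.
def format_stats(vram: dict, ram: dict) -> str:
    gpu, cpu = '', ''
    peak = max(vram['active_peak'], vram['reserved_peak'], vram['used'])
    used = round(100.0 * peak / vram['total']) if vram['total'] > 0 else 0
    if peak > 0:
        gpu += f"| GPU {peak} MB"
        gpu += f" {used}%" if used > 0 else ''  # no percentage when total is unknown
    if ram['used'] > 0:
        cpu += f"| RAM {ram['used']} GB"
        cpu += f" {round(100.0 * ram['used'] / ram['total'])}%" if ram['total'] > 0 else ''
    return f"{gpu} {cpu}".strip()

# A total of 0 simulates a monitor that cannot report device memory:
print(format_stats({'active_peak': 4096, 'reserved_peak': 5120, 'used': 3072, 'total': 0},
                   {'used': 12, 'total': 64}))
# -> | GPU 5120 MB | RAM 12 GB 19%
```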

modules/intel/ipex/__init__.py (+9, -3)

```diff
@@ -18,7 +18,10 @@ def ipex_init(): # pylint: disable=too-many-statements
     if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
         return True, "Skipping IPEX hijack"
     else:
-        try: # force xpu device on torch compile and triton
+        try:
+            # force xpu device on torch compile and triton
+            # import inductor utils to get around lazy import
+            from torch._inductor import utils as torch_inductor_utils # pylint: disable=import-error, unused-import
             torch._inductor.utils.GPU_TYPES = ["xpu"]
             torch._inductor.utils.get_gpu_type = lambda *args, **kwargs: "xpu"
             from triton import backends as triton_backends # pylint: disable=import-error
@@ -187,11 +190,13 @@ def ipex_init(): # pylint: disable=too-many-statements
         ipex._C._DeviceProperties.multi_processor_count = ipex._C._DeviceProperties.gpu_subslice_count
         ipex._C._DeviceProperties.major = 12
         ipex._C._DeviceProperties.minor = 1
+        ipex._C._DeviceProperties.L2_cache_size = 16*1024*1024 # A770 and A750
     else:
         torch._C._cuda_getCurrentRawStream = torch._C._xpu_getCurrentRawStream
         torch._C._XpuDeviceProperties.multi_processor_count = torch._C._XpuDeviceProperties.gpu_subslice_count
         torch._C._XpuDeviceProperties.major = 12
         torch._C._XpuDeviceProperties.minor = 1
+        torch._C._XpuDeviceProperties.L2_cache_size = 16*1024*1024 # A770 and A750

     # Fix functions with ipex:
     # torch.xpu.mem_get_info always returns the total memory as free memory
@@ -200,14 +205,15 @@ def ipex_init(): # pylint: disable=too-many-statements
     torch._utils._get_available_device_type = lambda: "xpu"
     torch.has_cuda = True
    torch.cuda.has_half = True
-    torch.cuda.is_bf16_supported = lambda *args, **kwargs: True
+    torch.cuda.is_bf16_supported = getattr(torch.xpu, "is_bf16_supported", lambda *args, **kwargs: True)
     torch.cuda.is_fp16_supported = lambda *args, **kwargs: True
     torch.backends.cuda.is_built = lambda *args, **kwargs: True
     torch.version.cuda = "12.1"
-    torch.cuda.get_arch_list = lambda: ["ats-m150", "pvc"]
+    torch.cuda.get_arch_list = getattr(torch.xpu, "get_arch_list", lambda: ["pvc", "dg2", "ats-m150"])
     torch.cuda.get_device_capability = lambda *args, **kwargs: (12,1)
     torch.cuda.get_device_properties.major = 12
     torch.cuda.get_device_properties.minor = 1
+    torch.cuda.get_device_properties.L2_cache_size = 16*1024*1024 # A770 and A750
     torch.cuda.ipc_collect = lambda *args, **kwargs: None
     torch.cuda.utilization = lambda *args, **kwargs: 0
```
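
Two of the CUDA shims now defer to the real `torch.xpu` queries when they exist (`is_bf16_supported`, `get_arch_list`) and keep the hard-coded answers only as a fallback. A small sketch of that getattr-with-fallback pattern, using a stand-in namespace so it runs without IPEX or an Intel GPU:

```python
# Sketch of the getattr-with-fallback pattern used above; 'fake_xpu' stands in
# for torch.xpu so the example runs without an Intel GPU or IPEX installed.
from types import SimpleNamespace

fake_xpu = SimpleNamespace()  # pretend torch.xpu without get_arch_list()

# Prefer the native query when the attribute exists, otherwise fall back:
get_arch_list = getattr(fake_xpu, 'get_arch_list', lambda: ['pvc', 'dg2', 'ats-m150'])
print(get_arch_list())  # ['pvc', 'dg2', 'ats-m150'] -> fallback path

fake_xpu.get_arch_list = lambda: ['made-up-arch']  # now the attribute exists
get_arch_list = getattr(fake_xpu, 'get_arch_list', lambda: ['pvc', 'dg2', 'ats-m150'])
print(get_arch_list())  # ['made-up-arch'] -> native path wins
```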

modules/intel/ipex/hijacks.py (+14, -9)

```diff
@@ -332,14 +332,6 @@ def torch_load(f, map_location=None, *args, **kwargs):
     else:
         return original_torch_load(f, *args, map_location=map_location, **kwargs)

-original_torch_Generator = torch.Generator
-@wraps(torch.Generator)
-def torch_Generator(device=None):
-    if check_cuda(device):
-        return original_torch_Generator(return_xpu(device))
-    else:
-        return original_torch_Generator(device)
-
 @wraps(torch.cuda.synchronize)
 def torch_cuda_synchronize(device=None):
     if check_cuda(device):
@@ -355,6 +347,17 @@ def torch_cuda_device(device):
         return torch.xpu.device(device)


+# torch.Generator has to be a class for isinstance checks
+original_torch_Generator = torch.Generator
+class torch_Generator(original_torch_Generator):
+    def __new__(self, device=None):
+        # can't hijack __init__ because of C override so use return super().__new__
+        if check_cuda(device):
+            return super().__new__(self, return_xpu(device))
+        else:
+            return super().__new__(self, device)
+
+
 # Hijack Functions:
 def ipex_hijacks():
     global device_supports_fp64, can_allocate_plus_4gb
@@ -374,10 +377,12 @@ def ipex_hijacks():
     torch.linspace = torch_linspace
     torch.eye = torch_eye
     torch.load = torch_load
-    torch.Generator = torch_Generator
     torch.cuda.synchronize = torch_cuda_synchronize
     torch.cuda.device = torch_cuda_device

+    torch.Generator = torch_Generator
+    torch._C.Generator = torch_Generator
+
     torch.backends.cuda.sdp_kernel = return_null_context
     torch.nn.DataParallel = DummyDataParallel
     torch.UntypedStorage.is_cuda = is_cuda
```
modules/memstats.py (+5, -1)

```diff
@@ -24,14 +24,18 @@ def get_docker_limit():
             docker_limit = float(f.read())
     except Exception:
         docker_limit = sys.float_info.max
+    if docker_limit == 0:
+        docker_limit = sys.float_info.max
     return docker_limit


 def get_runpod_limit():
     global runpod_limit # pylint: disable=global-statement
     if runpod_limit is not None:
         return runpod_limit
-    runpod_limit = float(os.environ.get('RUNPOD_MEM_GB', sys.float_info.max))
+    runpod_limit = float(os.environ.get('RUNPOD_MEM_GB', 0)) * 1024 * 1024 * 1024
+    if runpod_limit == 0:
+        runpod_limit = sys.float_info.max
     return runpod_limit
```
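
`RUNPOD_MEM_GB` is a value in gigabytes, so it is now converted to bytes before being compared with the other memory counters, and an unset or zero value falls back to "no limit" instead of becoming a 0-byte cap. A standalone sketch of the same conversion:

```python
# Standalone sketch of the RunPod limit handling shown above.
import os
import sys

def runpod_limit_bytes() -> float:
    # RUNPOD_MEM_GB is given in gigabytes; convert to bytes so it is
    # comparable to the other memory counters.
    limit = float(os.environ.get('RUNPOD_MEM_GB', 0)) * 1024 * 1024 * 1024
    if limit == 0:  # unset or explicitly zero -> treat as "no limit"
        limit = sys.float_info.max
    return limit

os.environ['RUNPOD_MEM_GB'] = '48'
print(runpod_limit_bytes())  # 51539607552.0 (48 GB in bytes)
```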

modules/processing_info.py (+2)

```diff
@@ -74,6 +74,8 @@ def create_infotext(p: StableDiffusionProcessing, all_prompts=None, all_seeds=No
         args["VAE"] = (None if not shared.opts.add_model_name_to_info or sd_vae.loaded_vae_file is None else os.path.splitext(os.path.basename(sd_vae.loaded_vae_file))[0])
     elif p.vae_type == 'Tiny':
         args["VAE"] = 'TAESD'
+    elif p.vae_type == 'Remote':
+        args["VAE"] = 'Remote'
     if shared.opts.add_model_name_to_info and getattr(shared.sd_model, 'sd_checkpoint_info', None) is not None:
         args["Model"] = shared.sd_model.sd_checkpoint_info.model_name.replace(',', '').replace(':', '')
     if shared.opts.add_model_hash_to_info and getattr(shared.sd_model, 'sd_model_hash', None) is not None:
```
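
With the extra branch, generations that use the remote VAE now record `VAE: Remote` in the image infotext, which is the metadata fix credited to @iDeNoh in the changelog. A reduced sketch covering only the branches visible in this hunk; the surrounding `p`/`args` handling is omitted:

```python
# Reduced sketch of the vae_type -> infotext mapping; only the branches
# visible in the diff are reproduced, everything else is omitted.
def vae_infotext(vae_type: str) -> dict:
    args = {}
    if vae_type == 'Tiny':
        args['VAE'] = 'TAESD'
    elif vae_type == 'Remote':
        args['VAE'] = 'Remote'  # branch added in this commit
    return args

print(vae_infotext('Remote'))  # {'VAE': 'Remote'}
```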

modules/ui_control.py (+12, -7)

```diff
@@ -33,15 +33,20 @@ def return_stats(t: float = None):
     gpu = ''
     cpu = ''
     if not shared.mem_mon.disabled:
-        vram = {k: -(v//-(1024*1024)) for k, v in shared.mem_mon.read().items()}
+        mem_mon_read = shared.mem_mon.read()
+        ooms = mem_mon_read.pop("oom")
+        retries = mem_mon_read.pop("retries")
+        vram = {k: v//1048576 for k, v in mem_mon_read.items()}
         peak = max(vram['active_peak'], vram['reserved_peak'], vram['used'])
         used = round(100.0 * peak / vram['total']) if vram['total'] > 0 else 0
-        if used > 0:
-            gpu += f"| GPU {peak} MB {used}%"
-        gpu += f" | retries {vram['retries']} oom {vram['oom']}" if vram.get('retries', 0) > 0 or vram.get('oom', 0) > 0 else ''
-        ram = shared.ram_stats()
-        if ram['used'] > 0:
-            cpu += f"| RAM {ram['used']} GB {round(100.0 * ram['used'] / ram['total'])}%"
+        if peak > 0:
+            gpu += f"| GPU {peak} MB"
+            gpu += f" {used}%" if used > 0 else ''
+        gpu += f" | retries {retries} oom {ooms}" if retries > 0 or ooms > 0 else ''
+        ram = shared.ram_stats()
+        if ram['used'] > 0:
+            cpu += f"| RAM {ram['used']} GB"
+            cpu += f" {round(100.0 * ram['used'] / ram['total'])}%" if ram['total'] > 0 else ''
     return f"<div class='performance'><p>{elapsed_text} {summary} {gpu} {cpu}</p></div>"
```
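
This mirrors the `call_queue.py` change above: the non-byte counters (`oom`, `retries`) are popped from the monitor snapshot before the remaining values are converted to megabytes, and percentage suffixes are added only when the totals are known. A small sketch of the pop-then-convert step, with made-up byte counts:

```python
# Sketch of the mem_mon.read() handling: counters that are not byte values
# ('oom', 'retries') are popped before converting the rest to MB.
sample = {'active_peak': 5 * 1048576, 'reserved_peak': 6 * 1048576,
          'used': 4 * 1048576, 'total': 8 * 1048576, 'oom': 1, 'retries': 2}

ooms = sample.pop('oom')
retries = sample.pop('retries')
vram = {k: v // 1048576 for k, v in sample.items()}  # bytes -> MB
print(vram, ooms, retries)
# {'active_peak': 5, 'reserved_peak': 6, 'used': 4, 'total': 8} 1 2
```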
