Commit 6633e8e

Merge pull request #3819 from vladmandic/dev

Dev

2 parents: 8513cd8 + fdf4999

8 files changed: +56, -28 lines

CHANGELOG.md (+5, -1)

```diff
@@ -1,16 +1,20 @@
 # Change Log for SD.Next

-## Update for 2025-03-12
+## Update for 2025-03-14

 - fix installer not starting when older version of rich is installed
 - fix circular imports when debug flags are enabled
 - fix cuda errors with directml
 - fix memory stats not displaying the ram usage
+- fix runpod memory limit reporting
+- fix remote vae not being stored in metadata, thanks @iDeNoh
+- add --upgrade to torch_command when using --use-nightly for ipex and rocm
 - **ipex**
   - add xpu to profiler
   - fix untyped_storage, torch.eye and torch.cuda.device ops
   - fix torch 2.7 compatibility
   - fix performance with balanced offload
+  - fix triton and torch.compile

 ## Update for 2025-02-28
```

installer.py (+4, -4)

```diff
@@ -667,11 +667,11 @@ def install_rocm_zluda():

     if args.use_nightly:
         if rocm.version is None or float(rocm.version) >= 6.3: # assume the latest if version check fails
-            torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.3')
+            torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.3')
         elif rocm.version == "6.2": # use rocm 6.2.4 instead of 6.2 as torch+rocm6.2 doesn't exists
-            torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.2.4')
+            torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.2.4')
         else: # oldest rocm version on nightly is 6.1
-            torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.1')
+            torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm6.1')
     else:
         if rocm.version is None or float(rocm.version) >= 6.2: # assume the latest if version check fails
             # use rocm 6.2.4 instead of 6.2 as torch==2.6.0+rocm6.2 doesn't exists
@@ -735,7 +735,7 @@ def install_ipex(torch_command):
     # os.environ.setdefault('IGC_EnableDPEmulation', '1') # FP64 Emulation

     if args.use_nightly:
-        torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu')
+        torch_command = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu')
     else:
         torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.6.0+xpu torchvision==0.21.0+xpu --index-url https://download.pytorch.org/whl/xpu')
```
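
The only functional change here is the added `--upgrade` flag: without it, pip treats an already-installed nightly build as satisfying the requirement, so `--use-nightly` never actually advances to a newer wheel. Below is a minimal sketch, not SD.Next's installer code, of how a `TORCH_COMMAND` string typically turns into a pip invocation; the `pip_install` helper and its dry-run flag are hypothetical.

```python
# Illustrative sketch only; 'pip_install' is a hypothetical helper, not the
# installer's real function. It shows how a TORCH_COMMAND string becomes a
# pip command line and why --upgrade matters for nightly wheels.
import os
import shlex
import subprocess
import sys

def pip_install(torch_command: str, dry_run: bool = True) -> list:
    # Prepend the interpreter's pip and split the command string into args.
    args = [sys.executable, '-m', 'pip', 'install'] + shlex.split(torch_command)
    if not dry_run:
        subprocess.run(args, check=True)  # would actually install the wheels
    return args

# With --upgrade, pip replaces an already-installed nightly build instead of
# reporting "requirement already satisfied" and keeping the old one.
cmd = os.environ.get('TORCH_COMMAND', '--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu')
print(' '.join(pip_install(cmd)))
```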

modules/call_queue.py (+5, -3)

```diff
@@ -83,12 +83,14 @@ def f(*args, extra_outputs_array=extra_outputs, **kwargs):
             vram = {k: v//1048576 for k, v in mem_mon_read.items()}
             peak = max(vram['active_peak'], vram['reserved_peak'], vram['used'])
             used = round(100.0 * peak / vram['total']) if vram['total'] > 0 else 0
-            if used > 0:
-                gpu += f"| GPU {peak} MB {used}%"
+            if peak > 0:
+                gpu += f"| GPU {peak} MB"
+                gpu += f" {used}%" if used > 0 else ''
             gpu += f" | retries {retries} oom {ooms}" if retries > 0 or ooms > 0 else ''
             ram = shared.ram_stats()
             if ram['used'] > 0:
-                cpu += f"| RAM {ram['used']} GB {round(100.0 * ram['used'] / ram['total'])}%"
+                cpu += f"| RAM {ram['used']} GB"
+                cpu += f" {round(100.0 * ram['used'] / ram['total'])}%" if ram['total'] > 0 else ''
         if isinstance(res, list):
             res[-1] += f"<div class='performance'><p>Time: {elapsed_text} | {summary} {gpu} {cpu}</p></div>"
         return tuple(res)
```
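
The reworked formatting prints the peak VRAM figure whenever it is non-zero and appends percentages only when the corresponding totals are known, so a monitor that reports `total` as 0 no longer hides the absolute numbers. A standalone sketch of the same guard logic, with made-up sample values:

```python
# Standalone sketch of the guarded stats formatting; the sample numbers are
# made up and the dicts mimic the 'vram'/'ram' values used in f() above.
def format_stats(vram: dict, ram: dict) -> str:
    gpu, cpu = '', ''
    peak = max(vram['active_peak'], vram['reserved_peak'], vram['used'])
    used = round(100.0 * peak / vram['total']) if vram['total'] > 0 else 0
    if peak > 0:
        gpu += f"| GPU {peak} MB"
        gpu += f" {used}%" if used > 0 else ''  # no percentage when total is unknown
    if ram['used'] > 0:
        cpu += f"| RAM {ram['used']} GB"
        cpu += f" {round(100.0 * ram['used'] / ram['total'])}%" if ram['total'] > 0 else ''
    return f"{gpu} {cpu}".strip()

# A total of 0 simulates a monitor that cannot report device memory:
print(format_stats({'active_peak': 4096, 'reserved_peak': 5120, 'used': 3072, 'total': 0},
                   {'used': 12, 'total': 64}))
# -> | GPU 5120 MB | RAM 12 GB 19%
```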

modules/intel/ipex/__init__.py (+9, -3)

```diff
@@ -18,7 +18,10 @@ def ipex_init(): # pylint: disable=too-many-statements
     if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
         return True, "Skipping IPEX hijack"
     else:
-        try: # force xpu device on torch compile and triton
+        try:
+            # force xpu device on torch compile and triton
+            # import inductor utils to get around lazy import
+            from torch._inductor import utils as torch_inductor_utils # pylint: disable=import-error, unused-import
             torch._inductor.utils.GPU_TYPES = ["xpu"]
             torch._inductor.utils.get_gpu_type = lambda *args, **kwargs: "xpu"
             from triton import backends as triton_backends # pylint: disable=import-error
@@ -187,11 +190,13 @@ def ipex_init(): # pylint: disable=too-many-statements
         ipex._C._DeviceProperties.multi_processor_count = ipex._C._DeviceProperties.gpu_subslice_count
         ipex._C._DeviceProperties.major = 12
         ipex._C._DeviceProperties.minor = 1
+        ipex._C._DeviceProperties.L2_cache_size = 16*1024*1024 # A770 and A750
     else:
         torch._C._cuda_getCurrentRawStream = torch._C._xpu_getCurrentRawStream
         torch._C._XpuDeviceProperties.multi_processor_count = torch._C._XpuDeviceProperties.gpu_subslice_count
         torch._C._XpuDeviceProperties.major = 12
         torch._C._XpuDeviceProperties.minor = 1
+        torch._C._XpuDeviceProperties.L2_cache_size = 16*1024*1024 # A770 and A750

     # Fix functions with ipex:
     # torch.xpu.mem_get_info always returns the total memory as free memory
@@ -200,14 +205,15 @@ def ipex_init(): # pylint: disable=too-many-statements
     torch._utils._get_available_device_type = lambda: "xpu"
     torch.has_cuda = True
    torch.cuda.has_half = True
-    torch.cuda.is_bf16_supported = lambda *args, **kwargs: True
+    torch.cuda.is_bf16_supported = getattr(torch.xpu, "is_bf16_supported", lambda *args, **kwargs: True)
     torch.cuda.is_fp16_supported = lambda *args, **kwargs: True
     torch.backends.cuda.is_built = lambda *args, **kwargs: True
     torch.version.cuda = "12.1"
-    torch.cuda.get_arch_list = lambda: ["ats-m150", "pvc"]
+    torch.cuda.get_arch_list = getattr(torch.xpu, "get_arch_list", lambda: ["pvc", "dg2", "ats-m150"])
     torch.cuda.get_device_capability = lambda *args, **kwargs: (12,1)
     torch.cuda.get_device_properties.major = 12
     torch.cuda.get_device_properties.minor = 1
+    torch.cuda.get_device_properties.L2_cache_size = 16*1024*1024 # A770 and A750
     torch.cuda.ipc_collect = lambda *args, **kwargs: None
     torch.cuda.utilization = lambda *args, **kwargs: 0
```
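
Two of the CUDA shims now defer to the real `torch.xpu` queries when they exist (`is_bf16_supported`, `get_arch_list`) and keep the hard-coded answers only as a fallback. A small sketch of that getattr-with-fallback pattern, using a stand-in namespace so it runs without IPEX or an Intel GPU:

```python
# Sketch of the getattr-with-fallback pattern used above; 'fake_xpu' stands in
# for torch.xpu so the example runs without an Intel GPU or IPEX installed.
from types import SimpleNamespace

fake_xpu = SimpleNamespace()  # pretend torch.xpu without get_arch_list()

# Prefer the native query when the attribute exists, otherwise fall back:
get_arch_list = getattr(fake_xpu, 'get_arch_list', lambda: ['pvc', 'dg2', 'ats-m150'])
print(get_arch_list())  # ['pvc', 'dg2', 'ats-m150'] -> fallback path

fake_xpu.get_arch_list = lambda: ['made-up-arch']  # now the attribute exists
get_arch_list = getattr(fake_xpu, 'get_arch_list', lambda: ['pvc', 'dg2', 'ats-m150'])
print(get_arch_list())  # ['made-up-arch'] -> native path wins
```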

modules/intel/ipex/hijacks.py (+14, -9)

```diff
@@ -332,14 +332,6 @@ def torch_load(f, map_location=None, *args, **kwargs):
     else:
         return original_torch_load(f, *args, map_location=map_location, **kwargs)

-original_torch_Generator = torch.Generator
-@wraps(torch.Generator)
-def torch_Generator(device=None):
-    if check_cuda(device):
-        return original_torch_Generator(return_xpu(device))
-    else:
-        return original_torch_Generator(device)
-
 @wraps(torch.cuda.synchronize)
 def torch_cuda_synchronize(device=None):
     if check_cuda(device):
@@ -355,6 +347,17 @@ def torch_cuda_device(device):
         return torch.xpu.device(device)


+# torch.Generator has to be a class for isinstance checks
+original_torch_Generator = torch.Generator
+class torch_Generator(original_torch_Generator):
+    def __new__(self, device=None):
+        # can't hijack __init__ because of C override so use return super().__new__
+        if check_cuda(device):
+            return super().__new__(self, return_xpu(device))
+        else:
+            return super().__new__(self, device)
+
+
 # Hijack Functions:
 def ipex_hijacks():
     global device_supports_fp64, can_allocate_plus_4gb
@@ -374,10 +377,12 @@ def ipex_hijacks():
     torch.linspace = torch_linspace
     torch.eye = torch_eye
     torch.load = torch_load
-    torch.Generator = torch_Generator
     torch.cuda.synchronize = torch_cuda_synchronize
     torch.cuda.device = torch_cuda_device

+    torch.Generator = torch_Generator
+    torch._C.Generator = torch_Generator
+
     torch.backends.cuda.sdp_kernel = return_null_context
     torch.nn.DataParallel = DummyDataParallel
     torch.UntypedStorage.is_cuda = is_cuda
```
modules/memstats.py (+5, -1)

```diff
@@ -24,14 +24,18 @@ def get_docker_limit():
             docker_limit = float(f.read())
     except Exception:
         docker_limit = sys.float_info.max
+    if docker_limit == 0:
+        docker_limit = sys.float_info.max
     return docker_limit


 def get_runpod_limit():
     global runpod_limit # pylint: disable=global-statement
     if runpod_limit is not None:
         return runpod_limit
-    runpod_limit = float(os.environ.get('RUNPOD_MEM_GB', sys.float_info.max))
+    runpod_limit = float(os.environ.get('RUNPOD_MEM_GB', 0)) * 1024 * 1024 * 1024
+    if runpod_limit == 0:
+        runpod_limit = sys.float_info.max
     return runpod_limit
```
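
`RUNPOD_MEM_GB` is a value in gigabytes, so it is now converted to bytes before being compared with the other memory counters, and an unset or zero value falls back to "no limit" instead of becoming a 0-byte cap. A standalone sketch of the same conversion:

```python
# Standalone sketch of the RunPod limit handling shown above.
import os
import sys

def runpod_limit_bytes() -> float:
    # RUNPOD_MEM_GB is given in gigabytes; convert to bytes so it is
    # comparable to the other memory counters.
    limit = float(os.environ.get('RUNPOD_MEM_GB', 0)) * 1024 * 1024 * 1024
    if limit == 0:  # unset or explicitly zero -> treat as "no limit"
        limit = sys.float_info.max
    return limit

os.environ['RUNPOD_MEM_GB'] = '48'
print(runpod_limit_bytes())  # 51539607552.0 (48 GB in bytes)
```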

modules/processing_info.py (+2)

```diff
@@ -74,6 +74,8 @@ def create_infotext(p: StableDiffusionProcessing, all_prompts=None, all_seeds=No
         args["VAE"] = (None if not shared.opts.add_model_name_to_info or sd_vae.loaded_vae_file is None else os.path.splitext(os.path.basename(sd_vae.loaded_vae_file))[0])
     elif p.vae_type == 'Tiny':
         args["VAE"] = 'TAESD'
+    elif p.vae_type == 'Remote':
+        args["VAE"] = 'Remote'
     if shared.opts.add_model_name_to_info and getattr(shared.sd_model, 'sd_checkpoint_info', None) is not None:
         args["Model"] = shared.sd_model.sd_checkpoint_info.model_name.replace(',', '').replace(':', '')
     if shared.opts.add_model_hash_to_info and getattr(shared.sd_model, 'sd_model_hash', None) is not None:
```
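
With the extra branch, generations that use the remote VAE now record `VAE: Remote` in the image infotext, which is the metadata fix credited to @iDeNoh in the changelog. A reduced sketch covering only the branches visible in this hunk; the surrounding `p`/`args` handling is omitted:

```python
# Reduced sketch of the vae_type -> infotext mapping; only the branches
# visible in the diff are reproduced, everything else is omitted.
def vae_infotext(vae_type: str) -> dict:
    args = {}
    if vae_type == 'Tiny':
        args['VAE'] = 'TAESD'
    elif vae_type == 'Remote':
        args['VAE'] = 'Remote'  # branch added in this commit
    return args

print(vae_infotext('Remote'))  # {'VAE': 'Remote'}
```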

modules/ui_control.py (+12, -7)

```diff
@@ -33,15 +33,20 @@ def return_stats(t: float = None):
     gpu = ''
     cpu = ''
     if not shared.mem_mon.disabled:
-        vram = {k: -(v//-(1024*1024)) for k, v in shared.mem_mon.read().items()}
+        mem_mon_read = shared.mem_mon.read()
+        ooms = mem_mon_read.pop("oom")
+        retries = mem_mon_read.pop("retries")
+        vram = {k: v//1048576 for k, v in mem_mon_read.items()}
         peak = max(vram['active_peak'], vram['reserved_peak'], vram['used'])
         used = round(100.0 * peak / vram['total']) if vram['total'] > 0 else 0
-        if used > 0:
-            gpu += f"| GPU {peak} MB {used}%"
-        gpu += f" | retries {vram['retries']} oom {vram['oom']}" if vram.get('retries', 0) > 0 or vram.get('oom', 0) > 0 else ''
-        ram = shared.ram_stats()
-        if ram['used'] > 0:
-            cpu += f"| RAM {ram['used']} GB {round(100.0 * ram['used'] / ram['total'])}%"
+        if peak > 0:
+            gpu += f"| GPU {peak} MB"
+            gpu += f" {used}%" if used > 0 else ''
+        gpu += f" | retries {retries} oom {ooms}" if retries > 0 or ooms > 0 else ''
+        ram = shared.ram_stats()
+        if ram['used'] > 0:
+            cpu += f"| RAM {ram['used']} GB"
+            cpu += f" {round(100.0 * ram['used'] / ram['total'])}%" if ram['total'] > 0 else ''
     return f"<div class='performance'><p>{elapsed_text} {summary} {gpu} {cpu}</p></div>"
```
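
This mirrors the `call_queue.py` change above: the non-byte counters (`oom`, `retries`) are popped from the monitor snapshot before the remaining values are converted to megabytes, and percentage suffixes are added only when the totals are known. A small sketch of the pop-then-convert step, with made-up byte counts:

```python
# Sketch of the mem_mon.read() handling: counters that are not byte values
# ('oom', 'retries') are popped before converting the rest to MB.
sample = {'active_peak': 5 * 1048576, 'reserved_peak': 6 * 1048576,
          'used': 4 * 1048576, 'total': 8 * 1048576, 'oom': 1, 'retries': 2}

ooms = sample.pop('oom')
retries = sample.pop('retries')
vram = {k: v // 1048576 for k, v in sample.items()}  # bytes -> MB
print(vram, ooms, retries)
# {'active_peak': 5, 'reserved_peak': 6, 'used': 4, 'total': 8} 1 2
```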
