Skip to content

cuda_core forward compatibility changes. #722

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions cuda_core/cuda/core/experimental/_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditi
default_value = 0
flags = 0

status, _, graph, _, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))
status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: If we're packing anyway, maybe we pack them together?

Suggested change
status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))
status, _, graph, *_ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))

if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE:
raise RuntimeError("Cannot create a conditional handle when graph is not being built")

Expand All @@ -486,20 +486,22 @@ def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditi

def _cond_with_params(self, node_params) -> GraphBuilder:
# Get current capture info to ensure we're in a valid state
status, _, graph, dependencies, num_dependencies = handle_return(
status, _, graph, *deps_info, num_dependencies = handle_return(
driver.cuStreamGetCaptureInfo(self._mnff.stream.handle)
)
if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE:
raise RuntimeError("Cannot add conditional node when not actively capturing")

# Add the conditional node to the graph
node = handle_return(driver.cuGraphAddNode(graph, dependencies, num_dependencies, node_params))
deps_info_update = [
[handle_return(driver.cuGraphAddNode(graph, *deps_info, num_dependencies, node_params))]
] + [None] * (len(deps_info) - 1)

# Update the stream's capture dependencies
handle_return(
driver.cuStreamUpdateCaptureDependencies(
self._mnff.stream.handle,
[node], # dependencies
*deps_info_update, # dependencies, edgeData
1, # numDependencies
driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES,
)
Expand Down Expand Up @@ -677,17 +679,23 @@ def add_child(self, child_graph: GraphBuilder):
raise ValueError("Parent graph is not being built.")

stream_handle = self._mnff.stream.handle
_, _, graph_out, dependencies_out, num_dependencies_out = handle_return(
_, _, graph_out, *deps_info_out, num_dependencies_out = handle_return(
driver.cuStreamGetCaptureInfo(stream_handle)
)

child_node = handle_return(
driver.cuGraphAddChildGraphNode(graph_out, dependencies_out, num_dependencies_out, child_graph._mnff.graph)
)
deps_info_update = [
[
handle_return(
driver.cuGraphAddChildGraphNode(
graph_out, deps_info_out[0], num_dependencies_out, child_graph._mnff.graph
)
)
]
] + [None] * (len(deps_info_out) - 1)
handle_return(
driver.cuStreamUpdateCaptureDependencies(
stream_handle,
[child_node],
*deps_info_update, # dependencies, edgeData
1,
driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES,
)
Expand Down
5 changes: 2 additions & 3 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright 2024 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os
import helpers

try:
from cuda.bindings import driver
Expand Down Expand Up @@ -65,5 +65,4 @@ def pop_all_contexts():
return pop_all_contexts


# TODO: make the fixture more sophisticated using path finder
skipif_need_cuda_headers = pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need CUDA header")
skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header")
16 changes: 16 additions & 0 deletions cuda_core/tests/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os

# Resolve CUDA header locations from the CUDA_PATH environment variable.
CUDA_PATH = os.environ.get("CUDA_PATH")

# CUDA_INCLUDE_PATH: the toolkit's "include" directory, or None when CUDA_PATH
# is unset or the directory does not exist.
# CCCL_INCLUDE_PATHS: tuple of include directories for CCCL headers — the
# bundled "include/cccl" subdirectory is listed first when present — or None
# when no include directory was found.
CUDA_INCLUDE_PATH = None
CCCL_INCLUDE_PATHS = None
if CUDA_PATH is not None:
    _include_dir = os.path.join(CUDA_PATH, "include")
    if os.path.isdir(_include_dir):
        CUDA_INCLUDE_PATH = _include_dir
        _cccl_dir = os.path.join(_include_dir, "cccl")
        if os.path.isdir(_cccl_dir):
            # Prefer the newer bundled CCCL headers over the plain include dir.
            CCCL_INCLUDE_PATHS = (_cccl_dir, _include_dir)
        else:
            CCCL_INCLUDE_PATHS = (_include_dir,)
3 changes: 2 additions & 1 deletion cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import platform
import time

import helpers
import numpy as np
import pytest
from conftest import skipif_need_cuda_headers
Expand Down Expand Up @@ -149,7 +150,7 @@ def test_error_timing_incomplete():
program_options = ProgramOptions(
std="c++17",
arch=f"sm_{arch}",
include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
include_path=helpers.CCCL_INCLUDE_PATHS,
)
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile(target_type="cubin")
Expand Down
16 changes: 6 additions & 10 deletions cuda_core/tests/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# SPDX-License-Identifier: Apache-2.0

import ctypes
import os
import pathlib

import helpers

try:
import cupy as cp
Expand Down Expand Up @@ -107,7 +107,7 @@ def test_launch_invalid_values(init_cuda):
(ctypes.c_float, "float", 3.14),
(ctypes.c_double, "double", 2.718),
)
if os.environ.get("CUDA_PATH"):
if helpers.CCCL_INCLUDE_PATHS is not None:
PARAMS += (
(np.float16, "half", 0.78),
(np.complex64, "cuda::std::complex<float>", 1 + 2j),
Expand Down Expand Up @@ -141,18 +141,15 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value):

# Compile and force instantiation for this type
arch = "".join(f"{i}" for i in dev.compute_capability)
if os.environ.get("CUDA_PATH"):
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
if helpers.CCCL_INCLUDE_PATHS is not None:
code = (
r"""
#include <cuda_fp16.h>
#include <cuda/std/complex>
"""
+ code
)
else:
include_path = None
pro_opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path)
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=helpers.CCCL_INCLUDE_PATHS)
prog = Program(code, code_type="c++", options=pro_opts)
ker_name = f"write_scalar<{cpp_type}>"
mod = prog.compile("cubin", name_expressions=(ker_name,))
Expand Down Expand Up @@ -186,8 +183,7 @@ def test_cooperative_launch():

# Compile and force instantiation for this type
arch = "".join(f"{i}" for i in dev.compute_capability)
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=include_path)
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=helpers.CCCL_INCLUDE_PATHS)
prog = Program(code, code_type="c++", options=pro_opts)
ker = prog.compile("cubin").get_kernel("test_grid_sync")

Expand Down
Loading