Skip to content

cuda_core forward compatibility changes. #722

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions cuda_core/cuda/core/experimental/_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditi
default_value = 0
flags = 0

status, _, graph, _, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))
status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: If we're packing anyway, maybe we pack them together?

Suggested change
status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))
status, _, graph, *_ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))

if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE:
raise RuntimeError("Cannot create a conditional handle when graph is not being built")

Expand All @@ -486,20 +486,22 @@ def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditi

def _cond_with_params(self, node_params) -> GraphBuilder:
# Get current capture info to ensure we're in a valid state
status, _, graph, dependencies, num_dependencies = handle_return(
status, _, graph, *deps_info, num_dependencies = handle_return(
driver.cuStreamGetCaptureInfo(self._mnff.stream.handle)
)
if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE:
raise RuntimeError("Cannot add conditional node when not actively capturing")

# Add the conditional node to the graph
node = handle_return(driver.cuGraphAddNode(graph, dependencies, num_dependencies, node_params))
deps_info_update = [
[handle_return(driver.cuGraphAddNode(graph, *deps_info, num_dependencies, node_params))]
] + [None] * (len(deps_info) - 1)

# Update the stream's capture dependencies
handle_return(
driver.cuStreamUpdateCaptureDependencies(
self._mnff.stream.handle,
[node], # dependencies
*deps_info_update, # dependencies, edgeData
1, # numDependencies
driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES,
)
Expand Down Expand Up @@ -677,17 +679,23 @@ def add_child(self, child_graph: GraphBuilder):
raise ValueError("Parent graph is not being built.")

stream_handle = self._mnff.stream.handle
_, _, graph_out, dependencies_out, num_dependencies_out = handle_return(
_, _, graph_out, *deps_info_out, num_dependencies_out = handle_return(
driver.cuStreamGetCaptureInfo(stream_handle)
)

child_node = handle_return(
driver.cuGraphAddChildGraphNode(graph_out, dependencies_out, num_dependencies_out, child_graph._mnff.graph)
)
deps_info_update = [
[
handle_return(
driver.cuGraphAddChildGraphNode(
graph_out, deps_info_out[0], num_dependencies_out, child_graph._mnff.graph
)
)
]
] + [None] * (len(deps_info_out) - 1)
handle_return(
driver.cuStreamUpdateCaptureDependencies(
stream_handle,
[child_node],
*deps_info_update, # dependencies, edgeData
1,
driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES,
)
Expand Down
5 changes: 2 additions & 3 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright 2024 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os
import helpers

try:
from cuda.bindings import driver
Expand Down Expand Up @@ -65,5 +65,4 @@ def pop_all_contexts():
return pop_all_contexts


# TODO: make the fixture more sophisticated using path finder
skipif_need_cuda_headers = pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need CUDA header")
skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header")
16 changes: 16 additions & 0 deletions cuda_core/tests/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os

# Resolve CUDA header locations from the CUDA_PATH environment variable.
CUDA_PATH = os.environ.get("CUDA_PATH")

# CUDA_INCLUDE_PATH: the toolkit's "include" directory, or None when CUDA_PATH
# is unset or the directory does not exist.
# CCCL_INCLUDE_PATHS: tuple of include directories for CCCL headers — the
# bundled "include/cccl" subdirectory is listed first when present — or None
# when no include directory was found.
CUDA_INCLUDE_PATH = None
CCCL_INCLUDE_PATHS = None
if CUDA_PATH is not None:
    _include_dir = os.path.join(CUDA_PATH, "include")
    if os.path.isdir(_include_dir):
        CUDA_INCLUDE_PATH = _include_dir
        _cccl_dir = os.path.join(_include_dir, "cccl")
        if os.path.isdir(_cccl_dir):
            # Prefer the newer bundled CCCL headers over the plain include dir.
            CCCL_INCLUDE_PATHS = (_cccl_dir, _include_dir)
        else:
            CCCL_INCLUDE_PATHS = (_include_dir,)
3 changes: 2 additions & 1 deletion cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import platform
import time

import helpers
import numpy as np
import pytest
from conftest import skipif_need_cuda_headers
Expand Down Expand Up @@ -149,7 +150,7 @@ def test_error_timing_incomplete():
program_options = ProgramOptions(
std="c++17",
arch=f"sm_{arch}",
include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
include_path=helpers.CCCL_INCLUDE_PATHS,
)
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile(target_type="cubin")
Expand Down
16 changes: 6 additions & 10 deletions cuda_core/tests/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# SPDX-License-Identifier: Apache-2.0

import ctypes
import os
import pathlib

import helpers

try:
import cupy as cp
Expand Down Expand Up @@ -107,7 +107,7 @@ def test_launch_invalid_values(init_cuda):
(ctypes.c_float, "float", 3.14),
(ctypes.c_double, "double", 2.718),
)
if os.environ.get("CUDA_PATH"):
if helpers.CCCL_INCLUDE_PATHS is not None:
PARAMS += (
(np.float16, "half", 0.78),
(np.complex64, "cuda::std::complex<float>", 1 + 2j),
Expand Down Expand Up @@ -141,18 +141,15 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value):

# Compile and force instantiation for this type
arch = "".join(f"{i}" for i in dev.compute_capability)
if os.environ.get("CUDA_PATH"):
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
if helpers.CCCL_INCLUDE_PATHS is not None:
code = (
r"""
#include <cuda_fp16.h>
#include <cuda/std/complex>
"""
+ code
)
else:
include_path = None
pro_opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path)
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=helpers.CCCL_INCLUDE_PATHS)
prog = Program(code, code_type="c++", options=pro_opts)
ker_name = f"write_scalar<{cpp_type}>"
mod = prog.compile("cubin", name_expressions=(ker_name,))
Expand Down Expand Up @@ -186,8 +183,7 @@ def test_cooperative_launch():

# Compile and force instantiation for this type
arch = "".join(f"{i}" for i in dev.compute_capability)
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=include_path)
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=helpers.CCCL_INCLUDE_PATHS)
prog = Program(code, code_type="c++", options=pro_opts)
ker = prog.compile("cubin").get_kernel("test_grid_sync")

Expand Down
Loading