From daab58000e5949c375dac86af8dfaef97190b8ec Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 11 Dec 2024 16:23:06 -0800
Subject: [PATCH 01/66] Add cccl/python/cuda_cccl directory and use from
 cuda_parallel, cuda_cooperative

---
 python/cuda_cccl/.gitignore                   |  2 +
 .../MANIFEST.in                               |  0
 python/cuda_cccl/README.md                    | 11 +++
 python/cuda_cccl/pyproject.toml               |  7 ++
 python/cuda_cccl/setup.py                     | 75 +++++++++++++++++++
 python/cuda_cooperative/.gitignore            |  1 -
 python/cuda_cooperative/README.md             | 13 +++-
 python/cuda_cooperative/setup.py              | 26 +------
 python/cuda_parallel/.gitignore               |  1 -
 python/cuda_parallel/MANIFEST.in              |  1 -
 python/cuda_parallel/README.md                | 13 +++-
 python/cuda_parallel/setup.py                 | 33 ++------
 12 files changed, 126 insertions(+), 57 deletions(-)
 create mode 100644 python/cuda_cccl/.gitignore
 rename python/{cuda_cooperative => cuda_cccl}/MANIFEST.in (100%)
 create mode 100644 python/cuda_cccl/README.md
 create mode 100644 python/cuda_cccl/pyproject.toml
 create mode 100644 python/cuda_cccl/setup.py
 delete mode 100644 python/cuda_parallel/MANIFEST.in

diff --git a/python/cuda_cccl/.gitignore b/python/cuda_cccl/.gitignore
new file mode 100644
index 00000000000..3beca7c8684
--- /dev/null
+++ b/python/cuda_cccl/.gitignore
@@ -0,0 +1,2 @@
+cuda/_include
+*egg-info
diff --git a/python/cuda_cooperative/MANIFEST.in b/python/cuda_cccl/MANIFEST.in
similarity index 100%
rename from python/cuda_cooperative/MANIFEST.in
rename to python/cuda_cccl/MANIFEST.in
diff --git a/python/cuda_cccl/README.md b/python/cuda_cccl/README.md
new file mode 100644
index 00000000000..89ea15b2899
--- /dev/null
+++ b/python/cuda_cccl/README.md
@@ -0,0 +1,11 @@
+# `cuda.cccl`: Experimental CUDA Core Compute Library Python module with CCCL headers
+
+## Documentation
+
+Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
+
+## Local development
+
+```bash
+pip3 install .
+```
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
new file mode 100644
index 00000000000..4ab52c80318
--- /dev/null
+++ b/python/cuda_cccl/pyproject.toml
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+[build-system]
+requires = ["packaging", "setuptools>=61.0.0", "wheel"]
+build-backend = "setuptools.build_meta"
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
new file mode 100644
index 00000000000..1e7bc48be6f
--- /dev/null
+++ b/python/cuda_cccl/setup.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import shutil
+
+from setuptools import Command, setup, find_namespace_packages
+from setuptools.command.build_py import build_py
+from wheel.bdist_wheel import bdist_wheel
+
+
+project_path = os.path.abspath(os.path.dirname(__file__))
+cccl_path = os.path.abspath(os.path.join(project_path, "..", ".."))
+cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
+ver = "0.1.2.8.0"
+
+
+with open("README.md") as f:
+    long_description = f.read()
+
+
+class CustomBuildCommand(build_py):
+    def run(self):
+        self.run_command("package_cccl")
+        build_py.run(self)
+
+
+class CustomWheelBuild(bdist_wheel):
+    def run(self):
+        self.run_command("package_cccl")
+        super().run()
+
+
+class PackageCCCLCommand(Command):
+    description = "Generate additional files"
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        for proj_dir, header_dir in cccl_headers:
+            src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
+            dst_path = os.path.join(project_path, "cuda", "_include", proj_dir)
+            if os.path.exists(dst_path):
+                shutil.rmtree(dst_path)
+            shutil.copytree(src_path, dst_path)
+
+
+setup(
+    name="cuda-cccl",
+    version=ver,
+    description="Experimental Package with CCCL headers to support JIT compilation",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="NVIDIA Corporation",
+    classifiers=[
+        "Programming Language :: Python :: 3 :: Only",
+        "Environment :: GPU :: NVIDIA CUDA",
+    ],
+    packages=find_namespace_packages(include=["cuda.*"]),
+    python_requires=">=3.9",
+    cmdclass={
+        "package_cccl": PackageCCCLCommand,
+        "build_py": CustomBuildCommand,
+        "bdist_wheel": CustomWheelBuild,
+    },
+    include_package_data=True,
+    license="Apache-2.0 with LLVM exception",
+    license_files=("../../LICENSE",),
+)
diff --git a/python/cuda_cooperative/.gitignore b/python/cuda_cooperative/.gitignore
index 15c09b246c1..a9904c10554 100644
--- a/python/cuda_cooperative/.gitignore
+++ b/python/cuda_cooperative/.gitignore
@@ -1,3 +1,2 @@
-cuda/_include
 env
 *egg-info
diff --git a/python/cuda_cooperative/README.md b/python/cuda_cooperative/README.md
index c202d1d6c17..6b505f797e3 100644
--- a/python/cuda_cooperative/README.md
+++ b/python/cuda_cooperative/README.md
@@ -6,7 +6,16 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 
 ## Local development
 
+First-time installation:
+
+```bash
+pip3 install ./cuda_cccl
+pip3 install ./cuda_cooperative[test]
+pytest -v ./cuda_cooperative/tests/
+```
+
+For faster iterative development:
+
 ```bash
-pip3 install -e .[test]
-pytest -v ./tests/
+pip3 install -e ./cuda_cooperative[test]
 ```
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index c4bbd39dd03..c5cea429a93 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -3,9 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import os
-import shutil
 
-from setuptools import Command, setup, find_namespace_packages
+from setuptools import setup, find_namespace_packages
 from setuptools.command.build_py import build_py
 from wheel.bdist_wheel import bdist_wheel
 
@@ -27,35 +26,14 @@
 
 class CustomBuildCommand(build_py):
     def run(self):
-        self.run_command("package_cccl")
         build_py.run(self)
 
 
 class CustomWheelBuild(bdist_wheel):
     def run(self):
-        self.run_command("package_cccl")
         super().run()
 
 
-class PackageCCCLCommand(Command):
-    description = "Generate additional files"
-    user_options = []
-
-    def initialize_options(self):
-        pass
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        for proj_dir, header_dir in cccl_headers:
-            src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
-            dst_path = os.path.join(project_path, "cuda", "_include", proj_dir)
-            if os.path.exists(dst_path):
-                shutil.rmtree(dst_path)
-            shutil.copytree(src_path, dst_path)
-
-
 setup(
     name="cuda-cooperative",
     version=ver,
@@ -70,6 +48,7 @@ def run(self):
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
     install_requires=[
+        "cuda-cccl",
         "numba>=0.60.0",
         "pynvjitlink-cu12>=0.2.4",
         "cuda-python",
@@ -82,7 +61,6 @@ def run(self):
         ]
     },
     cmdclass={
-        "package_cccl": PackageCCCLCommand,
         "build_py": CustomBuildCommand,
         "bdist_wheel": CustomWheelBuild,
     },
diff --git a/python/cuda_parallel/.gitignore b/python/cuda_parallel/.gitignore
index 8e0d030ff6a..7fc9da1604e 100644
--- a/python/cuda_parallel/.gitignore
+++ b/python/cuda_parallel/.gitignore
@@ -1,4 +1,3 @@
-cuda/_include
 env
 *egg-info
 *so
diff --git a/python/cuda_parallel/MANIFEST.in b/python/cuda_parallel/MANIFEST.in
deleted file mode 100644
index 848cbfe2e81..00000000000
--- a/python/cuda_parallel/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include cuda/_include *
diff --git a/python/cuda_parallel/README.md b/python/cuda_parallel/README.md
index 98a3a3c92d0..02710b50053 100644
--- a/python/cuda_parallel/README.md
+++ b/python/cuda_parallel/README.md
@@ -6,7 +6,16 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 
 ## Local development
 
+First-time installation:
+
+```bash
+pip3 install ./cuda_cccl
+pip3 install ./cuda_parallel[test]
+pytest -v ./cuda_parallel/tests/
+```
+
+For faster iterative development:
+
 ```bash
-pip3 install -e .[test]
-pytest -v ./tests/
+pip3 install -e ./cuda_parallel[test]
 ```
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 40c998fafee..f3d74fa2c10 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -3,10 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import os
-import shutil
 import subprocess
 
-from setuptools import Command, Extension, setup, find_namespace_packages
+from setuptools import Extension, setup, find_namespace_packages
 from setuptools.command.build_py import build_py
 from setuptools.command.build_ext import build_ext
 from wheel.bdist_wheel import bdist_wheel
@@ -29,36 +28,14 @@
 
 class CustomBuildCommand(build_py):
     def run(self):
-        self.run_command("package_cccl")
         build_py.run(self)
 
 
 class CustomWheelBuild(bdist_wheel):
     def run(self):
-        self.run_command("package_cccl")
         super().run()
 
 
-class PackageCCCLCommand(Command):
-    description = "Generate additional files"
-    user_options = []
-
-    def initialize_options(self):
-        pass
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        for proj_dir, header_dir in cccl_headers:
-            src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
-            # TODO Extract cccl headers into a standalone package
-            dst_path = os.path.join(project_path, "cuda", "_include", proj_dir)
-            if os.path.exists(dst_path):
-                shutil.rmtree(dst_path)
-            shutil.copytree(src_path, dst_path)
-
-
 class CMakeExtension(Extension):
     def __init__(self, name):
         super().__init__(name, sources=[])
@@ -100,7 +77,12 @@ def build_extension(self, ext):
     ],
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
-    install_requires=["numba>=0.60.0", "cuda-python", "jinja2"],
+    install_requires=[
+        "cuda-cccl",
+        "numba>=0.60.0",
+        "cuda-python",
+        "jinja2",
+    ],
     extras_require={
         "test": [
             "pytest",
@@ -109,7 +91,6 @@ def build_extension(self, ext):
         ]
     },
     cmdclass={
-        "package_cccl": PackageCCCLCommand,
         "build_py": CustomBuildCommand,
         "bdist_wheel": CustomWheelBuild,
         "build_ext": BuildCMakeExtension,

From ef9d5f4712c635d087e65e5679863fde8d3caf8a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 16:06:32 -0800
Subject: [PATCH 02/66] Run `copy_cccl_headers_to_cuda_include()` before
 `setup()`

---
 python/cuda_cccl/setup.py | 45 ++++++++-------------------------------
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 1e7bc48be6f..4a816ac67b9 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -5,9 +5,7 @@
 import os
 import shutil
 
-from setuptools import Command, setup, find_namespace_packages
-from setuptools.command.build_py import build_py
-from wheel.bdist_wheel import bdist_wheel
+from setuptools import setup, find_namespace_packages
 
 
 project_path = os.path.abspath(os.path.dirname(__file__))
@@ -20,36 +18,16 @@
     long_description = f.read()
 
 
-class CustomBuildCommand(build_py):
-    def run(self):
-        self.run_command("package_cccl")
-        build_py.run(self)
+def copy_cccl_headers_to_cuda_include():
+    for proj_dir, header_dir in cccl_headers:
+        src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
+        dst_path = os.path.join(project_path, "cuda", "_include", proj_dir)
+        if os.path.exists(dst_path):
+            shutil.rmtree(dst_path)
+        shutil.copytree(src_path, dst_path)
 
 
-class CustomWheelBuild(bdist_wheel):
-    def run(self):
-        self.run_command("package_cccl")
-        super().run()
-
-
-class PackageCCCLCommand(Command):
-    description = "Generate additional files"
-    user_options = []
-
-    def initialize_options(self):
-        pass
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        for proj_dir, header_dir in cccl_headers:
-            src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
-            dst_path = os.path.join(project_path, "cuda", "_include", proj_dir)
-            if os.path.exists(dst_path):
-                shutil.rmtree(dst_path)
-            shutil.copytree(src_path, dst_path)
-
+copy_cccl_headers_to_cuda_include()
 
 setup(
     name="cuda-cccl",
@@ -64,11 +42,6 @@ def run(self):
     ],
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
-    cmdclass={
-        "package_cccl": PackageCCCLCommand,
-        "build_py": CustomBuildCommand,
-        "bdist_wheel": CustomWheelBuild,
-    },
     include_package_data=True,
     license="Apache-2.0 with LLVM exception",
     license_files=("../../LICENSE",),

From bc116dc27d94f3e84d90c6fdea23265ea88916b5 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 16:54:48 -0800
Subject: [PATCH 03/66] Create python/cuda_cccl/cuda/_include/__init__.py, then
 simply import cuda._include to find the include path.

---
 python/cuda_cccl/setup.py                                   | 6 +++++-
 .../cuda/cooperative/experimental/_nvrtc.py                 | 4 ----
 .../cuda_parallel/cuda/parallel/experimental/_bindings.py   | 5 ++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 4a816ac67b9..5b2688779e0 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -19,9 +19,13 @@
 
 
 def copy_cccl_headers_to_cuda_include():
+    inc_path = os.path.join(project_path, "cuda", "_include")
+    init_py_path = os.path.join(inc_path, "__init__.py")
+    with open(init_py_path, "w") as f:
+        print("# Intentionally empty.", file=f)
     for proj_dir, header_dir in cccl_headers:
         src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
-        dst_path = os.path.join(project_path, "cuda", "_include", proj_dir)
+        dst_path = os.path.join(inc_path, proj_dir)
         if os.path.exists(dst_path):
             shutil.rmtree(dst_path)
         shutil.copytree(src_path, dst_path)
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index 25de1119b6e..46038ebd632 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -47,10 +47,6 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
     check_in("code", code, ["lto", "ptx"])
 
     with pkg_resources.path("cuda", "_include") as include_path:
-        # Using `.parent` for compatibility with pip install --editable:
-        include_path = pkg_resources.files("cuda.cooperative").parent.joinpath(
-            "_include"
-        )
         cub_path = include_path
         thrust_path = include_path
         libcudacxx_path = os.path.join(include_path, "libcudacxx")
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
index a4ad84a2b42..771e5dd1666 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
@@ -52,9 +52,8 @@ def get_bindings() -> ctypes.CDLL:
 
 @lru_cache()
 def get_paths() -> List[bytes]:
-    with as_file(files("cuda.parallel")) as f:
-        # Using `.parent` for compatibility with pip install --editable:
-        cub_include_path = str(f.parent / "_include")
+    with as_file(files("cuda._include")) as f:
+        cub_include_path = str(f)
     thrust_include_path = cub_include_path
     libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx"))
     cuda_include_path = None

From 2913ae07557493c407b14bdfb68495f8a23e246a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 17:18:30 -0800
Subject: [PATCH 04/66] Add cuda.cccl._version exactly as for cuda.cooperative
 and cuda.parallel

---
 ci/test_python.sh                      | 12 +++++++-----
 ci/update_version.sh                   |  2 ++
 python/cuda_cccl/cuda/cccl/_version.py |  7 +++++++
 python/cuda_cccl/setup.py              |  7 ++++++-
 4 files changed, 22 insertions(+), 6 deletions(-)
 create mode 100644 python/cuda_cccl/cuda/cccl/_version.py

diff --git a/ci/test_python.sh b/ci/test_python.sh
index bd66cc57716..89559712069 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -11,6 +11,12 @@ fail_if_no_gpu
 readonly prefix="${BUILD_DIR}/python/"
 export PYTHONPATH="${prefix}:${PYTHONPATH:-}"
 
+pushd ../python/cuda_cccl >/dev/null
+
+run_command "⚙️  Pip install cuda_cccl" pip install --force-reinstall --upgrade --target "${prefix}" .
+
+popd >/dev/null
+
 pushd ../python/cuda_cooperative >/dev/null
 
 run_command "⚙️  Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
@@ -20,11 +26,7 @@ popd >/dev/null
 
 pushd ../python/cuda_parallel >/dev/null
 
-# Temporarily install the package twice to populate include directory as part of the first installation
-# and to let manifest discover these includes during the second installation. Do not forget to remove the
-# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed.
-run_command "⚙️  Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
-run_command "⚙️  Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
+run_command "⚙️  Pip install cuda_parallel" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
 run_command "🚀  Pytest cuda_parallel" python -m pytest -v ./tests
 
 popd >/dev/null
diff --git a/ci/update_version.sh b/ci/update_version.sh
index c43303449bb..81573f0541e 100755
--- a/ci/update_version.sh
+++ b/ci/update_version.sh
@@ -37,6 +37,7 @@ CUB_CMAKE_VERSION_FILE="lib/cmake/cub/cub-config-version.cmake"
 LIBCUDACXX_CMAKE_VERSION_FILE="lib/cmake/libcudacxx/libcudacxx-config-version.cmake"
 THRUST_CMAKE_VERSION_FILE="lib/cmake/thrust/thrust-config-version.cmake"
 CUDAX_CMAKE_VERSION_FILE="lib/cmake/cudax/cudax-config-version.cmake"
+CUDA_CCCL_VERSION_FILE="python/cuda_cccl/cuda/cccl/_version.py"
 CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py"
 CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py"
 
@@ -110,6 +111,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" "
 update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)"
 update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)"
 
+update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 
diff --git a/python/cuda_cccl/cuda/cccl/_version.py b/python/cuda_cccl/cuda/cccl/_version.py
new file mode 100644
index 00000000000..63cedf944ad
--- /dev/null
+++ b/python/cuda_cccl/cuda/cccl/_version.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# This file is generated by ci/update_version.sh
+# Do not edit this file manually.
+__version__ = "0.1.2.8.0"
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 5b2688779e0..e146656c2f7 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -11,7 +11,12 @@
 project_path = os.path.abspath(os.path.dirname(__file__))
 cccl_path = os.path.abspath(os.path.join(project_path, "..", ".."))
 cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
-ver = "0.1.2.8.0"
+__version__ = None
+with open(os.path.join(project_path, "cuda", "cccl", "_version.py")) as f:
+    exec(f.read())
+assert __version__ is not None
+ver = __version__
+del __version__
 
 
 with open("README.md") as f:

From 7dbb82b9463afd4bea3bdac323f758f33735400f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 17:35:27 -0800
Subject: [PATCH 05/66] Bug fix: cuda/_include only exists after
 shutil.copytree() ran.

---
 python/cuda_cccl/setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index e146656c2f7..b153731232b 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -25,15 +25,15 @@
 
 def copy_cccl_headers_to_cuda_include():
     inc_path = os.path.join(project_path, "cuda", "_include")
-    init_py_path = os.path.join(inc_path, "__init__.py")
-    with open(init_py_path, "w") as f:
-        print("# Intentionally empty.", file=f)
     for proj_dir, header_dir in cccl_headers:
         src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
         dst_path = os.path.join(inc_path, proj_dir)
         if os.path.exists(dst_path):
             shutil.rmtree(dst_path)
         shutil.copytree(src_path, dst_path)
+    init_py_path = os.path.join(inc_path, "__init__.py")
+    with open(init_py_path, "w") as f:
+        print("# Intentionally empty.", file=f)
 
 
 copy_cccl_headers_to_cuda_include()

From 0703901ef409336d2d84954da0aa8d87222371b9 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 17:45:39 -0800
Subject: [PATCH 06/66] Use `f"cuda-cccl @
 file://{cccl_path}/python/cuda_cccl"` in setup.py

---
 python/cuda_cooperative/setup.py | 2 +-
 python/cuda_parallel/setup.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index c5cea429a93..7f2207de5d0 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -48,7 +48,7 @@ def run(self):
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
     install_requires=[
-        "cuda-cccl",
+        f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl",
         "numba>=0.60.0",
         "pynvjitlink-cu12>=0.2.4",
         "cuda-python",
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index f3d74fa2c10..2b8908308af 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -78,7 +78,7 @@ def build_extension(self, ext):
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
     install_requires=[
-        "cuda-cccl",
+        f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl",
         "numba>=0.60.0",
         "cuda-python",
         "jinja2",

From fc0e5435f68bd30c01e2e0558df5edde308e3a3f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 17:50:34 -0800
Subject: [PATCH 07/66] Remove CustomBuildCommand, CustomWheelBuild in
 cuda_parallel/setup.py (they are equivalent to the default functions)

---
 python/cuda_parallel/setup.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 2b8908308af..1cca7f1db8a 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -6,9 +6,7 @@
 import subprocess
 
 from setuptools import Extension, setup, find_namespace_packages
-from setuptools.command.build_py import build_py
 from setuptools.command.build_ext import build_ext
-from wheel.bdist_wheel import bdist_wheel
 
 
 project_path = os.path.abspath(os.path.dirname(__file__))
@@ -26,16 +24,6 @@
     long_description = f.read()
 
 
-class CustomBuildCommand(build_py):
-    def run(self):
-        build_py.run(self)
-
-
-class CustomWheelBuild(bdist_wheel):
-    def run(self):
-        super().run()
-
-
 class CMakeExtension(Extension):
     def __init__(self, name):
         super().__init__(name, sources=[])
@@ -91,8 +79,6 @@ def build_extension(self, ext):
         ]
     },
     cmdclass={
-        "build_py": CustomBuildCommand,
-        "bdist_wheel": CustomWheelBuild,
         "build_ext": BuildCMakeExtension,
     },
     ext_modules=[CMakeExtension("cuda.parallel.experimental.cccl.c")],

From 2e6434577cab7f30da84782df54989874e024ce7 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 19 Dec 2024 19:13:20 -0800
Subject: [PATCH 08/66] Replace := operator (needs Python 3.8+)

---
 python/cuda_parallel/cuda/parallel/experimental/_bindings.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
index 771e5dd1666..9de0bd7d8d9 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
@@ -57,7 +57,8 @@ def get_paths() -> List[bytes]:
     thrust_include_path = cub_include_path
     libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx"))
     cuda_include_path = None
-    if cuda_path := _get_cuda_path():
+    cuda_path = _get_cuda_path()
+    if cuda_path:
         cuda_include_path = str(os.path.join(cuda_path, "include"))
     paths = [
         f"-I{path}".encode()

From f13a96ba41c7db4beedf0ed9bedf3de20d05813a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 20 Dec 2024 13:21:00 -0800
Subject: [PATCH 09/66] Fix oversights: remove `pip3 install ./cuda_cccl` lines
 from README.md

---
 python/cuda_cooperative/README.md | 1 -
 python/cuda_parallel/README.md    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/python/cuda_cooperative/README.md b/python/cuda_cooperative/README.md
index 6b505f797e3..02da7a205e0 100644
--- a/python/cuda_cooperative/README.md
+++ b/python/cuda_cooperative/README.md
@@ -9,7 +9,6 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 First-time installation:
 
 ```bash
-pip3 install ./cuda_cccl
 pip3 install ./cuda_cooperative[test]
 pytest -v ./cuda_cooperative/tests/
 ```
diff --git a/python/cuda_parallel/README.md b/python/cuda_parallel/README.md
index 02710b50053..d028e34eb2a 100644
--- a/python/cuda_parallel/README.md
+++ b/python/cuda_parallel/README.md
@@ -9,7 +9,6 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 First-time installation:
 
 ```bash
-pip3 install ./cuda_cccl
 pip3 install ./cuda_parallel[test]
 pytest -v ./cuda_parallel/tests/
 ```

From 9ed6036bde6a1e08d44835b88a8194825e48cc3c Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 20 Dec 2024 13:36:07 -0800
Subject: [PATCH 10/66] Restore original README.md: `pip3 install -e` now works
 on first pass.

---
 python/cuda_cooperative/README.md | 12 ++----------
 python/cuda_parallel/README.md    | 12 ++----------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/python/cuda_cooperative/README.md b/python/cuda_cooperative/README.md
index 02da7a205e0..c202d1d6c17 100644
--- a/python/cuda_cooperative/README.md
+++ b/python/cuda_cooperative/README.md
@@ -6,15 +6,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 
 ## Local development
 
-First-time installation:
-
-```bash
-pip3 install ./cuda_cooperative[test]
-pytest -v ./cuda_cooperative/tests/
-```
-
-For faster iterative development:
-
 ```bash
-pip3 install -e ./cuda_cooperative[test]
+pip3 install -e .[test]
+pytest -v ./tests/
 ```
diff --git a/python/cuda_parallel/README.md b/python/cuda_parallel/README.md
index d028e34eb2a..98a3a3c92d0 100644
--- a/python/cuda_parallel/README.md
+++ b/python/cuda_parallel/README.md
@@ -6,15 +6,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 
 ## Local development
 
-First-time installation:
-
-```bash
-pip3 install ./cuda_parallel[test]
-pytest -v ./cuda_parallel/tests/
-```
-
-For faster iterative development:
-
 ```bash
-pip3 install -e ./cuda_parallel[test]
+pip3 install -e .[test]
+pytest -v ./tests/
 ```

From c9a4d9676bb245714321d9c679cad773e67fe88a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 20 Dec 2024 14:20:21 -0800
Subject: [PATCH 11/66] cuda_cccl/README.md: FOR INTERNAL USE ONLY

---
 python/cuda_cccl/README.md | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/python/cuda_cccl/README.md b/python/cuda_cccl/README.md
index 89ea15b2899..37f020b6df6 100644
--- a/python/cuda_cccl/README.md
+++ b/python/cuda_cccl/README.md
@@ -1,11 +1,3 @@
-# `cuda.cccl`: Experimental CUDA Core Compute Library Python module with CCCL headers
+## Note
 
-## Documentation
-
-Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
-
-## Local development
-
-```bash
-pip3 install .
-```
+This package is currently FOR INTERNAL USE ONLY and not meant to be used/installed explicitly.

From df943c05bc938a5901090255ed9b02104c9f7c86 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 20 Dec 2024 14:29:07 -0800
Subject: [PATCH 12/66] Remove `$pymajor.$pyminor.` prefix in cuda_cccl
 _version.py (as suggested under
 https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894035917)

Command used: ci/update_version.sh 2 8 0
---
 ci/update_version.sh                   | 2 +-
 python/cuda_cccl/cuda/cccl/_version.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/update_version.sh b/ci/update_version.sh
index 81573f0541e..6a25a837d50 100755
--- a/ci/update_version.sh
+++ b/ci/update_version.sh
@@ -111,7 +111,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" "
 update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)"
 update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)"
 
-update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
+update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$major.$minor.$patch\""
 update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 
diff --git a/python/cuda_cccl/cuda/cccl/_version.py b/python/cuda_cccl/cuda/cccl/_version.py
index 63cedf944ad..f9961cad366 100644
--- a/python/cuda_cccl/cuda/cccl/_version.py
+++ b/python/cuda_cccl/cuda/cccl/_version.py
@@ -4,4 +4,4 @@
 
 # This file is generated by ci/update_version.sh
 # Do not edit this file manually.
-__version__ = "0.1.2.8.0"
+__version__ = "2.8.0"

From 40c83898e29cd76c1d240e998d82c71a43dbc0dc Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 20 Dec 2024 16:00:46 -0800
Subject: [PATCH 13/66] Modernize pyproject.toml, setup.py

Trigger for this change:

* https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894043178

* https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894044996
---
 python/cuda_cccl/.gitignore     |  1 +
 python/cuda_cccl/pyproject.toml | 37 +++++++++++++++++++++++++++-
 python/cuda_cccl/setup.py       | 43 ++++++++++-----------------------
 3 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/python/cuda_cccl/.gitignore b/python/cuda_cccl/.gitignore
index 3beca7c8684..cab32df5b58 100644
--- a/python/cuda_cccl/.gitignore
+++ b/python/cuda_cccl/.gitignore
@@ -1,2 +1,3 @@
+LICENSE
 cuda/_include
 *egg-info
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 4ab52c80318..a60224f0975 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -3,5 +3,40 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = ["packaging", "setuptools>=61.0.0", "wheel"]
+requires = [
+    "setuptools>=61.0.0",
+    "wheel",
+    "packaging",
+]
 build-backend = "setuptools.build_meta"
+
+[project]
+name = "cuda-cccl"
+description = "Experimental Package with CCCL headers to support JIT compilation"
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3 :: Only",
+    "Environment :: GPU :: NVIDIA CUDA",
+    "License :: OSI Approved :: Apache Software License",
+]
+requires-python = ">=3.9"
+dynamic = ["version", "readme"]
+
+[project.urls]
+Homepage = "https://github.com/NVIDIA/cccl"
+Documentation = "https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl"
+Source = "https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl"
+Tracker = "https://github.com/NVIDIA/cccl/issues"
+
+[tool.setuptools.dynamic]
+version = { attr = "cuda.cccl._version.__version__" }
+readme = { file = ["README.md"], content-type = "text/markdown" }
+
+[tool.setuptools.package-data]
+"cuda" = ["_include/**/*"]
+
+[tool.setuptools.exclude-package-data]
+"cuda" = ["_include/__init__.py"]
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index b153731232b..02a83acf2e5 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -2,56 +2,39 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from setuptools import setup, find_namespace_packages
 import os
 import shutil
 
-from setuptools import setup, find_namespace_packages
-
-
-project_path = os.path.abspath(os.path.dirname(__file__))
-cccl_path = os.path.abspath(os.path.join(project_path, "..", ".."))
-cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
-__version__ = None
-with open(os.path.join(project_path, "cuda", "cccl", "_version.py")) as f:
-    exec(f.read())
-assert __version__ is not None
-ver = __version__
-del __version__
+PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
+CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
 
 
-with open("README.md") as f:
-    long_description = f.read()
+def copy_license():
+    src = os.path.abspath(os.path.join(CCCL_PATH, "LICENSE"))
+    dst = os.path.join(PROJECT_PATH, "LICENSE")
+    shutil.copy(src, dst)
 
 
 def copy_cccl_headers_to_cuda_include():
-    inc_path = os.path.join(project_path, "cuda", "_include")
+    cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
+    inc_path = os.path.join(PROJECT_PATH, "cuda", "_include")
+    os.makedirs(inc_path, exist_ok=True)
     for proj_dir, header_dir in cccl_headers:
-        src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir))
+        src_path = os.path.abspath(os.path.join(CCCL_PATH, proj_dir, header_dir))
         dst_path = os.path.join(inc_path, proj_dir)
         if os.path.exists(dst_path):
             shutil.rmtree(dst_path)
         shutil.copytree(src_path, dst_path)
     init_py_path = os.path.join(inc_path, "__init__.py")
     with open(init_py_path, "w") as f:
-        print("# Intentionally empty.", file=f)
+        f.write("# Intentionally empty.\n")
 
 
+copy_license()
 copy_cccl_headers_to_cuda_include()
 
 setup(
-    name="cuda-cccl",
-    version=ver,
-    description="Experimental Package with CCCL headers to support JIT compilation",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author="NVIDIA Corporation",
-    classifiers=[
-        "Programming Language :: Python :: 3 :: Only",
-        "Environment :: GPU :: NVIDIA CUDA",
-    ],
     packages=find_namespace_packages(include=["cuda.*"]),
-    python_requires=">=3.9",
     include_package_data=True,
-    license="Apache-2.0 with LLVM exception",
-    license_files=("../../LICENSE",),
 )

From e3c78671258f9c41f2b0614e2566d20904ef94bb Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 20 Dec 2024 17:01:01 -0800
Subject: [PATCH 14/66] Install CCCL headers under cuda.cccl.include

Trigger for this change:

* https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894048562

Accidental discovery: the cuda.cooperative unit tests pass even without any CCCL headers.
---
 python/cuda_cccl/.gitignore                            |  2 +-
 python/cuda_cccl/MANIFEST.in                           |  2 +-
 python/cuda_cccl/pyproject.toml                        |  4 ++--
 python/cuda_cccl/setup.py                              |  6 +++---
 .../cuda/cooperative/experimental/_nvrtc.py            | 10 +---------
 python/cuda_cooperative/setup.py                       |  1 -
 .../cuda/parallel/experimental/_bindings.py            |  2 +-
 7 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/python/cuda_cccl/.gitignore b/python/cuda_cccl/.gitignore
index cab32df5b58..8d624328cd6 100644
--- a/python/cuda_cccl/.gitignore
+++ b/python/cuda_cccl/.gitignore
@@ -1,3 +1,3 @@
 LICENSE
-cuda/_include
+cuda/cccl/include
 *egg-info
diff --git a/python/cuda_cccl/MANIFEST.in b/python/cuda_cccl/MANIFEST.in
index 848cbfe2e81..55d6b5f63ba 100644
--- a/python/cuda_cccl/MANIFEST.in
+++ b/python/cuda_cccl/MANIFEST.in
@@ -1 +1 @@
-recursive-include cuda/_include *
+recursive-include cuda/cccl/include *
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index a60224f0975..2e7afa9499e 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -36,7 +36,7 @@ version = { attr = "cuda.cccl._version.__version__" }
 readme = { file = ["README.md"], content-type = "text/markdown" }
 
 [tool.setuptools.package-data]
-"cuda" = ["_include/**/*"]
+"cuda" = ["cccl/include/**/*"]
 
 [tool.setuptools.exclude-package-data]
-"cuda" = ["_include/__init__.py"]
+"cuda" = ["cccl/include/__init__.py"]
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 02a83acf2e5..0fdf73e7f1f 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -16,9 +16,9 @@ def copy_license():
     shutil.copy(src, dst)
 
 
-def copy_cccl_headers_to_cuda_include():
+def copy_cccl_headers_to_cuda_cccl_include():
     cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
-    inc_path = os.path.join(PROJECT_PATH, "cuda", "_include")
+    inc_path = os.path.join(PROJECT_PATH, "cuda", "cccl", "include")
     os.makedirs(inc_path, exist_ok=True)
     for proj_dir, header_dir in cccl_headers:
         src_path = os.path.abspath(os.path.join(CCCL_PATH, proj_dir, header_dir))
@@ -32,7 +32,7 @@ def copy_cccl_headers_to_cuda_include():
 
 
 copy_license()
-copy_cccl_headers_to_cuda_include()
+copy_cccl_headers_to_cuda_cccl_include()
 
 setup(
     packages=find_namespace_packages(include=["cuda.*"]),
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index 46038ebd632..f3aed13b399 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -7,7 +7,6 @@
 from cuda.bindings import nvrtc
 from cuda.cooperative.experimental._caching import disk_cache
 from cuda.cooperative.experimental._common import check_in, version
-import importlib.resources as pkg_resources
 import functools
 
 
@@ -46,17 +45,10 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
     check_in("rdc", rdc, [True, False])
     check_in("code", code, ["lto", "ptx"])
 
-    with pkg_resources.path("cuda", "_include") as include_path:
-        cub_path = include_path
-        thrust_path = include_path
-        libcudacxx_path = os.path.join(include_path, "libcudacxx")
-        cuda_include_path = os.path.join(get_cuda_path(), "include")
+    cuda_include_path = os.path.join(get_cuda_path(), "include")
 
     opts = [
         b"--std=c++17",
-        bytes(f"--include-path={cub_path}", encoding="ascii"),
-        bytes(f"--include-path={thrust_path}", encoding="ascii"),
-        bytes(f"--include-path={libcudacxx_path}", encoding="ascii"),
         bytes(f"--include-path={cuda_include_path}", encoding="ascii"),
         bytes(f"--gpu-architecture=compute_{cc}", encoding="ascii"),
     ]
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index 7f2207de5d0..6cac6e03b30 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -48,7 +48,6 @@ def run(self):
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
     install_requires=[
-        f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl",
         "numba>=0.60.0",
         "pynvjitlink-cu12>=0.2.4",
         "cuda-python",
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
index d3afdc6dea4..3e9bf308805 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
@@ -59,7 +59,7 @@ def get_paths() -> List[bytes]:
     # can move this to a module-level import.
     from importlib.resources import as_file, files
 
-    with as_file(files("cuda._include")) as f:
+    with as_file(files("cuda.cccl.include")) as f:
         cub_include_path = str(f)
     thrust_include_path = cub_include_path
     libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx"))

From 06f575fc16c422e3ffdf8f2c2a13bb7fe09d4465 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 08:52:21 -0800
Subject: [PATCH 15/66] Factor out cuda_cccl/cuda/cccl/include_paths.py

---
 python/cuda_cccl/cuda/cccl/include_paths.py   | 56 +++++++++++++++++++
 .../cuda/parallel/experimental/_bindings.py   | 41 ++------------
 2 files changed, 60 insertions(+), 37 deletions(-)
 create mode 100644 python/cuda_cccl/cuda/cccl/include_paths.py

diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py
new file mode 100644
index 00000000000..58ae09ab0a3
--- /dev/null
+++ b/python/cuda_cccl/cuda/cccl/include_paths.py
@@ -0,0 +1,56 @@
+from dataclasses import dataclass
+from functools import lru_cache
+import os
+import shutil
+from typing import Optional
+
+
+def _get_cuda_path() -> Optional[str]:
+    cuda_path = os.environ.get("CUDA_PATH")
+    if cuda_path and os.path.exists(cuda_path):
+        return cuda_path
+
+    nvcc_path = shutil.which("nvcc")
+    if nvcc_path is not None:
+        return os.path.dirname(os.path.dirname(nvcc_path))
+
+    default_path = "/usr/local/cuda"
+    if os.path.exists(default_path):
+        return default_path
+
+    return None
+
+
+@dataclass
+class IncludePaths:
+    cuda: Optional[str]
+    libcudacxx: Optional[str]
+    cub: Optional[str]
+    thrust: Optional[str]
+
+    def as_tuple(self):
+        # Note: higher-level ... lower-level order:
+        return (self.thrust, self.cub, self.libcudacxx, self.cuda)
+
+
+@lru_cache()
+def get_include_paths() -> IncludePaths:
+    # TODO: once docs env supports Python >= 3.9, we
+    # can move this to a module-level import.
+    from importlib.resources import as_file, files
+
+    cuda_incl = None
+    cuda_path = _get_cuda_path()
+    if cuda_path is not None:
+        cuda_incl = os.path.join(cuda_path, "include")
+
+    with as_file(files("cuda.cccl.include")) as f:
+        cccl_incl = str(f)
+    assert os.path.exists(cccl_incl)
+
+    return IncludePaths(
+        cuda=cuda_incl,
+        libcudacxx=os.path.join(cccl_incl, "libcudacxx"),
+        cub=cccl_incl,
+        thrust=cccl_incl,
+    )
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
index 3e9bf308805..0585fca9e71 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
@@ -3,29 +3,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import os
-import shutil
 import ctypes
 from functools import lru_cache
-from typing import List, Optional
+from typing import List
 
-from . import _cccl as cccl
-
-
-def _get_cuda_path() -> Optional[str]:
-    cuda_path = os.environ.get("CUDA_PATH", "")
-    if os.path.exists(cuda_path):
-        return cuda_path
-
-    nvcc_path = shutil.which("nvcc")
-    if nvcc_path is not None:
-        return os.path.dirname(os.path.dirname(nvcc_path))
-
-    default_path = "/usr/local/cuda"
-    if os.path.exists(default_path):
-        return default_path
+from cuda.cccl.include_paths import get_include_paths  # type: ignore[import-not-found]
 
-    return None
+from . import _cccl as cccl
 
 
 @lru_cache()
@@ -55,26 +39,9 @@ def get_bindings() -> ctypes.CDLL:
 
 @lru_cache()
 def get_paths() -> List[bytes]:
-    # TODO: once docs env supports Python >= 3.9, we
-    # can move this to a module-level import.
-    from importlib.resources import as_file, files
-
-    with as_file(files("cuda.cccl.include")) as f:
-        cub_include_path = str(f)
-    thrust_include_path = cub_include_path
-    libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx"))
-    cuda_include_path = None
-    cuda_path = _get_cuda_path()
-    if cuda_path is not None:
-        cuda_include_path = str(os.path.join(cuda_path, "include"))
     paths = [
         f"-I{path}".encode()
-        for path in (
-            cub_include_path,
-            thrust_include_path,
-            libcudacxx_include_path,
-            cuda_include_path,
-        )
+        for path in get_include_paths().as_tuple()
         if path is not None
     ]
     return paths

From e7477685ba0f33d083edbd4252f3752f8505000d Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 09:11:18 -0800
Subject: [PATCH 16/66] Reuse cuda_cccl/cuda/cccl/include_paths.py from
 cuda_cooperative

---
 .../cuda/cooperative/experimental/_nvrtc.py   | 31 ++++---------------
 python/cuda_cooperative/setup.py              |  1 +
 2 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index f3aed13b399..d6205d83254 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -2,9 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import os
-import shutil
 from cuda.bindings import nvrtc
+from cuda.cccl.include_paths import get_include_paths  # type: ignore[import-not-found]
 from cuda.cooperative.experimental._caching import disk_cache
 from cuda.cooperative.experimental._common import check_in, version
 import functools
@@ -18,22 +17,6 @@ def CHECK_NVRTC(err, prog):
         raise RuntimeError(f"NVRTC error: {log.decode('ascii')}")
 
 
-def get_cuda_path():
-    cuda_path = os.environ.get("CUDA_PATH", "")
-    if os.path.exists(cuda_path):
-        return cuda_path
-
-    nvcc_path = shutil.which("nvcc")
-    if nvcc_path is not None:
-        return os.path.dirname(os.path.dirname(nvcc_path))
-
-    default_path = "/usr/local/cuda"
-    if os.path.exists(default_path):
-        return default_path
-
-    return None
-
-
 # cpp is the C++ source code
 # cc = 800 for Ampere, 900 Hopper, etc
 # rdc is true or false
@@ -45,13 +28,11 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
     check_in("rdc", rdc, [True, False])
     check_in("code", code, ["lto", "ptx"])
 
-    cuda_include_path = os.path.join(get_cuda_path(), "include")
-
-    opts = [
-        b"--std=c++17",
-        bytes(f"--include-path={cuda_include_path}", encoding="ascii"),
-        bytes(f"--gpu-architecture=compute_{cc}", encoding="ascii"),
-    ]
+    opts = [b"--std=c++17"]
+    for path in get_include_paths().as_tuple():
+        if path:
+            opts += [f"--include-path={path}".encode("ascii")]
+    opts += [f"--gpu-architecture=compute_{cc}".encode("ascii")]
     if rdc:
         opts += [b"--relocatable-device-code=true"]
 
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index 6cac6e03b30..7f2207de5d0 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -48,6 +48,7 @@ def run(self):
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
     install_requires=[
+        f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl",
         "numba>=0.60.0",
         "pynvjitlink-cu12>=0.2.4",
         "cuda-python",

From 62ce2d3d1761d894e27fba4f6c28af0b46877c6f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 11:27:18 -0800
Subject: [PATCH 17/66] Add missing Copyright notice.

---
 python/cuda_cccl/cuda/cccl/include_paths.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py
index 58ae09ab0a3..3f1da5ef993 100644
--- a/python/cuda_cccl/cuda/cccl/include_paths.py
+++ b/python/cuda_cccl/cuda/cccl/include_paths.py
@@ -1,3 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 from dataclasses import dataclass
 from functools import lru_cache
 import os

From 65c5a150d6c1e6151876bd57ec816c361acd0b91 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 11:37:44 -0800
Subject: [PATCH 18/66] Add missing __init__.py (cuda.cccl)

---
 python/cuda_cccl/cuda/cccl/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 python/cuda_cccl/cuda/cccl/__init__.py

diff --git a/python/cuda_cccl/cuda/cccl/__init__.py b/python/cuda_cccl/cuda/cccl/__init__.py
new file mode 100644
index 00000000000..977ba51caec
--- /dev/null
+++ b/python/cuda_cccl/cuda/cccl/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from cuda.cccl._version import __version__
+
+__all__ = ["__version__"]

From bffece6aad9efca5af106241cdf5bd56fbfb6eff Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 13:20:02 -0800
Subject: [PATCH 19/66] Add `"cuda.cccl"` to `autodoc.mock_imports`

---
 docs/repo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/repo.toml b/docs/repo.toml
index 5d6f72695ac..407211f6a37 100644
--- a/docs/repo.toml
+++ b/docs/repo.toml
@@ -347,6 +347,7 @@ autodoc.mock_imports = [
     "numba",
     "pynvjitlink",
     "cuda.bindings",
+    "cuda.cccl",
     "llvmlite",
     "numpy",
 ]

From 585447cb193609e633951cd3929933c53e56c033 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 16:15:52 -0800
Subject: [PATCH 20/66] Move cuda.cccl.include_paths import into function
 where it is used. (Attempt to resolve Build and Verify Docs failure.)

---
 .../cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index d6205d83254..5a2002806be 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from cuda.bindings import nvrtc
-from cuda.cccl.include_paths import get_include_paths  # type: ignore[import-not-found]
 from cuda.cooperative.experimental._caching import disk_cache
 from cuda.cooperative.experimental._common import check_in, version
 import functools
@@ -29,6 +28,9 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
     check_in("code", code, ["lto", "ptx"])
 
     opts = [b"--std=c++17"]
+
+    from cuda.cccl.include_paths import get_include_paths
+
     for path in get_include_paths().as_tuple():
         if path:
             opts += [f"--include-path={path}".encode("ascii")]

From 55c431164713400680dbcb9a94ce73e14df7dc13 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 21 Dec 2024 18:26:11 -0800
Subject: [PATCH 21/66] Add # TODO: move this to a module-level import

---
 python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index 5a2002806be..7cae5c30d39 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -29,6 +29,7 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
 
     opts = [b"--std=c++17"]
 
+    # TODO: move this to a module-level import (after docs env modernization).
     from cuda.cccl.include_paths import get_include_paths
 
     for path in get_include_paths().as_tuple():

From 1f3a0291dbe13c984961403218a0e22243b773de Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 25 Dec 2024 20:43:17 -0800
Subject: [PATCH 22/66] Modernize cuda_cooperative/pyproject.toml, setup.py

---
 python/cuda_cooperative/pyproject.toml | 41 ++++++++++++++-
 python/cuda_cooperative/setup.py       | 69 ++++----------------------
 2 files changed, 49 insertions(+), 61 deletions(-)

diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 4ab52c80318..1bdc12b2e7d 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -3,5 +3,44 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = ["packaging", "setuptools>=61.0.0", "wheel"]
+requires = [
+    "setuptools>=61.0.0",
+    "wheel",
+    "packaging",
+    "numpy",
+]
 build-backend = "setuptools.build_meta"
+
+[project]
+name = "cuda-cooperative"
+description = "Experimental Core Library for CUDA Python"
+requires-python = ">=3.9"
+dynamic = ["version", "readme"]
+authors = [
+    { name = "NVIDIA Corporation" }
+]
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3 :: Only",
+    "Environment :: GPU :: NVIDIA CUDA",
+]
+dependencies = [
+    # "cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl",
+    "numba>=0.60.0",
+    "pynvjitlink-cu12>=0.2.4",
+    "cuda-python",
+    "jinja2",
+]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "pytest-xdist",
+]
+
+[project.urls]
+Homepage = "https://developer.nvidia.com/"
+
+[tool.setuptools.dynamic]
+version = { attr = "cuda.cooperative._version.__version__" }
+readme = { file = ["README.md"], content-type = "text/markdown" }
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index 7f2207de5d0..d084d296287 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -3,68 +3,17 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import os
+from setuptools import setup
+import shutil
 
-from setuptools import setup, find_namespace_packages
-from setuptools.command.build_py import build_py
-from wheel.bdist_wheel import bdist_wheel
+PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
+CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
 
 
-project_path = os.path.abspath(os.path.dirname(__file__))
-cccl_path = os.path.abspath(os.path.join(project_path, "..", ".."))
-cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
-__version__ = None
-with open(os.path.join(project_path, "cuda", "cooperative", "_version.py")) as f:
-    exec(f.read())
-assert __version__ is not None
-ver = __version__
-del __version__
+def copy_license():
+    src = os.path.abspath(os.path.join(CCCL_PATH, "LICENSE"))
+    dst = os.path.join(PROJECT_PATH, "LICENSE")
+    shutil.copy(src, dst)
 
 
-with open("README.md") as f:
-    long_description = f.read()
-
-
-class CustomBuildCommand(build_py):
-    def run(self):
-        build_py.run(self)
-
-
-class CustomWheelBuild(bdist_wheel):
-    def run(self):
-        super().run()
-
-
-setup(
-    name="cuda-cooperative",
-    version=ver,
-    description="Experimental Core Library for CUDA Python",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author="NVIDIA Corporation",
-    classifiers=[
-        "Programming Language :: Python :: 3 :: Only",
-        "Environment :: GPU :: NVIDIA CUDA",
-    ],
-    packages=find_namespace_packages(include=["cuda.*"]),
-    python_requires=">=3.9",
-    install_requires=[
-        f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl",
-        "numba>=0.60.0",
-        "pynvjitlink-cu12>=0.2.4",
-        "cuda-python",
-        "jinja2",
-    ],
-    extras_require={
-        "test": [
-            "pytest",
-            "pytest-xdist",
-        ]
-    },
-    cmdclass={
-        "build_py": CustomBuildCommand,
-        "bdist_wheel": CustomWheelBuild,
-    },
-    include_package_data=True,
-    license="Apache-2.0 with LLVM exception",
-    license_files=("../../LICENSE",),
-)
+setup()

From 61637d608da06fcf6851ef6197f88b5e7dbc3bbe Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 25 Dec 2024 21:30:27 -0800
Subject: [PATCH 23/66] Convert cuda_cooperative to use hatchling as build
 backend.

---
 python/cuda_cooperative/pyproject.toml | 27 ++++++++++++++------------
 python/cuda_cooperative/setup.py       | 19 ------------------
 2 files changed, 15 insertions(+), 31 deletions(-)
 delete mode 100644 python/cuda_cooperative/setup.py

diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 1bdc12b2e7d..a03ee1ba783 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -3,13 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = [
-    "setuptools>=61.0.0",
-    "wheel",
-    "packaging",
-    "numpy",
-]
-build-backend = "setuptools.build_meta"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
 
 [project]
 name = "cuda-cooperative"
@@ -19,13 +14,14 @@ dynamic = ["version", "readme"]
 authors = [
     { name = "NVIDIA Corporation" }
 ]
-license = { file = "LICENSE" }
+license = { file = "../../LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
-    # "cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl",
+    "cuda-cccl @ {root:uri}/../cuda_cccl",
+    "numpy",
     "numba>=0.60.0",
     "pynvjitlink-cu12>=0.2.4",
     "cuda-python",
@@ -41,6 +37,13 @@ test = [
 [project.urls]
 Homepage = "https://developer.nvidia.com/"
 
-[tool.setuptools.dynamic]
-version = { attr = "cuda.cooperative._version.__version__" }
-readme = { file = ["README.md"], content-type = "text/markdown" }
+[tool.hatch.build.targets.wheel]
+packages = ["cuda"]
+
+[tool.hatch.version]
+path = "cuda/cooperative/_version.py"
+attr = "__version__"
+
+[tool.hatch.metadata]
+readme = "README.md"
+allow-direct-references = true
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
deleted file mode 100644
index d084d296287..00000000000
--- a/python/cuda_cooperative/setup.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-import os
-from setuptools import setup
-import shutil
-
-PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
-CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
-
-
-def copy_license():
-    src = os.path.abspath(os.path.join(CCCL_PATH, "LICENSE"))
-    dst = os.path.join(PROJECT_PATH, "LICENSE")
-    shutil.copy(src, dst)
-
-
-setup()

From 4a0cca1490e7809ed172a82db920f5d54bad5a09 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 26 Dec 2024 20:52:28 -0800
Subject: [PATCH 24/66] Revert "Convert cuda_cooperative to use hatchling as
 build backend."

This reverts commit 61637d608da06fcf6851ef6197f88b5e7dbc3bbe.
---
 python/cuda_cooperative/pyproject.toml | 27 ++++++++++++--------------
 python/cuda_cooperative/setup.py       | 19 ++++++++++++++++++
 2 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 python/cuda_cooperative/setup.py

diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index a03ee1ba783..1bdc12b2e7d 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -3,8 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
+requires = [
+    "setuptools>=61.0.0",
+    "wheel",
+    "packaging",
+    "numpy",
+]
+build-backend = "setuptools.build_meta"
 
 [project]
 name = "cuda-cooperative"
@@ -14,14 +19,13 @@ dynamic = ["version", "readme"]
 authors = [
     { name = "NVIDIA Corporation" }
 ]
-license = { file = "../../LICENSE" }
+license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
-    "cuda-cccl @ {root:uri}/../cuda_cccl",
-    "numpy",
+    # "cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl",
     "numba>=0.60.0",
     "pynvjitlink-cu12>=0.2.4",
     "cuda-python",
@@ -37,13 +41,6 @@ test = [
 [project.urls]
 Homepage = "https://developer.nvidia.com/"
 
-[tool.hatch.build.targets.wheel]
-packages = ["cuda"]
-
-[tool.hatch.version]
-path = "cuda/cooperative/_version.py"
-attr = "__version__"
-
-[tool.hatch.metadata]
-readme = "README.md"
-allow-direct-references = true
+[tool.setuptools.dynamic]
+version = { attr = "cuda.cooperative._version.__version__" }
+readme = { file = ["README.md"], content-type = "text/markdown" }
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
new file mode 100644
index 00000000000..d084d296287
--- /dev/null
+++ b/python/cuda_cooperative/setup.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+from setuptools import setup
+import shutil
+
+PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
+CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
+
+
+def copy_license():
+    src = os.path.abspath(os.path.join(CCCL_PATH, "LICENSE"))
+    dst = os.path.join(PROJECT_PATH, "LICENSE")
+    shutil.copy(src, dst)
+
+
+setup()

From 7dd3d1691254ee5a05a22509f5f8244f49d02c72 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 26 Dec 2024 20:58:55 -0800
Subject: [PATCH 25/66] Move numpy from [build-system] requires -> [project]
 dependencies

---
 python/cuda_cooperative/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 1bdc12b2e7d..828cb93ac3e 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -7,7 +7,6 @@ requires = [
     "setuptools>=61.0.0",
     "wheel",
     "packaging",
-    "numpy",
 ]
 build-backend = "setuptools.build_meta"
 
@@ -26,6 +25,7 @@ classifiers = [
 ]
 dependencies = [
     # "cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl",
+    "numpy",
     "numba>=0.60.0",
     "pynvjitlink-cu12>=0.2.4",
     "cuda-python",

From efab5beeabfc2cd4eabe444cab2e06f6f76f51a1 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 26 Dec 2024 21:14:33 -0800
Subject: [PATCH 26/66] Move pyproject.toml [project] dependencies -> setup.py
 install_requires, to be able to use CCCL_PATH

---
 python/cuda_cooperative/pyproject.toml | 10 +---------
 python/cuda_cooperative/setup.py       | 11 ++++++++++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 828cb93ac3e..f62b974ff62 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -14,7 +14,7 @@ build-backend = "setuptools.build_meta"
 name = "cuda-cooperative"
 description = "Experimental Core Library for CUDA Python"
 requires-python = ">=3.9"
-dynamic = ["version", "readme"]
+dynamic = ["version", "readme", "dependencies"]
 authors = [
     { name = "NVIDIA Corporation" }
 ]
@@ -23,14 +23,6 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
-dependencies = [
-    # "cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl",
-    "numpy",
-    "numba>=0.60.0",
-    "pynvjitlink-cu12>=0.2.4",
-    "cuda-python",
-    "jinja2",
-]
 
 [project.optional-dependencies]
 test = [
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index d084d296287..bd1ece8ff1f 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -16,4 +16,13 @@ def copy_license():
     shutil.copy(src, dst)
 
 
-setup()
+setup(
+    install_requires=[
+        f"cuda-cccl @ file://{CCCL_PATH}/python/cuda_cccl",
+        "numpy",
+        "numba>=0.60.0",
+        "pynvjitlink-cu12>=0.2.4",
+        "cuda-python",
+        "jinja2",
+    ],
+)

From 9fde3d154a62ee7707a289d3a5432dc2b2daa3a9 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 26 Dec 2024 21:52:27 -0800
Subject: [PATCH 27/66] Remove copy_license() and use
 license_files=["../../LICENSE"] instead.

---
 python/cuda_cccl/.gitignore            | 1 -
 python/cuda_cccl/pyproject.toml        | 1 -
 python/cuda_cccl/setup.py              | 8 +-------
 python/cuda_cooperative/pyproject.toml | 1 -
 python/cuda_cooperative/setup.py       | 8 +-------
 5 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/python/cuda_cccl/.gitignore b/python/cuda_cccl/.gitignore
index 8d624328cd6..24ec757199f 100644
--- a/python/cuda_cccl/.gitignore
+++ b/python/cuda_cccl/.gitignore
@@ -1,3 +1,2 @@
-LICENSE
 cuda/cccl/include
 *egg-info
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 2e7afa9499e..9a42d93da15 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -16,7 +16,6 @@ description = "Experimental Package with CCCL headers to support JIT compilation
 authors = [
     { name = "NVIDIA Corporation" },
 ]
-license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Environment :: GPU :: NVIDIA CUDA",
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 0fdf73e7f1f..98fc837959d 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -10,12 +10,6 @@
 CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
 
 
-def copy_license():
-    src = os.path.abspath(os.path.join(CCCL_PATH, "LICENSE"))
-    dst = os.path.join(PROJECT_PATH, "LICENSE")
-    shutil.copy(src, dst)
-
-
 def copy_cccl_headers_to_cuda_cccl_include():
     cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
     inc_path = os.path.join(PROJECT_PATH, "cuda", "cccl", "include")
@@ -31,10 +25,10 @@ def copy_cccl_headers_to_cuda_cccl_include():
         f.write("# Intentionally empty.\n")
 
 
-copy_license()
 copy_cccl_headers_to_cuda_cccl_include()
 
 setup(
+    license_files=["../../LICENSE"],
     packages=find_namespace_packages(include=["cuda.*"]),
     include_package_data=True,
 )
diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index f62b974ff62..43aab4e4103 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -18,7 +18,6 @@ dynamic = ["version", "readme", "dependencies"]
 authors = [
     { name = "NVIDIA Corporation" }
 ]
-license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Environment :: GPU :: NVIDIA CUDA",
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index bd1ece8ff1f..ff462f1054c 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -4,19 +4,13 @@
 
 import os
 from setuptools import setup
-import shutil
 
 PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
 CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
 
 
-def copy_license():
-    src = os.path.abspath(os.path.join(CCCL_PATH, "LICENSE"))
-    dst = os.path.join(PROJECT_PATH, "LICENSE")
-    shutil.copy(src, dst)
-
-
 setup(
+    license_files=["../../LICENSE"],
     install_requires=[
         f"cuda-cccl @ file://{CCCL_PATH}/python/cuda_cccl",
         "numpy",

From bda5d51f5acfecdf1b16c3d239165ef05016bb26 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 26 Dec 2024 22:28:09 -0800
Subject: [PATCH 28/66] Further modernize cuda_cccl/setup.py to use pathlib

---
 python/cuda_cccl/setup.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 98fc837959d..f6d21a09ea4 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -2,27 +2,33 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from pathlib import Path
 from setuptools import setup, find_namespace_packages
-import os
 import shutil
 
-PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
-CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
+PROJECT_PATH = Path(__file__).resolve().parent
+CCCL_PATH = PROJECT_PATH.parents[1]
 
 
 def copy_cccl_headers_to_cuda_cccl_include():
-    cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
-    inc_path = os.path.join(PROJECT_PATH, "cuda", "cccl", "include")
-    os.makedirs(inc_path, exist_ok=True)
+    cccl_headers = [
+        ("cub", "cub"),
+        ("libcudacxx", "include"),
+        ("thrust", "thrust"),
+    ]
+
+    inc_path = PROJECT_PATH / "cuda" / "cccl" / "include"
+    inc_path.mkdir(parents=True, exist_ok=True)
+
     for proj_dir, header_dir in cccl_headers:
-        src_path = os.path.abspath(os.path.join(CCCL_PATH, proj_dir, header_dir))
-        dst_path = os.path.join(inc_path, proj_dir)
-        if os.path.exists(dst_path):
+        src_path = CCCL_PATH / proj_dir / header_dir
+        dst_path = inc_path / proj_dir
+        if dst_path.exists():
             shutil.rmtree(dst_path)
         shutil.copytree(src_path, dst_path)
-    init_py_path = os.path.join(inc_path, "__init__.py")
-    with open(init_py_path, "w") as f:
-        f.write("# Intentionally empty.\n")
+
+    init_py_path = inc_path / "__init__.py"
+    init_py_path.write_text("# Intentionally empty.\n")
 
 
 copy_cccl_headers_to_cuda_cccl_include()

From 4e9720d85b67623189767f8ad17c82a01170b113 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 26 Dec 2024 22:32:09 -0800
Subject: [PATCH 29/66] Trivial simplifications in cuda_cccl/pyproject.toml

---
 python/cuda_cccl/pyproject.toml | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 9a42d93da15..2e4600334a3 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -3,11 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = [
-    "setuptools>=61.0.0",
-    "wheel",
-    "packaging",
-]
+requires = ["setuptools>=61.0.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -26,9 +22,6 @@ dynamic = ["version", "readme"]
 
 [project.urls]
 Homepage = "https://github.com/NVIDIA/cccl"
-Documentation = "https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl"
-Source = "https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl"
-Tracker = "https://github.com/NVIDIA/cccl/issues"
 
 [tool.setuptools.dynamic]
 version = { attr = "cuda.cccl._version.__version__" }

From c1aea17e4e08a61da72cf2b226051c50db554e42 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 20:09:40 -0800
Subject: [PATCH 30/66] Further simplify cuda_cccl/pyproject.toml, setup.py:
 remove inconsequential code

---
 python/cuda_cccl/pyproject.toml | 6 ------
 python/cuda_cccl/setup.py       | 4 +---
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 2e4600334a3..124abb73679 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -26,9 +26,3 @@ Homepage = "https://github.com/NVIDIA/cccl"
 [tool.setuptools.dynamic]
 version = { attr = "cuda.cccl._version.__version__" }
 readme = { file = ["README.md"], content-type = "text/markdown" }
-
-[tool.setuptools.package-data]
-"cuda" = ["cccl/include/**/*"]
-
-[tool.setuptools.exclude-package-data]
-"cuda" = ["cccl/include/__init__.py"]
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index f6d21a09ea4..4f84bee20bb 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from pathlib import Path
-from setuptools import setup, find_namespace_packages
+from setuptools import setup
 import shutil
 
 PROJECT_PATH = Path(__file__).resolve().parent
@@ -35,6 +35,4 @@ def copy_cccl_headers_to_cuda_cccl_include():
 
 setup(
     license_files=["../../LICENSE"],
-    packages=find_namespace_packages(include=["cuda.*"]),
-    include_package_data=True,
 )

From d18d699cf7cfec0041669737f8564ada8135a41e Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 20:18:01 -0800
Subject: [PATCH 31/66] Make cuda_cooperative/pyproject.toml more similar to
 cuda_cccl/pyproject.toml

---
 python/cuda_cooperative/pyproject.toml | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 43aab4e4103..0b791db6bea 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -3,25 +3,22 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = [
-    "setuptools>=61.0.0",
-    "wheel",
-    "packaging",
-]
+requires = ["setuptools>=61.0.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "cuda-cooperative"
 description = "Experimental Core Library for CUDA Python"
-requires-python = ">=3.9"
-dynamic = ["version", "readme", "dependencies"]
 authors = [
-    { name = "NVIDIA Corporation" }
+    { name = "NVIDIA Corporation" },
 ]
 classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Environment :: GPU :: NVIDIA CUDA",
+    "License :: OSI Approved :: Apache Software License",
 ]
+requires-python = ">=3.9"
+dynamic = ["version", "readme", "dependencies"]
 
 [project.optional-dependencies]
 test = [

From 9be94c6785372a27a3556b9df2c61295e971edda Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 21:02:19 -0800
Subject: [PATCH 32/66] Add taplo-pre-commit to .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 27e4a3ec4ea..54ad1368d82 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -46,6 +46,15 @@ repos:
       exclude: "^docs/tools/"
     - id: ruff-format
       exclude: "^docs/tools/"
+
+  - repo: https://github.com/nikaro/taplo-pre-commit
+    rev: main
+    hooks:
+      - id: taplo-lint
+        exclude: "^docs/"
+      - id: taplo-format
+        exclude: "^docs/"
+
   - repo: https://github.com/codespell-project/codespell
     rev: v2.3.0
     hooks:

From c2a9f24e1fac49c874ee981e7890203ba71eb9e0 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 21:05:58 -0800
Subject: [PATCH 33/66] taplo-pre-commit auto-fixes

---
 python/cuda_cccl/pyproject.toml        | 10 ++++------
 python/cuda_cooperative/pyproject.toml | 15 +++++----------
 python/cuda_parallel/pyproject.toml    |  5 +----
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 124abb73679..dcc3097436d 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -9,13 +9,11 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "cuda-cccl"
 description = "Experimental Package with CCCL headers to support JIT compilation"
-authors = [
-    { name = "NVIDIA Corporation" },
-]
+authors = [{ name = "NVIDIA Corporation" }]
 classifiers = [
-    "Programming Language :: Python :: 3 :: Only",
-    "Environment :: GPU :: NVIDIA CUDA",
-    "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python :: 3 :: Only",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "License :: OSI Approved :: Apache Software License",
 ]
 requires-python = ">=3.9"
 dynamic = ["version", "readme"]
diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 0b791db6bea..635d5f665f2 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -9,22 +9,17 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "cuda-cooperative"
 description = "Experimental Core Library for CUDA Python"
-authors = [
-    { name = "NVIDIA Corporation" },
-]
+authors = [{ name = "NVIDIA Corporation" }]
 classifiers = [
-    "Programming Language :: Python :: 3 :: Only",
-    "Environment :: GPU :: NVIDIA CUDA",
-    "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python :: 3 :: Only",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "License :: OSI Approved :: Apache Software License",
 ]
 requires-python = ">=3.9"
 dynamic = ["version", "readme", "dependencies"]
 
 [project.optional-dependencies]
-test = [
-    "pytest",
-    "pytest-xdist",
-]
+test = ["pytest", "pytest-xdist"]
 
 [project.urls]
 Homepage = "https://developer.nvidia.com/"
diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index a07da6723ff..328fdf16611 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -10,9 +10,6 @@ build-backend = "setuptools.build_meta"
 python_version = "3.10"
 
 [[tool.mypy.overrides]]
-module = [
-    "numba.*",
-    "llvmlite"
-]
+module = ["numba.*", "llvmlite"]
 ignore_missing_imports = true
 follow_imports = "skip"

From c89d62077967cdb908ec8e9a944aaec218f84f80 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 21:16:15 -0800
Subject: [PATCH 34/66] Use pathlib in cuda_cooperative/setup.py

---
 python/cuda_cooperative/setup.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index ff462f1054c..152b62b38c6 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -2,12 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import os
+from pathlib import Path
 from setuptools import setup
 
-PROJECT_PATH = os.path.abspath(os.path.dirname(__file__))
-CCCL_PATH = os.path.abspath(os.path.join(PROJECT_PATH, "..", ".."))
-
+CCCL_PATH = Path(__file__).resolve().parents[2]
 
 setup(
     license_files=["../../LICENSE"],

From 1b3599bcfd52255daa83ffb0e0e7527992ce48eb Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 21:36:15 -0800
Subject: [PATCH 35/66] CCCL_PYTHON_PATH in cuda_cooperative/setup.py

---
 python/cuda_cooperative/setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index 152b62b38c6..5b56da58194 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -5,12 +5,12 @@
 from pathlib import Path
 from setuptools import setup
 
-CCCL_PATH = Path(__file__).resolve().parents[2]
+CCCL_PYTHON_PATH = Path(__file__).resolve().parents[1]
 
 setup(
     license_files=["../../LICENSE"],
     install_requires=[
-        f"cuda-cccl @ file://{CCCL_PATH}/python/cuda_cccl",
+        f"cuda-cccl @ file://{CCCL_PYTHON_PATH}/cuda_cccl",
         "numpy",
         "numba>=0.60.0",
         "pynvjitlink-cu12>=0.2.4",

From 796b741973a1de7aae7c34b5fcd446a0c6dd6a91 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 22:00:37 -0800
Subject: [PATCH 36/66] Modernize cuda_parallel/pyproject.toml, setup.py

---
 python/cuda_parallel/pyproject.toml | 24 ++++++++++++++-
 python/cuda_parallel/setup.py       | 48 ++++++-----------------------
 2 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index 328fdf16611..8e536fe400a 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -3,9 +3,31 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 [build-system]
-requires = ["packaging", "setuptools>=61.0.0", "wheel"]
+requires = ["setuptools>=61.0.0"]
 build-backend = "setuptools.build_meta"
 
+[project]
+name = "cuda-parallel"
+description = "Experimental Core Library for CUDA Python"
+authors = [{ name = "NVIDIA Corporation" }]
+classifiers = [
+  "Programming Language :: Python :: 3 :: Only",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "License :: OSI Approved :: Apache Software License",
+]
+requires-python = ">=3.9"
+dynamic = ["version", "readme", "dependencies"]
+
+[project.optional-dependencies]
+test = ["pytest", "pytest-xdist", "cupy-cuda12x"]
+
+[project.urls]
+Homepage = "https://developer.nvidia.com/"
+
+[tool.setuptools.dynamic]
+version = { attr = "cuda.parallel._version.__version__" }
+readme = { file = ["README.md"], content-type = "text/markdown" }
+
 [tool.mypy]
 python_version = "3.10"
 
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 1cca7f1db8a..103260b74bc 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -3,25 +3,14 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import os
+from pathlib import Path
 import subprocess
 
-from setuptools import Extension, setup, find_namespace_packages
+from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 
-
-project_path = os.path.abspath(os.path.dirname(__file__))
-cccl_path = os.path.abspath(os.path.join(project_path, "..", ".."))
-cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]]
-__version__ = None
-with open(os.path.join(project_path, "cuda", "parallel", "_version.py")) as f:
-    exec(f.read())
-assert __version__ is not None
-ver = __version__
-del __version__
-
-
-with open("README.md") as f:
-    long_description = f.read()
+CCCL_PYTHON_PATH = Path(__file__).resolve().parents[1]
+CCCL_PATH = CCCL_PYTHON_PATH.parent
 
 
 class CMakeExtension(Extension):
@@ -45,7 +34,9 @@ def build_extension(self, ext):
         if not os.path.exists(self.build_temp):
             os.makedirs(self.build_temp)
 
-        subprocess.check_call(["cmake", cccl_path] + cmake_args, cwd=self.build_temp)
+        subprocess.check_call(
+            ["cmake", str(CCCL_PATH)] + cmake_args, cwd=self.build_temp
+        )
         subprocess.check_call(
             ["cmake", "--build", ".", "--target", "cccl.c.parallel"],
             cwd=self.build_temp,
@@ -53,36 +44,15 @@ def build_extension(self, ext):
 
 
 setup(
-    name="cuda-parallel",
-    version=ver,
-    description="Experimental Core Library for CUDA Python",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author="NVIDIA Corporation",
-    classifiers=[
-        "Programming Language :: Python :: 3 :: Only",
-        "Environment :: GPU :: NVIDIA CUDA",
-    ],
-    packages=find_namespace_packages(include=["cuda.*"]),
-    python_requires=">=3.9",
+    license_files=["../../LICENSE"],
     install_requires=[
-        f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl",
+        f"cuda-cccl @ file://{CCCL_PYTHON_PATH}/cuda_cccl",
         "numba>=0.60.0",
         "cuda-python",
         "jinja2",
     ],
-    extras_require={
-        "test": [
-            "pytest",
-            "pytest-xdist",
-            "cupy-cuda12x",
-        ]
-    },
     cmdclass={
         "build_ext": BuildCMakeExtension,
     },
     ext_modules=[CMakeExtension("cuda.parallel.experimental.cccl.c")],
-    include_package_data=True,
-    license="Apache-2.0 with LLVM exception",
-    license_files=("../../LICENSE",),
 )

From 9a63830401af0ffa2f26cde3b637a408bb1aaa3a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 22:14:43 -0800
Subject: [PATCH 37/66] Use pathlib in cuda_parallel/setup.py

---
 python/cuda_parallel/setup.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 103260b74bc..aef013f8401 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import os
 from pathlib import Path
 import subprocess
 
@@ -24,22 +23,20 @@ def run(self):
             self.build_extension(ext)
 
     def build_extension(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        extdir = Path(self.get_ext_fullpath(ext.name)).resolve().parent
         cmake_args = [
             "-DCCCL_ENABLE_C=YES",
-            "-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
+            f"-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY={extdir}",
             "-DCMAKE_BUILD_TYPE=Release",
         ]
 
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
+        build_temp_path = Path(self.build_temp)
+        build_temp_path.mkdir(parents=True, exist_ok=True)
 
-        subprocess.check_call(
-            ["cmake", str(CCCL_PATH)] + cmake_args, cwd=self.build_temp
-        )
+        subprocess.check_call(["cmake", CCCL_PATH] + cmake_args, cwd=build_temp_path)
         subprocess.check_call(
             ["cmake", "--build", ".", "--target", "cccl.c.parallel"],
-            cwd=self.build_temp,
+            cwd=build_temp_path,
         )
 
 

From 477fe3b5d1dac2703a08799ed8c7b0ecc64c5bdd Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 23:22:22 -0800
Subject: [PATCH 38/66] Add `# TOML lint & format` comment.

---
 .pre-commit-config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 54ad1368d82..0f4a2872637 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,6 +47,7 @@ repos:
     - id: ruff-format
       exclude: "^docs/tools/"
 
+  # TOML lint & format
   - repo: https://github.com/nikaro/taplo-pre-commit
     rev: main
     hooks:

From 246ddf7d18856b8a38658d9f80c1ad5ad7608e67 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 23:36:59 -0800
Subject: [PATCH 39/66] Replace MANIFEST.in with
 `[tool.setuptools.package-data]` section in pyproject.toml

---
 python/cuda_cccl/MANIFEST.in    | 1 -
 python/cuda_cccl/pyproject.toml | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)
 delete mode 100644 python/cuda_cccl/MANIFEST.in

diff --git a/python/cuda_cccl/MANIFEST.in b/python/cuda_cccl/MANIFEST.in
deleted file mode 100644
index 55d6b5f63ba..00000000000
--- a/python/cuda_cccl/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include cuda/cccl/include *
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index dcc3097436d..294b34c1bee 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -24,3 +24,6 @@ Homepage = "https://github.com/NVIDIA/cccl"
 [tool.setuptools.dynamic]
 version = { attr = "cuda.cccl._version.__version__" }
 readme = { file = ["README.md"], content-type = "text/markdown" }
+
+[tool.setuptools.package-data]
+cuda = ["cccl/include/**/*"]

From e1fd264e484c216b401700e3430e0eb0c4283d4e Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 27 Dec 2024 23:49:50 -0800
Subject: [PATCH 40/66] Use pathlib in cuda/cccl/include_paths.py

---
 python/cuda_cccl/cuda/cccl/include_paths.py | 33 +++++++++++----------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py
index 3f1da5ef993..c984e5b4237 100644
--- a/python/cuda_cccl/cuda/cccl/include_paths.py
+++ b/python/cuda_cccl/cuda/cccl/include_paths.py
@@ -5,21 +5,24 @@
 from dataclasses import dataclass
 from functools import lru_cache
 import os
+from pathlib import Path
 import shutil
 from typing import Optional
 
 
-def _get_cuda_path() -> Optional[str]:
+def _get_cuda_path() -> Optional[Path]:
     cuda_path = os.environ.get("CUDA_PATH")
-    if cuda_path and os.path.exists(cuda_path):
-        return cuda_path
+    if cuda_path:
+        cuda_path = Path(cuda_path)
+        if cuda_path.exists():
+            return cuda_path
 
     nvcc_path = shutil.which("nvcc")
-    if nvcc_path is not None:
-        return os.path.dirname(os.path.dirname(nvcc_path))
+    if nvcc_path:
+        return Path(nvcc_path).parent.parent
 
-    default_path = "/usr/local/cuda"
-    if os.path.exists(default_path):
+    default_path = Path("/usr/local/cuda")
+    if default_path.exists():
         return default_path
 
     return None
@@ -27,10 +30,10 @@ def _get_cuda_path() -> Optional[str]:
 
 @dataclass
 class IncludePaths:
-    cuda: Optional[str]
-    libcudacxx: Optional[str]
-    cub: Optional[str]
-    thrust: Optional[str]
+    cuda: Optional[Path]
+    libcudacxx: Optional[Path]
+    cub: Optional[Path]
+    thrust: Optional[Path]
 
     def as_tuple(self):
         # Note: higher-level ... lower-level order:
@@ -46,15 +49,15 @@ def get_include_paths() -> IncludePaths:
     cuda_incl = None
     cuda_path = _get_cuda_path()
     if cuda_path is not None:
-        cuda_incl = os.path.join(cuda_path, "include")
+        cuda_incl = cuda_path / "include"
 
     with as_file(files("cuda.cccl.include")) as f:
-        cccl_incl = str(f)
-    assert os.path.exists(cccl_incl)
+        cccl_incl = Path(f)
+    assert cccl_incl.exists()
 
     return IncludePaths(
         cuda=cuda_incl,
-        libcudacxx=os.path.join(cccl_incl, "libcudacxx"),
+        libcudacxx=cccl_incl / "libcudacxx",
         cub=cccl_incl,
         thrust=cccl_incl,
     )

From 87b46ca16053f62cd9b5fc9a45f719e30a8c6464 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 28 Dec 2024 21:07:48 -0800
Subject: [PATCH 41/66] pre-commit autoupdate (EXCEPT clang-format, which was
 manually restored)

---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0f4a2872637..f088b2d1ce8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
   # TODO/REMINDER: add the Ruff vscode extension to the devcontainers
   # Ruff, the Python auto-correcting linter/formatter written in Rust
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.8.3
+    rev: v0.8.4
     hooks:
     - id: ruff
       args: ["--fix", "--show-fixes"]
@@ -49,7 +49,7 @@ repos:
 
   # TOML lint & format
   - repo: https://github.com/nikaro/taplo-pre-commit
-    rev: main
+    rev: 0.1.1
     hooks:
       - id: taplo-lint
         exclude: "^docs/"
@@ -70,7 +70,7 @@ repos:
 
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.13.0'
+    rev: 'v1.14.0'
     hooks:
       - id: mypy
         additional_dependencies: [types-cachetools, numpy]

From eddc6cc133f3c6f67fe7cce99198b5d4184277be Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sun, 5 Jan 2025 23:38:38 -0800
Subject: [PATCH 42/66] Fixes after git merge main

---
 python/cuda_cccl/cuda/cccl/include_paths.py                   | 4 ++--
 python/cuda_cccl/setup.py                                     | 3 ++-
 .../cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py  | 3 ++-
 python/cuda_cooperative/setup.py                              | 1 +
 python/cuda_parallel/pyproject.toml                           | 2 +-
 python/cuda_parallel/setup.py                                 | 2 +-
 6 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py
index c984e5b4237..0896d5836cc 100644
--- a/python/cuda_cccl/cuda/cccl/include_paths.py
+++ b/python/cuda_cccl/cuda/cccl/include_paths.py
@@ -2,11 +2,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+import os
+import shutil
 from dataclasses import dataclass
 from functools import lru_cache
-import os
 from pathlib import Path
-import shutil
 from typing import Optional
 
 
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 4f84bee20bb..740741f6e98 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+import shutil
 from pathlib import Path
+
 from setuptools import setup
-import shutil
 
 PROJECT_PATH = Path(__file__).resolve().parent
 CCCL_PATH = PROJECT_PATH.parents[1]
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index 7cae5c30d39..b317c1bb0c2 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -2,10 +2,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+import functools
+
 from cuda.bindings import nvrtc
 from cuda.cooperative.experimental._caching import disk_cache
 from cuda.cooperative.experimental._common import check_in, version
-import functools
 
 
 def CHECK_NVRTC(err, prog):
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index 5b56da58194..37e1310ffb4 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from pathlib import Path
+
 from setuptools import setup
 
 CCCL_PYTHON_PATH = Path(__file__).resolve().parents[1]
diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index b374b66e57b..ecbbc5d014d 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -19,7 +19,7 @@ requires-python = ">=3.9"
 dynamic = ["version", "readme", "dependencies"]
 
 [project.optional-dependencies]
-test = ["pytest", "pytest-xdist", "cupy-cuda12x"]
+test = ["pytest", "pytest-xdist", "cupy-cuda12x", "typing_extensions"]
 
 [project.urls]
 Homepage = "https://developer.nvidia.com/"
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index aef013f8401..f300bbccc47 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -2,8 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from pathlib import Path
 import subprocess
+from pathlib import Path
 
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext

From bcf0de8caa97f0ca6f1395092fd56632ff1a9d81 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 6 Jan 2025 00:10:30 -0800
Subject: [PATCH 43/66] Resolve warning: AttributeError: '_Reduce' object has
 no attribute 'build_result'

```
=========================================================================== warnings summary ===========================================================================
tests/test_reduce.py::test_reduce_non_contiguous
  /home/coder/cccl/python/devenv/lib/python3.12/site-packages/_pytest/unraisableexception.py:85: PytestUnraisableExceptionWarning: Exception ignored in: <function _Reduce.__del__ at 0x7bf123139080>

  Traceback (most recent call last):
    File "/home/coder/cccl/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py", line 132, in __del__
      bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result))
                                                       ^^^^^^^^^^^^^^^^^
  AttributeError: '_Reduce' object has no attribute 'build_result'

    warnings.warn(pytest.PytestUnraisableExceptionWarning(msg))

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
============================================================= 1 passed, 93 deselected, 1 warning in 0.44s ==============================================================
```
---
 .../cuda/parallel/experimental/algorithms/reduce.py          | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index b99b5c4c9e1..41c0a3449e2 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -55,6 +55,9 @@ def __init__(
         op: Callable,
         h_init: np.ndarray,
     ):
+        # Referenced from __del__:
+        self.build_result = None
+
         d_in_cccl = cccl.to_cccl_iter(d_in)
         self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name(
             d_in_cccl.value_type.type.value
@@ -128,6 +131,8 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray
         return temp_storage_bytes.value
 
     def __del__(self):
+        if self.build_result is None:
+            return
         bindings = get_bindings()
         bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result))
 

From 71fd243baab28a84739406e05927854e9a863f1f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 7 Jan 2025 18:47:54 -0800
Subject: [PATCH 44/66] Move `copy_cccl_headers_to_cuda_cccl_include()`
 functionality to `class CustomBuildPy`

---
 python/cuda_cccl/setup.py | 46 ++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 740741f6e98..0f5050025c4 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -6,34 +6,46 @@
 from pathlib import Path
 
 from setuptools import setup
+from setuptools.command.build_py import build_py
 
 PROJECT_PATH = Path(__file__).resolve().parent
 CCCL_PATH = PROJECT_PATH.parents[1]
 
 
-def copy_cccl_headers_to_cuda_cccl_include():
-    cccl_headers = [
-        ("cub", "cub"),
-        ("libcudacxx", "include"),
-        ("thrust", "thrust"),
-    ]
+class CustomBuildPy(build_py):
+    """Copy CCCL headers BEFORE super().run()
 
-    inc_path = PROJECT_PATH / "cuda" / "cccl" / "include"
-    inc_path.mkdir(parents=True, exist_ok=True)
+    Note that the CCCL headers cannot be referenced directly:
+    setuptools (and pyproject.toml) does not support relative paths that
+    reference files outside the package directory (like ../../).
+    This is a restriction designed to avoid inadvertently packaging files
+    that are outside the source tree.
+    """
 
-    for proj_dir, header_dir in cccl_headers:
-        src_path = CCCL_PATH / proj_dir / header_dir
-        dst_path = inc_path / proj_dir
-        if dst_path.exists():
-            shutil.rmtree(dst_path)
-        shutil.copytree(src_path, dst_path)
+    def run(self):
+        cccl_headers = [
+            ("cub", "cub"),
+            ("libcudacxx", "include"),
+            ("thrust", "thrust"),
+        ]
 
-    init_py_path = inc_path / "__init__.py"
-    init_py_path.write_text("# Intentionally empty.\n")
+        inc_path = PROJECT_PATH / "cuda" / "cccl" / "include"
+        inc_path.mkdir(parents=True, exist_ok=True)
 
+        for proj_dir, header_dir in cccl_headers:
+            src_path = CCCL_PATH / proj_dir / header_dir
+            dst_path = inc_path / proj_dir
+            if dst_path.exists():
+                shutil.rmtree(dst_path)
+            shutil.copytree(src_path, dst_path)
+
+        init_py_path = inc_path / "__init__.py"
+        init_py_path.write_text("# Intentionally empty.\n")
+
+        super().run()
 
-copy_cccl_headers_to_cuda_cccl_include()
 
 setup(
     license_files=["../../LICENSE"],
+    cmdclass={"build_py": CustomBuildPy},
 )

From 79057cf0eda3623e201233c4b401f6e32b0a9b7e Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 7 Jan 2025 23:58:55 -0800
Subject: [PATCH 45/66] Introduce cuda_cooperative/constraints.txt

---
 python/cuda_cooperative/constraints.txt |  1 +
 python/cuda_cooperative/pyproject.toml  | 10 +++++++++-
 python/cuda_cooperative/setup.py        | 12 ------------
 3 files changed, 10 insertions(+), 13 deletions(-)
 create mode 100644 python/cuda_cooperative/constraints.txt

diff --git a/python/cuda_cooperative/constraints.txt b/python/cuda_cooperative/constraints.txt
new file mode 100644
index 00000000000..a297d6cc4cc
--- /dev/null
+++ b/python/cuda_cooperative/constraints.txt
@@ -0,0 +1 @@
+cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl
diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 7e91891c8e3..62caab2265f 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -16,7 +16,15 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
 ]
 requires-python = ">=3.9"
-dynamic = ["version", "readme", "dependencies"]
+dependencies = [
+  "cuda-cccl",
+  "numpy",
+  "numba>=0.60.0",
+  "pynvjitlink-cu12>=0.2.4",
+  "cuda-python",
+  "jinja2",
+]
+dynamic = ["version", "readme"]
 
 [project.optional-dependencies]
 test = ["pytest", "pytest-xdist"]
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index 37e1310ffb4..c5461a6bab6 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -2,20 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from pathlib import Path
-
 from setuptools import setup
 
-CCCL_PYTHON_PATH = Path(__file__).resolve().parents[1]
-
 setup(
     license_files=["../../LICENSE"],
-    install_requires=[
-        f"cuda-cccl @ file://{CCCL_PYTHON_PATH}/cuda_cccl",
-        "numpy",
-        "numba>=0.60.0",
-        "pynvjitlink-cu12>=0.2.4",
-        "cuda-python",
-        "jinja2",
-    ],
 )

From 46a83297b657c3a1a6ecbbf93ff344d879b37eb7 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 8 Jan 2025 20:50:46 -0800
Subject: [PATCH 46/66] Also add cuda_parallel/constraints.txt

---
 python/cuda_parallel/constraints.txt | 1 +
 python/cuda_parallel/pyproject.toml  | 3 ++-
 python/cuda_parallel/setup.py        | 6 ------
 3 files changed, 3 insertions(+), 7 deletions(-)
 create mode 100644 python/cuda_parallel/constraints.txt

diff --git a/python/cuda_parallel/constraints.txt b/python/cuda_parallel/constraints.txt
new file mode 100644
index 00000000000..a297d6cc4cc
--- /dev/null
+++ b/python/cuda_parallel/constraints.txt
@@ -0,0 +1 @@
+cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl
diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index ecbbc5d014d..2e66230a45c 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -16,7 +16,8 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
 ]
 requires-python = ">=3.9"
-dynamic = ["version", "readme", "dependencies"]
+dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python", "jinja2"]
+dynamic = ["version", "readme"]
 
 [project.optional-dependencies]
 test = ["pytest", "pytest-xdist", "cupy-cuda12x", "typing_extensions"]
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index f300bbccc47..0e7750a8212 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -42,12 +42,6 @@ def build_extension(self, ext):
 
 setup(
     license_files=["../../LICENSE"],
-    install_requires=[
-        f"cuda-cccl @ file://{CCCL_PYTHON_PATH}/cuda_cccl",
-        "numba>=0.60.0",
-        "cuda-python",
-        "jinja2",
-    ],
     cmdclass={
         "build_ext": BuildCMakeExtension,
     },

From a07222b80b216f1e56d056b59c100ef5a716d1cb Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 8 Jan 2025 20:58:05 -0800
Subject: [PATCH 47/66] Add `--constraint constraints.txt` in ci/test_python.sh

---
 ci/test_python.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 89559712069..f8df309de32 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -19,14 +19,14 @@ popd >/dev/null
 
 pushd ../python/cuda_cooperative >/dev/null
 
-run_command "⚙️  Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
+run_command "⚙️  Pip install cuda_cooperative" pip install --constraint constraints.txt --force-reinstall --upgrade --target "${prefix}" .[test]
 run_command "🚀  Pytest cuda_cooperative" python -m pytest -v ./tests
 
 popd >/dev/null
 
 pushd ../python/cuda_parallel >/dev/null
 
-run_command "⚙️  Pip install cuda_parallel" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
+run_command "⚙️  Pip install cuda_parallel" pip install --constraint constraints.txt --force-reinstall --upgrade --target "${prefix}" .[test]
 run_command "🚀  Pytest cuda_parallel" python -m pytest -v ./tests
 
 popd >/dev/null

From b65f5105d91ec40e248d7c544040b137159469a1 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 14 Jan 2025 09:38:08 -0800
Subject: [PATCH 48/66] Update Copyright dates

---
 python/cuda_cccl/cuda/cccl/__init__.py      | 2 +-
 python/cuda_cccl/cuda/cccl/_version.py      | 2 +-
 python/cuda_cccl/cuda/cccl/include_paths.py | 2 +-
 python/cuda_cccl/pyproject.toml             | 2 +-
 python/cuda_cccl/setup.py                   | 2 +-
 python/cuda_cooperative/pyproject.toml      | 2 +-
 python/cuda_cooperative/setup.py            | 2 +-
 python/cuda_parallel/pyproject.toml         | 2 +-
 python/cuda_parallel/setup.py               | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/cuda_cccl/cuda/cccl/__init__.py b/python/cuda_cccl/cuda/cccl/__init__.py
index 977ba51caec..4eccba4ca64 100644
--- a/python/cuda_cccl/cuda/cccl/__init__.py
+++ b/python/cuda_cccl/cuda/cccl/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_cccl/cuda/cccl/_version.py b/python/cuda_cccl/cuda/cccl/_version.py
index f9961cad366..ec7c29a266e 100644
--- a/python/cuda_cccl/cuda/cccl/_version.py
+++ b/python/cuda_cccl/cuda/cccl/_version.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py
index 0896d5836cc..da8246b9195 100644
--- a/python/cuda_cccl/cuda/cccl/include_paths.py
+++ b/python/cuda_cccl/cuda/cccl/include_paths.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 294b34c1bee..ada06301a4c 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py
index 0f5050025c4..f6e5e3fa033 100644
--- a/python/cuda_cccl/setup.py
+++ b/python/cuda_cccl/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index 62caab2265f..c7c4a2dcf6f 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py
index c5461a6bab6..b8dd6502515 100644
--- a/python/cuda_cooperative/setup.py
+++ b/python/cuda_cooperative/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index 2e66230a45c..ba09c632a1f 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 0e7750a8212..c5c9fcd3c32 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 

From 47893d9542ffc239f5477df7ba713c6f2dd533c2 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 14 Jan 2025 12:11:09 -0800
Subject: [PATCH 49/66] Switch to https://github.com/ComPWA/taplo-pre-commit
 (the other repo has been archived by the owner on Jul 1, 2024)

For completeness: The other repo took a long time to install into the pre-commit cache; so long that it led to timeouts in the CCCL CI.
---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 457d0400633..16903b7c13f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -45,8 +45,8 @@ repos:
     - id: ruff-format  # formatter
 
   # TOML lint & format
-  - repo: https://github.com/nikaro/taplo-pre-commit
-    rev: 0.1.1
+  - repo: https://github.com/ComPWA/taplo-pre-commit
+    rev: v0.9.3
     hooks:
       - id: taplo-lint
         exclude: "^docs/"

From 324ac4feaac8603885d06054fcc9d3b19a4759b9 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 14 Jan 2025 15:27:37 -0800
Subject: [PATCH 50/66] Remove unused cuda_parallel jinja2 dependency (noticed
 by chance).

---
 python/cuda_parallel/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index ba09c632a1f..fb95a7477da 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
 ]
 requires-python = ">=3.9"
-dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python", "jinja2"]
+dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python"]
 dynamic = ["version", "readme"]
 
 [project.optional-dependencies]

From e9048466dd1de223eadb33c7ab95e8d28160bcc4 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 10:33:56 -0800
Subject: [PATCH 51/66] Remove constraints.txt files, advertise running `pip
 install cuda-cccl` first instead.

---
 python/cuda_cooperative/README.md       | 1 +
 python/cuda_cooperative/constraints.txt | 1 -
 python/cuda_parallel/README.md          | 1 +
 python/cuda_parallel/constraints.txt    | 1 -
 4 files changed, 2 insertions(+), 2 deletions(-)
 delete mode 100644 python/cuda_cooperative/constraints.txt
 delete mode 100644 python/cuda_parallel/constraints.txt

diff --git a/python/cuda_cooperative/README.md b/python/cuda_cooperative/README.md
index c202d1d6c17..673e130bbe0 100644
--- a/python/cuda_cooperative/README.md
+++ b/python/cuda_cooperative/README.md
@@ -7,6 +7,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 ## Local development
 
 ```bash
+pip3 install -e ../cuda_cccl
 pip3 install -e .[test]
 pytest -v ./tests/
 ```
diff --git a/python/cuda_cooperative/constraints.txt b/python/cuda_cooperative/constraints.txt
deleted file mode 100644
index a297d6cc4cc..00000000000
--- a/python/cuda_cooperative/constraints.txt
+++ /dev/null
@@ -1 +0,0 @@
-cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl
diff --git a/python/cuda_parallel/README.md b/python/cuda_parallel/README.md
index 98a3a3c92d0..1dad4b0f03e 100644
--- a/python/cuda_parallel/README.md
+++ b/python/cuda_parallel/README.md
@@ -7,6 +7,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html.
 ## Local development
 
 ```bash
+pip3 install -e ../cuda_cccl
 pip3 install -e .[test]
 pytest -v ./tests/
 ```
diff --git a/python/cuda_parallel/constraints.txt b/python/cuda_parallel/constraints.txt
deleted file mode 100644
index a297d6cc4cc..00000000000
--- a/python/cuda_parallel/constraints.txt
+++ /dev/null
@@ -1 +0,0 @@
-cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl

From c1f571d09d76e6ba0f42fa2c36bd803b7258c62a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 11:06:54 -0800
Subject: [PATCH 52/66] Make cuda_cooperative, cuda_parallel testing completely
 independent.

---
 ci/test_python.sh | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index f8df309de32..08229ef75b9 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -8,26 +8,31 @@ print_environment_details
 
 fail_if_no_gpu
 
-readonly prefix="${BUILD_DIR}/python/"
-export PYTHONPATH="${prefix}:${PYTHONPATH:-}"
-
-pushd ../python/cuda_cccl >/dev/null
-
-run_command "⚙️  Pip install cuda_cccl" pip install --force-reinstall --upgrade --target "${prefix}" .
-
-popd >/dev/null
+begin_group "⚙️ Existing site-packages"
+pip freeze
+end_group "⚙️ Existing site-packages"
 
 pushd ../python/cuda_cooperative >/dev/null
 
-run_command "⚙️  Pip install cuda_cooperative" pip install --constraint constraints.txt --force-reinstall --upgrade --target "${prefix}" .[test]
+rm -rf /tmp/cuda_cooperative_venv
+python -m venv /tmp/cuda_cooperative_venv
+. /tmp/cuda_cooperative_venv/bin/activate
+echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
+run_command "⚙️  Pip install cuda_cooperative" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
 run_command "🚀  Pytest cuda_cooperative" python -m pytest -v ./tests
+deactivate
 
 popd >/dev/null
 
 pushd ../python/cuda_parallel >/dev/null
 
-run_command "⚙️  Pip install cuda_parallel" pip install --constraint constraints.txt --force-reinstall --upgrade --target "${prefix}" .[test]
+rm -rf /tmp/cuda_parallel_venv
+python -m venv /tmp/cuda_parallel_venv
+. /tmp/cuda_parallel_venv/bin/activate
+echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
+run_command "⚙️  Pip install cuda_parallel" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
 run_command "🚀  Pytest cuda_parallel" python -m pytest -v ./tests
+deactivate
 
 popd >/dev/null
 

From 695cc9b58fa48b8ff45b268d5fca255b41857966 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 11:41:12 -0800
Subject: [PATCH 53/66] Run only test_python.sh
 [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 ci/matrix.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 881f553f65d..13533deac52 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,6 +8,7 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
+    - {jobs: ['test'], project: ['python'], ctk: '12.6'}
 
   pull_request:
     # Old CTK/compiler

From ea33a218ed77a075156cd1b332047202adb25aa2 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 11:55:55 -0800
Subject: [PATCH 54/66] Try using another runner (because V100 runners seem to
 be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 ci/matrix.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 13533deac52..793a825aa9d 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,7 +8,7 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: ['python'], ctk: '12.6'}
+    - {jobs: ['test'], project: ['python'], ctk: '12.6', gpu: 'h100'}
 
   pull_request:
     # Old CTK/compiler

From d439f79dc7cbf628f3da42952a5862fe978e53ce Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 15 Jan 2025 21:57:16 +0100
Subject: [PATCH 55/66] Fix sign-compare warning (#3408)
 [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 cub/test/catch2_segmented_sort_helper.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh
index eccb4cbcad4..f8a081a125a 100644
--- a/cub/test/catch2_segmented_sort_helper.cuh
+++ b/cub/test/catch2_segmented_sort_helper.cuh
@@ -250,7 +250,7 @@ public:
           auto const next_end =
             (uniques_index == count - 1) ? out_keys.size() : h_unique_indexes_out[uniques_index + 1];
           REQUIRE(h_unique_keys_out[uniques_index] == i);
-          REQUIRE(next_end - h_unique_indexes_out[uniques_index] == segment_histogram[i]);
+          REQUIRE(next_end - h_unique_indexes_out[uniques_index] == static_cast<std::size_t>(segment_histogram[i]));
           current_offset += segment_histogram[i];
           uniques_index++;
         }

From 9a7b498c0970ac43975b73fb3245e9235ad75fa1 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 13:49:05 -0800
Subject: [PATCH 56/66] Revert "Try using another runner (because V100 runners
 seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc]"

This reverts commit ea33a218ed77a075156cd1b332047202adb25aa2.

Error message: https://github.com/NVIDIA/cccl/pull/3201#issuecomment-2594012971
---
 ci/matrix.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 793a825aa9d..13533deac52 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,7 +8,7 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: ['python'], ctk: '12.6', gpu: 'h100'}
+    - {jobs: ['test'], project: ['python'], ctk: '12.6'}
 
   pull_request:
     # Old CTK/compiler

From be3483414e24a1c98b5e69e3d967631077ee5ed6 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 13:59:51 -0800
Subject: [PATCH 57/66] Try using A100 runner (because V100 runners still seem
 to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 ci/matrix.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 13533deac52..128acef4577 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,7 +8,7 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: ['python'], ctk: '12.6'}
+    - {jobs: ['test'], project: ['python'], ctk: '12.6', gpu: 'a100'}
 
   pull_request:
     # Old CTK/compiler

From b2b2b5b648a0927de0713cfae0372af270c66917 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 14:03:07 -0800
Subject: [PATCH 58/66] Also show cuda-cooperative site-packages, cuda-parallel
 site-packages (after pip install)
 [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 ci/test_python.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 08229ef75b9..290cfdbacec 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -19,6 +19,9 @@ python -m venv /tmp/cuda_cooperative_venv
 . /tmp/cuda_cooperative_venv/bin/activate
 echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
 run_command "⚙️  Pip install cuda_cooperative" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
+begin_group "⚙️ cuda-cooperative site-packages"
+pip freeze
+end_group "⚙️ cuda-cooperative site-packages"
 run_command "🚀  Pytest cuda_cooperative" python -m pytest -v ./tests
 deactivate
 
@@ -31,6 +34,9 @@ python -m venv /tmp/cuda_parallel_venv
 . /tmp/cuda_parallel_venv/bin/activate
 echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
 run_command "⚙️  Pip install cuda_parallel" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
+begin_group "⚙️ cuda-parallel site-packages"
+pip freeze
+end_group "⚙️ cuda-parallel site-packages"
 run_command "🚀  Pytest cuda_parallel" python -m pytest -v ./tests
 deactivate
 

From 9f83b0daa9376987d9f910d5a48cee202d34b5a0 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 14:14:22 -0800
Subject: [PATCH 59/66] Try using l4 runner (because V100 runners still seem to
 be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 ci/matrix.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 128acef4577..b59e7364b62 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,7 +8,7 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: ['python'], ctk: '12.6', gpu: 'a100'}
+    - {jobs: ['test'], project: ['python'], ctk: '12.6', gpu: 'l4'}
 
   pull_request:
     # Old CTK/compiler

From 4807a796932a998885758909a78892f879c271f4 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 16:36:25 -0800
Subject: [PATCH 60/66] Restore original ci/matrix.yaml [skip-rapids]

---
 ci/matrix.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index b59e7364b62..881f553f65d 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,7 +8,6 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: ['python'], ctk: '12.6', gpu: 'l4'}
 
   pull_request:
     # Old CTK/compiler

From d97a68a2049d9862c79d28a30d794dc35fae3cd2 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 21:01:47 -0800
Subject: [PATCH 61/66] Use for loop in test_python.sh to avoid code
 duplication.

---
 ci/test_python.sh | 40 +++++++++++++++-------------------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 290cfdbacec..34900fdb8e0 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -12,34 +12,24 @@ begin_group "⚙️ Existing site-packages"
 pip freeze
 end_group "⚙️ Existing site-packages"
 
-pushd ../python/cuda_cooperative >/dev/null
-
-rm -rf /tmp/cuda_cooperative_venv
-python -m venv /tmp/cuda_cooperative_venv
-. /tmp/cuda_cooperative_venv/bin/activate
-echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
-run_command "⚙️  Pip install cuda_cooperative" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
-begin_group "⚙️ cuda-cooperative site-packages"
-pip freeze
-end_group "⚙️ cuda-cooperative site-packages"
-run_command "🚀  Pytest cuda_cooperative" python -m pytest -v ./tests
-deactivate
+for module in cuda_parallel cuda_cooperative; do
 
-popd >/dev/null
+  pushd "../python/${module}" >/dev/null
 
-pushd ../python/cuda_parallel >/dev/null
+  TEMP_VENV_DIR="/tmp/${module}_venv"
+  rm -rf "${TEMP_VENV_DIR}"
+  python -m venv "${TEMP_VENV_DIR}"
+  . "${TEMP_VENV_DIR}/bin/activate"
+  echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
+  run_command "⚙️  Pip install ${module}" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
+  begin_group "⚙️ ${module} site-packages"
+  pip freeze
+  end_group "⚙️ ${module} site-packages"
+  run_command "🚀  Pytest ${module}" python -m pytest -v ./tests
+  deactivate
 
-rm -rf /tmp/cuda_parallel_venv
-python -m venv /tmp/cuda_parallel_venv
-. /tmp/cuda_parallel_venv/bin/activate
-echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
-run_command "⚙️  Pip install cuda_parallel" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
-begin_group "⚙️ cuda-parallel site-packages"
-pip freeze
-end_group "⚙️ cuda-parallel site-packages"
-run_command "🚀  Pytest cuda_parallel" python -m pytest -v ./tests
-deactivate
+  popd >/dev/null
 
-popd >/dev/null
+done
 
 print_time_summary

From ec206fd8b50a6a293e00a5825b579e125010b13d Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Jan 2025 11:41:12 -0800
Subject: [PATCH 62/66] Run only test_python.sh
 [skip-rapids][skip-matx][skip-docs][skip-vdc][skip pre-commit.ci]

---
 ci/matrix.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 881f553f65d..13533deac52 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,6 +8,7 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
+    - {jobs: ['test'], project: ['python'], ctk: '12.6'}
 
   pull_request:
     # Old CTK/compiler

From f94bbb12ca323c1c8d50b4b05b162be8a4843614 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 16 Jan 2025 09:38:42 -0800
Subject: [PATCH 63/66] Comment out taplo-lint in pre-commit config
 [skip-rapids][skip-matx][skip-docs][skip-vdc]

---
 .pre-commit-config.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 16903b7c13f..e61d2f349ea 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,8 +48,9 @@ repos:
   - repo: https://github.com/ComPWA/taplo-pre-commit
     rev: v0.9.3
     hooks:
-      - id: taplo-lint
-        exclude: "^docs/"
+      # See https://github.com/NVIDIA/cccl/issues/3426
+      # - id: taplo-lint
+      #   exclude: "^docs/"
       - id: taplo-format
         exclude: "^docs/"
 

From b48f8660c54937ff0348f8bdbd3c5769b62ea8fb Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 16 Jan 2025 10:02:13 -0800
Subject: [PATCH 64/66] Revert "Run only test_python.sh
 [skip-rapids][skip-matx][skip-docs][skip-vdc][skip pre-commit.ci]"

This reverts commit ec206fd8b50a6a293e00a5825b579e125010b13d.
---
 ci/matrix.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 13533deac52..881f553f65d 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,7 +8,6 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: ['python'], ctk: '12.6'}
 
   pull_request:
     # Old CTK/compiler

From 917147f531805ac36ee7766589a10b7558b7ca62 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 16 Jan 2025 10:51:00 -0800
Subject: [PATCH 65/66] Implement suggestion by @shwina
 (https://github.com/NVIDIA/cccl/pull/3201#pullrequestreview-2556918460)

---
 python/cuda_cccl/cuda/cccl/__init__.py                         | 3 ++-
 .../cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py   | 2 +-
 python/cuda_parallel/cuda/parallel/experimental/_bindings.py   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/cuda_cccl/cuda/cccl/__init__.py b/python/cuda_cccl/cuda/cccl/__init__.py
index 4eccba4ca64..5288f071942 100644
--- a/python/cuda_cccl/cuda/cccl/__init__.py
+++ b/python/cuda_cccl/cuda/cccl/__init__.py
@@ -3,5 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from cuda.cccl._version import __version__
+from cuda.cccl.include_paths import get_include_paths
 
-__all__ = ["__version__"]
+__all__ = ["__version__", "get_include_paths"]
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index b317c1bb0c2..a778a08f896 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -31,7 +31,7 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
     opts = [b"--std=c++17"]
 
     # TODO: move this to a module-level import (after docs env modernization).
-    from cuda.cccl.include_paths import get_include_paths
+    from cuda.cccl import get_include_paths
 
     for path in get_include_paths().as_tuple():
         if path:
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
index 0585fca9e71..ffc35ee2a87 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py
@@ -7,7 +7,7 @@
 from functools import lru_cache
 from typing import List
 
-from cuda.cccl.include_paths import get_include_paths  # type: ignore[import-not-found]
+from cuda.cccl import get_include_paths  # type: ignore[import-not-found]
 
 from . import _cccl as cccl
 

From 12dbf295067dd5ca86d7b6dbc87c86ae67960250 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 16 Jan 2025 12:03:47 -0800
Subject: [PATCH 66/66] Address feedback by @leofang

---
 python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py | 2 +-
 python/cuda_cooperative/pyproject.toml                          | 2 +-
 python/cuda_parallel/pyproject.toml                             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
index a778a08f896..a1d269fd987 100644
--- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
+++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py
@@ -34,7 +34,7 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version):
     from cuda.cccl import get_include_paths
 
     for path in get_include_paths().as_tuple():
-        if path:
+        if path is not None:
             opts += [f"--include-path={path}".encode("ascii")]
     opts += [f"--gpu-architecture=compute_{cc}".encode("ascii")]
     if rdc:
diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml
index c7c4a2dcf6f..788e1e6d5d8 100644
--- a/python/cuda_cooperative/pyproject.toml
+++ b/python/cuda_cooperative/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
   "numpy",
   "numba>=0.60.0",
   "pynvjitlink-cu12>=0.2.4",
-  "cuda-python",
+  "cuda-python==12.*",
   "jinja2",
 ]
 dynamic = ["version", "readme"]
diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml
index fb95a7477da..e7d2b9f0081 100644
--- a/python/cuda_parallel/pyproject.toml
+++ b/python/cuda_parallel/pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
 ]
 requires-python = ">=3.9"
-dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python"]
+dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python==12.*"]
 dynamic = ["version", "readme"]
 
 [project.optional-dependencies]