From e8d57c3d0074fcb59f7bc2c2b209c58a01c03be0 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 4 Oct 2024 09:13:49 +0700 Subject: [PATCH 1/9] [FEA]: Validate cuda.parallel type matching in build and execution (#2429) * Brute force experiment: Which tests fail after adding an `assert False`? * Do not include our own string.h file (#2444) That might conflict with the host standard library * Add `_dtype_validation()` in python/cuda_parallel/cuda/parallel/experimental/__init__.py and fix bug in python/cuda_parallel/tests/test_reduce_api.py * Add `test_device_reduce_dtype_mismatch()`. Capture `dtype`s only in ctor (not entire arrays). * Undo change in .gitignore * Move `min_op()` back into `test_device_reduce_success()` to unbreak sphinx documentation. Also fix existing typo. * Move `test_device_reduce_dtype_mismatch()` from test_reduce_api.py to test_reduce.py * Add TODO POINTER vs ITERATOR --------- Co-authored-by: Michael Schellenberger Costa --- .../cuda/parallel/experimental/__init__.py | 13 ++++++++++++- python/cuda_parallel/tests/test_reduce.py | 17 +++++++++++++++++ python/cuda_parallel/tests/test_reduce_api.py | 8 ++++---- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py index 4a16fc1b67a..0fa2d09df11 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py +++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py @@ -184,8 +184,16 @@ class _CCCLDeviceReduceBuildResult(ctypes.Structure): ("reduction_kernel", ctypes.c_void_p)] +def _dtype_validation(dt1, dt2): + if dt1 != dt2: + raise TypeError(f"dtype mismatch: __init__={dt1}, __call__={dt2}") + + class _Reduce: def __init__(self, d_in, d_out, op, init): + self._ctor_d_in_dtype = d_in.dtype + self._ctor_d_out_dtype = d_out.dtype + self._ctor_init_dtype = init.dtype cc_major, cc_minor = cuda.get_current_device().compute_capability cub_path, thrust_path, libcudacxx_path, cuda_include_path = _get_paths() bindings = _get_bindings() @@ -212,7 +220,10 @@ def __init__(self, d_in, d_out, op, init): raise ValueError('Error building reduce') def __call__(self, temp_storage, d_in, d_out, init): - # TODO Assert that types match the ones used in the constructor + # TODO validate POINTER vs ITERATOR when iterator support is added + _dtype_validation(self._ctor_d_in_dtype, d_in.dtype) + _dtype_validation(self._ctor_d_out_dtype, d_out.dtype) + _dtype_validation(self._ctor_init_dtype, init.dtype) bindings = _get_bindings() if temp_storage is None: temp_storage_bytes = ctypes.c_size_t() diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index 9f59f8efcec..78c14b47931 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -66,3 +66,20 @@ def op(a, b): result = d_output.copy_to_host()[0] expected = numpy.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) + + +def test_device_reduce_dtype_mismatch(): + def min_op(a, b): + return a if a < b else b + + dtypes = [numpy.int32, numpy.int64] + h_inits = [numpy.array([], dt) for dt in dtypes] + h_inputs = [numpy.array([], dt) for dt in dtypes] + d_outputs = [cuda.device_array(1, dt) for dt in dtypes] + d_inputs = [cuda.to_device(h_inp) for h_inp in h_inputs] + + reduce_into = cudax.reduce_into(d_inputs[0], d_outputs[0], min_op, h_inits[0]) + + for ix in range(3): + with pytest.raises(TypeError, match=r"^dtype 
mismatch: __init__=int32, __call__=int64$"): + reduce_into(None, d_inputs[int(ix == 0)], d_outputs[int(ix == 1)], h_inits[int(ix == 2)]) diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py index 6ed35831218..8c63364559c 100644 --- a/python/cuda_parallel/tests/test_reduce_api.py +++ b/python/cuda_parallel/tests/test_reduce_api.py @@ -13,19 +13,19 @@ def test_device_reduce(): # example-begin reduce-min - def op(a, b): + def min_op(a, b): return a if a < b else b dtype = numpy.int32 h_init = numpy.array([42], dtype) - h_input = numpy.array([8, 6, 7, 5, 3, 0, 9]) + h_input = numpy.array([8, 6, 7, 5, 3, 0, 9], dtype) d_output = cuda.device_array(1, dtype) d_input = cuda.to_device(h_input) # Instantiate reduction for the given operator and initial value - reduce_into = cudax.reduce_into(d_output, d_output, op, h_init) + reduce_into = cudax.reduce_into(d_output, d_output, min_op, h_init) - # Deterrmine temporary device storage requirements + # Determine temporary device storage requirements temp_storage_size = reduce_into(None, d_input, d_output, h_init) # Allocate temporary storage From 583567bc90a3c3df6094f2ad5d64de451fc645c5 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Fri, 4 Oct 2024 12:49:39 -0700 Subject: [PATCH 2/9] avoid gcc optimizer bug by not force inlining part of `thrust::transform` (#2509) --- thrust/thrust/system/cuda/detail/transform.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 9e1d0b2a318..1926fb62473 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -178,8 +178,10 @@ struct binary_transform_f -OutputIt THRUST_FUNCTION unary( +OutputIt _CCCL_HOST_DEVICE inline unary( Policy& policy, InputIt items, OutputIt result, @@ -200,6 +202,8 @@ OutputIt THRUST_FUNCTION unary( return result + num_items; } +// EAN 2024-10-04: when force-inlined, gcc's optimizer will generate bad code +// for this function: template -OutputIt THRUST_FUNCTION binary( +OutputIt _CCCL_HOST_DEVICE inline binary( Policy& policy, InputIt1 items1, InputIt2 items2, From c86cacae9b0e0b189c963466d68fec0fe69c0a88 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Sat, 5 Oct 2024 12:37:19 +0200 Subject: [PATCH 3/9] Cleanup and modularize `` (#2443) --- .../include/cuda/__barrier/aligned_size.h | 45 + .../__barrier/async_contract_fulfillment.h | 35 + libcudacxx/include/cuda/__barrier/barrier.h | 62 + .../cuda/__barrier/barrier_arrive_tx.h | 94 ++ .../cuda/__barrier/barrier_block_scope.h | 465 ++++++ .../cuda/__barrier/barrier_expect_tx.h | 70 + .../cuda/__barrier/barrier_native_handle.h | 37 + .../cuda/__barrier/barrier_thread_scope.h | 57 + libcudacxx/include/cuda/__fwd/barrier.h | 34 + .../cuda/__fwd/barrier_native_handle.h | 38 + libcudacxx/include/cuda/__fwd/pipeline.h | 33 + .../__memcpy_async/completion_mechanism.h | 43 + .../cp_async_bulk_shared_global.h | 56 + .../cuda/__memcpy_async/cp_async_fallback.h | 68 + .../__memcpy_async/cp_async_shared_global.h | 102 ++ .../__memcpy_async/dispatch_memcpy_async.h | 157 ++ .../__memcpy_async/is_local_smem_barrier.h | 45 + .../cuda/__memcpy_async/memcpy_async.h | 166 +++ .../__memcpy_async/memcpy_async_barrier.h | 118 ++ .../cuda/__memcpy_async/memcpy_async_tx.h | 89 ++ .../cuda/__memcpy_async/memcpy_completion.h | 168 +++ .../__memcpy_async/try_get_barrier_handle.h | 54 + libcudacxx/include/cuda/barrier | 16 + 
.../include/cuda/std/__barrier/barrier.h | 228 +++ .../cuda/std/__barrier/empty_completion.h | 33 + .../include/cuda/std/__barrier/poll_tester.h | 80 + libcudacxx/include/cuda/std/__cuda/barrier.h | 1301 ----------------- libcudacxx/include/cuda/std/barrier | 31 +- .../cuda/std/detail/libcxx/include/__config | 4 - .../detail/libcxx/include/__threading_support | 6 - .../cuda/std/detail/libcxx/include/barrier | 459 ------ 31 files changed, 2415 insertions(+), 1779 deletions(-) create mode 100644 libcudacxx/include/cuda/__barrier/aligned_size.h create mode 100644 libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_block_scope.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_expect_tx.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_native_handle.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_thread_scope.h create mode 100644 libcudacxx/include/cuda/__fwd/barrier.h create mode 100644 libcudacxx/include/cuda/__fwd/barrier_native_handle.h create mode 100644 libcudacxx/include/cuda/__fwd/pipeline.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_async.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h create mode 100644 libcudacxx/include/cuda/std/__barrier/barrier.h create mode 100644 libcudacxx/include/cuda/std/__barrier/empty_completion.h create mode 100644 libcudacxx/include/cuda/std/__barrier/poll_tester.h delete mode 100644 libcudacxx/include/cuda/std/__cuda/barrier.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/barrier diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h new file mode 100644 index 00000000000..f0b863af76f --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/aligned_size.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H +#define _CUDA___BARRIER_ALIGNED_SIZE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template <_CUDA_VSTD::size_t _Alignment> +struct aligned_size_t +{ + static constexpr _CUDA_VSTD::size_t align = _Alignment; + _CUDA_VSTD::size_t value; + + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s) + : value(__s) + {} + _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const + { + return value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_ALIGNED_SIZE_H diff --git a/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h b/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h new file mode 100644 index 00000000000..57e54f0b692 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H +#define _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// Type only used for logging purpose +enum async_contract_fulfillment +{ + none, + async +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H diff --git a/libcudacxx/include/cuda/__barrier/barrier.h b/libcudacxx/include/cuda/__barrier/barrier.h new file mode 100644 index 00000000000..87bbff7ba50 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier.h @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_H +#define _CUDA___BARRIER_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> +{ +public: + _CCCL_HIDE_FROM_ABI barrier() = default; + + barrier(const barrier&) = delete; + barrier& operator=(const barrier&) = delete; + + _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected, + _CompletionF __completion = _CompletionF()) + : _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion) + {} + + _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected) + { + _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); + new (__b) barrier(__expected); + } + + _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion) + { + _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); + new (__b) barrier(__expected, __completion); + } +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_BARRIER_H diff --git a/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h b/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h new file mode 100644 index 00000000000..10fe5e1452c --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h @@ -0,0 +1,94 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_BARRIER_ARRIVE_TX_H_ +#define _CUDA_PTX_BARRIER_ARRIVE_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); +_CCCL_NODISCARD _CCCL_DEVICE inline barrier::arrival_token barrier_arrive_tx( + barrier& __b, + _CUDA_VSTD::ptrdiff_t __arrive_count_update, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) +{ + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one."); + _CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1."); + _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + barrier::arrival_token __token = {}; + // On architectures pre-sm90, arrive_tx is not supported. + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. 
+ NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + + auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); + if (__arrive_count_update == 1) { + __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( + _CUDA_VPTX::sem_release, + _CUDA_VPTX::scope_cta, + _CUDA_VPTX::space_shared, + __native_handle, + __transaction_count_update); + } else { + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, + _CUDA_VPTX::scope_cta, + _CUDA_VPTX::space_shared, + __native_handle, + __arrive_count_update); + }), + (__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();)); + return __token; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX_BARRIER_ARRIVE_TX_H_ diff --git a/libcudacxx/include/cuda/__barrier/barrier_block_scope.h b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h new file mode 100644 index 00000000000..e794b7046fa --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h @@ -0,0 +1,465 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_BLOCK_SCOPE_H +#define _CUDA___BARRIER_BARRIER_BLOCK_SCOPE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#if defined(_CCCL_CUDA_COMPILER) +# include +# include +# include +#endif // _CCCL_CUDA_COMPILER +#include +#include +#include +#include +#include +#include +#include + +#include + +#if defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type*) 0)->member)) +#else // ^^^ _CCCL_COMPILER_NVRTC ^^^ / vvv !_CCCL_COMPILER_NVRTC vvv +# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !offsetof(type, member) +#endif // _CCCL_COMPILER_NVRTC + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// Needed for pipeline.arrive_on +struct __block_scope_barrier_base +{}; + +template <> +class barrier : public __block_scope_barrier_base +{ + using __barrier_base = _CUDA_VSTD::__barrier_base<_CUDA_VSTD::__empty_completion, thread_scope_block>; + __barrier_base __barrier; + + _CCCL_DEVICE friend inline _CUDA_VSTD::uint64_t* + device::_LIBCUDACXX_ABI_NAMESPACE::barrier_native_handle(barrier& b); + + template + friend class _CUDA_VSTD::__barrier_poll_tester_phase; + template + friend class _CUDA_VSTD::__barrier_poll_tester_parity; + +public: + using arrival_token = typename __barrier_base::arrival_token; + _CCCL_HIDE_FROM_ABI barrier() = default; + + barrier(const barrier&) = delete; + barrier& operator=(const barrier&) = delete; + + _LIBCUDACXX_HIDE_FROM_ABI barrier(_CUDA_VSTD::ptrdiff_t __expected, + _CUDA_VSTD::__empty_completion __completion = 
_CUDA_VSTD::__empty_completion()) + { + static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier, __barrier), + "fatal error: bad barrier layout"); + init(this, __expected, __completion); + } + + _LIBCUDACXX_HIDE_FROM_ABI ~barrier() + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (__isShared(&__barrier)) { + asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory"); + } else if (__isClusterShared(&__barrier)) { __trap(); }), + NV_PROVIDES_SM_80, + (if (__isShared(&__barrier)) { + asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory"); + })) + } + + _LIBCUDACXX_HIDE_FROM_ABI friend void init( + barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CUDA_VSTD::__empty_completion = _CUDA_VSTD::__empty_completion()) + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (__isShared(&__b->__barrier)) { + asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( + static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) + : "memory"); + } else if (__isClusterShared(&__b->__barrier)) { __trap(); } else { + new (&__b->__barrier) __barrier_base(__expected); + }), + NV_PROVIDES_SM_80, + ( + if (__isShared(&__b->__barrier)) { + asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( + static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) + : "memory"); + } else { new (&__b->__barrier) __barrier_base(__expected); }), + NV_ANY_TARGET, + (new (&__b->__barrier) __barrier_base(__expected);)) + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(_CUDA_VSTD::ptrdiff_t __update = 1) + { + _CCCL_ASSERT(__update >= 0, "Arrival count update must be non-negative."); + arrival_token __token = {}; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (!__isClusterShared(&__barrier)) { return __barrier.arrive(__update); } else if (!__isShared(&__barrier)) { + __trap(); + } + // Cannot use cuda::device::barrier_native_handle here, as it is + // only defined for block-scope barriers. This barrier may be a + // non-block scoped barrier. 
+ auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);), + NV_PROVIDES_SM_80, + ( + if (!__isShared(&__barrier)) { + return __barrier.arrive(__update); + } auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + // Need 2 instructions, can't finish barrier with arrive > 1 + if (__update > 1) { _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } __token = + _CUDA_VPTX::mbarrier_arrive(__bh);), + NV_PROVIDES_SM_70, + ( + if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } + + unsigned int __mask = __activemask(); + unsigned int __activeA = __match_any_sync(__mask, __update); + unsigned int __activeB = __match_any_sync(__mask, reinterpret_cast<_CUDA_VSTD::uintptr_t>(&__barrier)); + unsigned int __active = __activeA & __activeB; + int __inc = __popc(__active) * __update; + + unsigned __laneid; + asm("mov.u32 %0, %%laneid;" + : "=r"(__laneid)); + int __leader = __ffs(__active) - 1; + // All threads in mask synchronize here, establishing cummulativity to the __leader: + __syncwarp(__mask); + if (__leader == static_cast(__laneid)) { + __token = __barrier.arrive(__inc); + } __token = __shfl_sync(__active, __token, __leader);), + NV_IS_HOST, + (__token = __barrier.arrive(__update);)) + return __token; + } + +private: + _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_sm_80(arrival_token __token) const + { + (void) __token; + int32_t __ready = 0; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_80, + (asm volatile("{\n\t" + ".reg .pred p;\n\t" + "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) + : "memory");)) + return __ready; + } + + // Document de drop > uint32_t for __nanosec on public for APIs + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token) const + { + (void) __token; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); + } else if (!__isShared(&__barrier)) { + __trap(); + } asm volatile("{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) + : "memory"); + return __ready;), + NV_PROVIDES_SM_80, + (if (!__isShared(&__barrier)) { + return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); + } return __test_wait_sm_80(__token);), + NV_ANY_TARGET, + (return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token));)) + } + + // Document de drop > uint32_t for __nanosec on public for APIs + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token, _CUDA_VSTD::chrono::nanoseconds __nanosec) const + { + if (__nanosec.count() < 1) + { + return __try_wait(_CUDA_VSTD::move(__token)); + } + + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + int32_t __ready = 0; + if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); + } else if (!__isShared(&__barrier)) { __trap(); } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + _CUDA_VSTD::chrono::nanoseconds __elapsed; + do { + const _CUDA_VSTD::uint32_t __wait_nsec = 
static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), + "l"(__token), + "r"(__wait_nsec) + : "memory"); + __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; + } while (!__ready && (__nanosec > __elapsed)); + return __ready;), + NV_PROVIDES_SM_80, + ( + bool __ready = 0; + if (!__isShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); + } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + do { + __ready = __test_wait_sm_80(__token); + } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); + return __ready;), + NV_ANY_TARGET, + (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), + _CUDA_VSTD::chrono::nanoseconds(__nanosec));)) + } + + _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_parity_sm_80(bool __phase_parity) const + { + (void) __phase_parity; + uint16_t __ready = 0; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_80, + (asm volatile( + "{" + ".reg .pred %%p;" + "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;" + "selp.u16 %0, 1, 0, %%p;" + "}" + : "=h"(__ready) + : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__phase_parity)) + : "memory");)) + return __ready; + } + + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity) const + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); + } else if (!__isShared(&__barrier)) { __trap(); } int32_t __ready = 0; + + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.parity.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)) + :); + + return __ready;), + NV_PROVIDES_SM_80, + (if (!__isShared(&__barrier)) { return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); } + + return __test_wait_parity_sm_80(__phase_parity);), + NV_ANY_TARGET, + (return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity);)) + } + + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity, _CUDA_VSTD::chrono::nanoseconds __nanosec) const + { + if (__nanosec.count() < 1) + { + return __try_wait_parity(__phase_parity); + } + + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + int32_t __ready = 0; + if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); + } else if (!__isShared(&__barrier)) { __trap(); } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + _CUDA_VSTD::chrono::nanoseconds __elapsed; + do { + const _CUDA_VSTD::uint32_t __wait_nsec = static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : 
"=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)), + "r"(__wait_nsec) + : "memory"); + __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; + } while (!__ready && (__nanosec > __elapsed)); + + return __ready;), + NV_PROVIDES_SM_80, + ( + bool __ready = 0; + if (!__isShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); + } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + do { + __ready = __test_wait_parity_sm_80(__phase_parity); + } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); + + return __ready;), + NV_ANY_TARGET, + (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec);)) + } + +public: + _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const + { + _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__phase))); + } + + _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __phase_parity) const + { + _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity)); + } + + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() + { + wait(arrive()); + } + + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (!__isClusterShared(&__barrier)) { return __barrier.arrive_and_drop(); } else if (!__isShared(&__barrier)) { + __trap(); + } + + asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory");), + NV_PROVIDES_SM_80, + ( + // Fallback to slowpath on device + if (!__isShared(&__barrier)) { + __barrier.arrive_and_drop(); + return; + } + + asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory");), + NV_ANY_TARGET, + ( + // Fallback to slowpath on device + __barrier.arrive_and_drop();)) + } + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept + { + return (1 << 20) - 1; + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_for(arrival_token&& __token, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) + { + auto __nanosec = _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); + + return __try_wait(_CUDA_VSTD::move(__token), __nanosec); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_until(arrival_token&& __token, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) + { + return try_wait_for(_CUDA_VSTD::move(__token), (__time - _Clock::now())); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_parity_for(bool __phase_parity, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) + { + auto __nanosec = _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); + + return __try_wait_parity(__phase_parity, __nanosec); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_parity_until(bool __phase_parity, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) + { + return try_wait_parity_for(__phase_parity, (__time - _Clock::now())); + } +}; + 
+_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_BARRIER_BLOCK_SCOPE_H diff --git a/libcudacxx/include/cuda/__barrier/barrier_expect_tx.h b/libcudacxx/include/cuda/__barrier/barrier_expect_tx.h new file mode 100644 index 00000000000..e86b0e2d400 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_expect_tx.h @@ -0,0 +1,70 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_BARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_BARRIER_EXPECT_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); +_CCCL_DEVICE inline void +barrier_expect_tx(barrier& __b, _CUDA_VSTD::ptrdiff_t __transaction_count_update) +{ + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + // On architectures pre-sm90, arrive_tx is not supported. 
+ NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory");), + (__cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX_BARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__barrier/barrier_native_handle.h b/libcudacxx/include/cuda/__barrier/barrier_native_handle.h new file mode 100644 index 00000000000..29879c71edf --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_native_handle.h @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_NATIVE_HANDLE_H +#define _CUDA___BARRIER_BARRIER_NATIVE_HANDLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& __b) +{ + return reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__b.__barrier); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +#endif // _CUDA___BARRIER_BARRIER_NATIVE_HANDLE_H diff --git a/libcudacxx/include/cuda/__barrier/barrier_thread_scope.h b/libcudacxx/include/cuda/__barrier/barrier_thread_scope.h new file mode 100644 index 00000000000..aa87dfa4b94 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_thread_scope.h @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_THREAD_SCOPE_H +#define _CUDA___BARRIER_BARRIER_THREAD_SCOPE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template <> +class barrier : private barrier +{ + using __base = barrier; + +public: + using __base::__base; + + _LIBCUDACXX_HIDE_FROM_ABI friend void + init(barrier* __b, + _CUDA_VSTD::ptrdiff_t __expected, + _CUDA_VSTD::__empty_completion __completion = _CUDA_VSTD::__empty_completion()) + { + init(static_cast<__base*>(__b), __expected, __completion); + } + + using __base::arrive; + using __base::arrive_and_drop; + using __base::arrive_and_wait; + using __base::max; + using __base::wait; +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_BARRIER_THREAD_SCOPE_H diff --git a/libcudacxx/include/cuda/__fwd/barrier.h b/libcudacxx/include/cuda/__fwd/barrier.h new file mode 100644 index 00000000000..c2bc80929f9 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/barrier.h @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_BARRIER_H +#define _CUDA___FWD_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class barrier; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FWD_BARRIER_H diff --git a/libcudacxx/include/cuda/__fwd/barrier_native_handle.h b/libcudacxx/include/cuda/__fwd/barrier_native_handle.h new file mode 100644 index 00000000000..2b024f559ff --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/barrier_native_handle.h @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_BARRIER_NATIVE_HANDLE_H +#define _CUDA___FWD_BARRIER_NATIVE_HANDLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_CCCL_NV_DIAG_SUPPRESS(821) // extern inline function was referenced but not defined + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& __b); + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +_CCCL_NV_DIAG_DEFAULT(821) + +#endif // _CUDA___FWD_BARRIER_NATIVE_HANDLE_H diff --git a/libcudacxx/include/cuda/__fwd/pipeline.h b/libcudacxx/include/cuda/__fwd/pipeline.h new file mode 100644 index 00000000000..02ec295da44 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/pipeline.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_PIPELINE_H +#define _CUDA___FWD_PIPELINE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class pipeline; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FWD_PIPELINE_H diff --git a/libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h b/libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h new file mode 100644 index 00000000000..1564e00a092 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_COMPLETION_MECHANISM_H +#define _CUDA___BARRIER_COMPLETION_MECHANISM_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief __completion_mechanism allows memcpy_async to report back what completion +//! mechanism it used. This is necessary to determine in which way to synchronize +//! the memcpy_async with a sync object (barrier or pipeline). +// +//! In addition, we use this enum to create bit flags so that calling functions +//! 
can specify which completion mechanisms can be used (__sync is always +//! allowed). +enum class __completion_mechanism +{ + __sync = 0, + __mbarrier_complete_tx = 1 << 0, // Use powers of two here to support the + __async_group = 1 << 1, // bit flag use case + __async_bulk_group = 1 << 2, +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_COMPLETION_MECHANISM_H diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h new file mode 100644 index 00000000000..94f11bf76f8 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h @@ -0,0 +1,56 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_BULK_SHARED_GLOBAL_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_BULK_SHARED_GLOBAL_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include + +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); +template +inline __device__ void __cp_async_bulk_shared_global( + const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle) +{ + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + (if (__g.thread_rank() == 0) { + _CUDA_VPTX::cp_async_bulk( + _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __src, __size, __bar_handle); + }), + (__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_BULK_SHARED_GLOBAL_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h new file mode 100644 index 00000000000..1be497829cc --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h @@ -0,0 +1,68 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_FALLBACK_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_FALLBACK_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template <_CUDA_VSTD::size_t _Copy_size> +struct __copy_chunk +{ + _CCCL_ALIGNAS(_Copy_size) char data[_Copy_size]; +}; + +template <_CUDA_VSTD::size_t _Alignment, typename _Group> +inline _CCCL_HOST_DEVICE void +__cp_async_fallback_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) +{ + // Maximal copy size is 16 bytes + constexpr _CUDA_VSTD::size_t __copy_size = (_Alignment > 16) ? 16 : _Alignment; + + using __chunk_t = __copy_chunk<__copy_size>; + + // "Group"-strided loop over memory + const _CUDA_VSTD::size_t __stride = __g.size() * __copy_size; + + // An unroll factor of 64 ought to be enough for anybody. This unroll pragma + // is mainly intended to place an upper bound on loop unrolling. The number + // is more than high enough for the intended use case: an unroll factor of + // 64 allows moving 4 * 64 * 256 = 64kb in one unrolled loop with 256 + // threads (copying ints). On the other hand, in the unfortunate case that + // we have to move 1024 bytes / thread with char width, then we prevent + // fully unrolling the loop to 1024 copy instructions. This prevents the + // compile times from increasing unreasonably, and also has neglibible + // impact on runtime performance. + _LIBCUDACXX_PRAGMA_UNROLL(64) + for (_CUDA_VSTD::size_t __offset = __g.thread_rank() * __copy_size; __offset < __size; __offset += __stride) + { + __chunk_t tmp = *reinterpret_cast(__src + __offset); + *reinterpret_cast<__chunk_t*>(__dest + __offset) = tmp; + } +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_FALLBACK_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h new file mode 100644 index 00000000000..2266d5c96cc --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h @@ -0,0 +1,102 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) + +# include +# include +# include + +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); +template +inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) +{ + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + + // If `if constexpr` is not available, this function gets instantiated even + // if is not called. Do not static_assert in that case. +# if _CCCL_STD_VER >= 2017 + static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, + "cp.async.shared.global requires a copy size of 4, 8, or 16."); +# endif // _CCCL_STD_VER >= 2017 + + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %2;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), + "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), + "n"(_Copy_size) + : "memory");), + (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); +} + +template <> +inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src) +{ + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + // When copying 16 bytes, it is possible to skip L1 cache (.cg). + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), + "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), + "n"(16) + : "memory");), + (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); +} + +template +inline __device__ void +__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) +{ + // If `if constexpr` is not available, this function gets instantiated even + // if is not called. Do not static_assert in that case. +# if _CCCL_STD_VER >= 2017 + static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment"); +# endif // _CCCL_STD_VER >= 2017 + + // Maximal copy size is 16. + constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment; + // We use an int offset here, because we are copying to shared memory, + // which is easily addressable using int. 
+ const int __group_size = __g.size(); + const int __group_rank = __g.thread_rank(); + const int __stride = __group_size * __copy_size; + for (int __offset = __group_rank * __copy_size; __offset < static_cast(__size); __offset += __stride) + { + __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); + } +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h new file mode 100644 index 00000000000..cb8fcb69083 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h @@ -0,0 +1,157 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_DISPATCH_MEMCPY_ASYNC_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_DISPATCH_MEMCPY_ASYNC_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +/*********************************************************************** + * cuda::memcpy_async dispatch + * + * The dispatch mechanism takes all the arguments and dispatches to the + * fastest asynchronous copy mechanism available. + * + * It returns a __completion_mechanism that indicates which completion mechanism + * was used by the copy mechanism. This value can be used by the sync object to + * further synchronize if necessary. 
+ * + ***********************************************************************/ + +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_any_to_any( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions, + _CUDA_VSTD::uint64_t* __bar_handle) +{ + __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); + return __completion_mechanism::__sync; +} + +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions, + _CUDA_VSTD::uint64_t* __bar_handle) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); + (void) __can_use_complete_tx; + _CCCL_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), + "Pass non-null bar_handle if and only if can_use_complete_tx."); + _CCCL_IF_CONSTEXPR (_Align >= 16) { + if (__can_use_complete_tx && __isShared(__bar_handle)) + { + __cp_async_bulk_shared_global(__group, __dest_char, __src_char, __size, __bar_handle); + return __completion_mechanism::__mbarrier_complete_tx; + } + } + // Fallthrough to SM 80.. + )); +#endif // __cccl_ptx_isa >= 800 + + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (_CCCL_IF_CONSTEXPR (_Align >= 4) { + const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); + if (__can_use_async_group) + { + __cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size); + return __completion_mechanism::__async_group; + } + } + // Fallthrough.. + )); + + __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); + return __completion_mechanism::__sync; +} + +// __dispatch_memcpy_async is the internal entry point for dispatching to the correct memcpy_async implementation. +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions, + _CUDA_VSTD::uint64_t* __bar_handle) +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, + ( + // Dispatch based on direction of the copy: global to shared, shared to + // global, etc. + + // CUDA compilers <= 12.2 may not propagate assumptions about the state space + // of pointers correctly. Therefore, we + // 1) put the code for each copy direction in a separate function, and + // 2) make sure none of the code paths can reach each other by "falling through". + // + // See nvbug 4074679 and also PR #478. 
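+      //
+      // Only the global -> shared direction currently has a dedicated fast
+      // path; every other direction takes the synchronous fallback in
+      // __dispatch_memcpy_async_any_to_any.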
+ if (__isGlobal(__src_char) && __isShared(__dest_char)) { + return __dispatch_memcpy_async_global_to_shared<_Align>( + __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); + } else { + return __dispatch_memcpy_async_any_to_any<_Align>( + __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); + }), + ( + // Host code path: + if (__group.thread_rank() == 0) { + memcpy(__dest_char, __src_char, __size); + } return __completion_mechanism::__sync;)); +} + +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions) +{ + _CCCL_ASSERT(!(__allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx)), + "Cannot allow mbarrier_complete_tx completion mechanism when not passing a barrier. "); + return __dispatch_memcpy_async<_Align>(__group, __dest_char, __src_char, __size, __allowed_completions, nullptr); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA_PTX__MEMCPY_ASYNC_DISPATCH_MEMCPY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h b/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h new file mode 100644 index 00000000000..c130d8c6736 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_IS_LOCAL_SMEM_BARRIER_H +#define _CUDA___BARRIER_IS_LOCAL_SMEM_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief __is_local_smem_barrier returns true if barrier is (1) block-scoped and (2) located in shared memory. +template +_LIBCUDACXX_HIDE_FROM_ABI bool __is_local_smem_barrier(barrier<_Sco, _CompF>& __barrier) +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return _Is_mbarrier && __isShared(&__barrier);), (return false;)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_IS_LOCAL_SMEM_BARRIER_H diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h new file mode 100644 index 00000000000..3dc74bc5d96 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h @@ -0,0 +1,166 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +/*********************************************************************** + * memcpy_async code: + * + * A call to cuda::memcpy_async(dest, src, size, barrier) can dispatch to any of + * these PTX instructions: + * + * 1. normal synchronous copy (fallback) + * 2. cp.async: shared <- global + * 3. cp.async.bulk: shared <- global + * 4. TODO: cp.async.bulk: global <- shared + * 5. TODO: cp.async.bulk: cluster <- shared + * + * Which of these options is chosen, depends on: + * + * 1. The alignment of dest, src, and size; + * 2. The direction of the copy + * 3. The current compute capability + * 4. The requested completion mechanism + * + * PTX has 3 asynchronous completion mechanisms: + * + * 1. Async group - local to a thread. Used by cp.async + * 2. Bulk async group - local to a thread. Used by cp.async.bulk (shared -> global) + * 3. mbarrier::complete_tx - shared memory barier. Used by cp.async.bulk (other directions) + * + * The code is organized as follows: + * + * 1. Asynchronous copy mechanisms that wrap the PTX instructions + * + * 2. Device memcpy_async implementation per copy direction (global to shared, + * shared to global, etc). Dispatches to fastest mechanism based on requested + * completion mechanism(s), pointer alignment, and architecture. + * + * 3. Host and device memcpy_async implementations. Host implementation is + * basically a memcpy wrapper; device implementation dispatches based on the + * direction of the copy. + * + * 4. __memcpy_async_barrier: + * a) Sets the allowed completion mechanisms based on the barrier location + * b) Calls the host or device memcpy_async implementation + * c) If necessary, synchronizes with the barrier based on the returned + * completion mechanism. + * + * 5. The public memcpy_async function overloads. Call into + * __memcpy_async_barrier. + * + ***********************************************************************/ + +/*********************************************************************** + * Asynchronous copy mechanisms: + * + * 1. cp.async.bulk: shared <- global + * 2. TODO: cp.async.bulk: cluster <- shared + * 3. TODO: cp.async.bulk: global <- shared + * 4. cp.async: shared <- global + * 5. 
normal synchronous copy (fallback) + ***********************************************************************/ + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + _Tp* __destination, + _Tp const* __source, + aligned_size_t<_Alignment> __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment +memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + _Tp* __destination, + _Tp const* __source, + _CUDA_VSTD::size_t __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + void* __destination, + void const* __source, + _CUDA_VSTD::size_t __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + void* __destination, + void const* __source, + aligned_size_t<_Alignment> __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment +memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier( + __single_thread_group{}, + reinterpret_cast(__destination), + reinterpret_cast(__source), + __size, + __barrier); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h new file mode 100644 index 00000000000..ed9c68ad5a3 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h @@ -0,0 +1,118 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_BARRIER_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_BARRIER_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +/*********************************************************************** + * cuda::memcpy_async dispatch helper functions + * + * - __get_size_align struct to determine the alignment from a size type. + ***********************************************************************/ + +// The __get_size_align struct provides a way to query the guaranteed +// "alignment" of a provided size. In this case, an n-byte aligned size means +// that the size is a multiple of n. +// +// Use as follows: +// static_assert(__get_size_align::align == 1) +// static_assert(__get_size_align>::align == n) + +// Default impl: always returns 1. +template +struct __get_size_align +{ + static constexpr int align = 1; +}; + +// aligned_size_t overload: return n. +template +struct __get_size_align> +{ + static constexpr int align = T::align; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct __single_thread_group +{ + _LIBCUDACXX_HIDE_FROM_ABI void sync() const {} + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t size() const + { + return 1; + }; + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t thread_rank() const + { + return 0; + }; +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier( + _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) +{ + static_assert(_CCCL_TRAIT(_CUDA_VSTD::is_trivially_copyable, _Tp), "memcpy_async requires a trivially copyable type"); + + // 1. Determine which completion mechanisms can be used with the current + // barrier. A local shared memory barrier, i.e., block-scope barrier in local + // shared memory, supports the mbarrier_complete_tx mechanism in addition to + // the async group mechanism. + _CUDA_VSTD::uint32_t __allowed_completions = + __is_local_smem_barrier(__barrier) + ? (_CUDA_VSTD::uint32_t(__completion_mechanism::__async_group) + | _CUDA_VSTD::uint32_t(__completion_mechanism::__mbarrier_complete_tx)) + : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); + + // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. + constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; + constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); + // Cast to char pointers. We don't need the type for alignment anymore and + // erasing the types reduces the number of instantiations of down-stream + // functions. + char* __dest_char = reinterpret_cast(__destination); + char const* __src_char = reinterpret_cast(__source); + + // 2. Issue actual copy instructions. + auto __bh = __try_get_barrier_handle(__barrier); + auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions, __bh); + + // 3. 
Synchronize barrier with copy instructions. + return __memcpy_completion_impl::__defer(__cm, __group, __size, __barrier); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_BARRIER_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h new file mode 100644 index 00000000000..5f242b8cf1c --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h @@ -0,0 +1,89 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_TX_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( + _Tp* __dest, + const _Tp* __src, + ::cuda::aligned_size_t<_Alignment> __size, + ::cuda::barrier<::cuda::thread_scope_block>& __b) +{ + // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are + // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with + // memcpy_async when compiling with GCC 4.8. + // FIXME: remove the #if once GCC 4.8 is no longer supported. +# if !defined(_CCCL_COMPILER_GCC) || _GNUC_VER > 408 + static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type"); +# endif + static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); + + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); + _CCCL_ASSERT(__isGlobal(__src), "src must point to global memory."); + + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + if (__isShared(__dest) && __isGlobal(__src)) { + _CUDA_VPTX::cp_async_bulk( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __src, + static_cast(__size), + barrier_native_handle(__b)); + } else { + // memcpy_async_tx only supports copying from global to shared + // or from shared to remote cluster dsmem. To copy to remote + // dsmem, we need to arrive on a cluster-scoped barrier, which + // is not yet implemented. So we trap in this case as well. 
+ _CCCL_UNREACHABLE(); + }), + (__cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();)); + + return async_contract_fulfillment::async; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_TX_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h new file mode 100644 index 00000000000..9d9ea265da0 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h @@ -0,0 +1,168 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MEMCPY_ASYNC_MEMCPY_COMPLETION_H +#define _CUDA___MEMCPY_ASYNC_MEMCPY_COMPLETION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_CCCL_CUDA_COMPILER) +# include +# include +#endif // _CCCL_CUDA_COMPILER + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// This struct contains functions to defer the completion of a barrier phase +// or pipeline stage until a specific memcpy_async operation *initiated by +// this thread* has completed. + +// The user is still responsible for arriving and waiting on (or otherwise +// synchronizing with) the barrier or pipeline barrier to see the results of +// copies from other threads participating in the synchronization object. +struct __memcpy_completion_impl +{ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment + __defer(__completion_mechanism __cm, + _Group const& __group, + _CUDA_VSTD::size_t __size, + barrier<::cuda::thread_scope_block>& __barrier) + { + // In principle, this is the overload for shared memory barriers. However, a + // block-scope barrier may also be located in global memory. Therefore, we + // check if the barrier is a non-smem barrier and handle that separately. + if (!__is_local_smem_barrier(__barrier)) + { + return __defer_non_smem_barrier(__cm, __group, __size, __barrier); + } + + switch (__cm) + { + case __completion_mechanism::__async_group: + // Pre-SM80, the async_group mechanism is not available. + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // Non-Blocking: unbalance barrier by 1, barrier will be + // rebalanced when all thread-local cp.async instructions + // have completed writing to shared memory. + _CUDA_VSTD::uint64_t* __bh = __try_get_barrier_handle(__barrier); + + asm volatile("cp.async.mbarrier.arrive.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(__bh))) + : "memory");)); + return async_contract_fulfillment::async; + case __completion_mechanism::__async_bulk_group: + // This completion mechanism should not be used with a shared + // memory barrier. 
Or at least, we do not currently envision + // bulk group to be used with shared memory barriers. + _CCCL_UNREACHABLE(); + case __completion_mechanism::__mbarrier_complete_tx: +#if __cccl_ptx_isa >= 800 + // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // Only perform the expect_tx operation with the leader thread + if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); })); +#endif // __cccl_ptx_isa >= 800 + return async_contract_fulfillment::async; + case __completion_mechanism::__sync: + // sync: In this case, we do not need to do anything. The user will have + // to issue `bar.arrive_wait();` to see the effect of the transaction. + return async_contract_fulfillment::none; + default: + // Get rid of "control reaches end of non-void function": + _CCCL_UNREACHABLE(); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer( + __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) + { + return __defer_non_smem_barrier(__cm, __group, __size, __barrier); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer_non_smem_barrier( + __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) + { + // Overload for non-smem barriers. + switch (__cm) + { + case __completion_mechanism::__async_group: + // Pre-SM80, the async_group mechanism is not available. + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // Blocking: wait for all thread-local cp.async instructions to have + // completed writing to shared memory. + asm volatile("cp.async.wait_all;" :: + : "memory");)); + return async_contract_fulfillment::async; + case __completion_mechanism::__mbarrier_complete_tx: + // Non-smem barriers do not have an mbarrier_complete_tx mechanism.. + _CCCL_UNREACHABLE(); + case __completion_mechanism::__async_bulk_group: + // This completion mechanism is currently not expected to be used with barriers. + _CCCL_UNREACHABLE(); + case __completion_mechanism::__sync: + // sync: In this case, we do not need to do anything. 
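+        // The copy was already performed synchronously, so there is nothing
+        // left to defer to the barrier.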
+ return async_contract_fulfillment::none; + default: + // Get rid of "control reaches end of non-void function": + _CCCL_UNREACHABLE(); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment + __defer(__completion_mechanism __cm, _Group const&, _CUDA_VSTD::size_t, pipeline<_Sco>&) + { + switch (__cm) + { + case __completion_mechanism::__async_group: + return async_contract_fulfillment::async; + case __completion_mechanism::__async_bulk_group: + return async_contract_fulfillment::async; + case __completion_mechanism::__mbarrier_complete_tx: + return async_contract_fulfillment::async; + case __completion_mechanism::__sync: + return async_contract_fulfillment::none; + default: + // Get rid of "control reaches end of non-void function": + _CCCL_UNREACHABLE(); + } + } +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___MEMCPY_ASYNC_MEMCPY_COMPLETION_H diff --git a/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h b/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h new file mode 100644 index 00000000000..d2207faf91d --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_TRY_GET_BARRIER_HANDLE_H +#define _CUDA___BARRIER_TRY_GET_BARRIER_HANDLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief __try_get_barrier_handle returns barrier handle of block-scoped barriers and a nullptr otherwise. +template +_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* __try_get_barrier_handle(barrier<_Sco, _CompF>& __barrier) +{ + return nullptr; +} + +template <> +_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* +__try_get_barrier_handle<::cuda::thread_scope_block, _CUDA_VSTD::__empty_completion>( + barrier& __barrier) +{ + (void) __barrier; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, (return ::cuda::device::barrier_native_handle(__barrier);), NV_ANY_TARGET, (return nullptr;)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_TRY_GET_BARRIER_HANDLE_H diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index d10befdad26..0d65d4bf344 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -21,6 +21,22 @@ # pragma system_header #endif // no system header +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +# error "CUDA synchronization primitives are only supported for sm_70 and up." 
+#endif // __CUDA_ARCH__ < 700 + +#ifdef _LIBCUDACXX_HAS_NO_THREADS +# error is not supported on this single threaded system +#endif // _LIBCUDACXX_HAS_NO_THREADS + +#include +#include +#include +#include +#include +#include +#include +#include #include #include diff --git a/libcudacxx/include/cuda/std/__barrier/barrier.h b/libcudacxx/include/cuda/std/__barrier/barrier.h new file mode 100644 index 00000000000..5956a49d24e --- /dev/null +++ b/libcudacxx/include/cuda/std/__barrier/barrier.h @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___BARRIER_BARRIER_H +#define __LIBCUDACXX___BARRIER_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#if _LIBCUDACXX_CUDA_ABI_VERSION < 3 +# define _LIBCUDACXX_BARRIER_ALIGNMENTS alignas(64) +#else // ^^^ _LIBCUDACXX_CUDA_ABI_VERSION < 3 ^^^ / vvv _LIBCUDACXX_CUDA_ABI_VERSION >= 3 vvv +# define _LIBCUDACXX_BARRIER_ALIGNMENTS +#endif // _LIBCUDACXX_CUDA_ABI_VERSION >= 3 + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class __barrier_base +{ + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __expected, __arrived; + _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase; + +public: + using arrival_token = bool; + +private: + template + friend class __barrier_poll_tester_phase; + template + friend class __barrier_poll_tester_parity; + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const + { + return __phase.load(memory_order_acquire) != __old; + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const + { + return __try_wait(__parity); + } + +public: + _CCCL_HIDE_FROM_ABI __barrier_base() = default; + + _LIBCUDACXX_HIDE_FROM_ABI __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) + : __expected(__expected) + , __arrived(__expected) + , __completion(__completion) + , __phase(false) + {} + + _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; + + __barrier_base(__barrier_base const&) = delete; + __barrier_base& operator=(__barrier_base const&) = delete; + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) + { + auto const __old_phase = __phase.load(memory_order_relaxed); + auto const __result = __arrived.fetch_sub(__update, memory_order_acq_rel) - __update; + auto const __new_expected = __expected.load(memory_order_relaxed); + + _CCCL_ASSERT(__result >= 0, ""); + + if (0 == __result) + { + __completion(); + __arrived.store(__new_expected, memory_order_relaxed); + 
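+      // Flip the phase and notify all waiters; wait() polls __phase, so the
+      // release store below is what publishes completion of this phase.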
__phase.store(!__old_phase, memory_order_release); + __atomic_notify_all(&__phase.__a, __scope_to_tag<_Sco>{}); + } + return __old_phase; + } + _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __old_phase) const + { + __phase.wait(__old_phase, memory_order_acquire); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() + { + wait(arrive()); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() + { + __expected.fetch_sub(1, memory_order_relaxed); + (void) arrive(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept + { + return numeric_limits::max(); + } +}; + +template +class __barrier_base<__empty_completion, _Sco> +{ + static constexpr uint64_t __expected_unit = 1ull; + static constexpr uint64_t __arrived_unit = 1ull << 32; + static constexpr uint64_t __expected_mask = __arrived_unit - 1; + static constexpr uint64_t __phase_bit = 1ull << 63; + static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; + + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase_arrived_expected; + +public: + using arrival_token = uint64_t; + +private: + template + friend class __barrier_poll_tester_phase; + template + friend class __barrier_poll_tester_parity; + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); + + static _LIBCUDACXX_HIDE_FROM_ABI constexpr uint64_t __init(ptrdiff_t __count) noexcept + { +#if _CCCL_STD_VER >= 2014 + // This debug assert is not supported in C++11 due to resulting in a + // multi-statement constexpr function. + _CCCL_ASSERT(__count >= 0, "Count must be non-negative."); +#endif // _CCCL_STD_VER >= 2014 + return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_phase(uint64_t __phase) const + { + uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); + return ((__current & __phase_bit) != __phase); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const + { + return __try_wait_phase(__old & __phase_bit); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const + { + return __try_wait_phase(__parity ? 
__phase_bit : 0); + } + +public: + _CCCL_HIDE_FROM_ABI __barrier_base() = default; + + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 + __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) + : __phase_arrived_expected(__init(__count)) + { + _CCCL_ASSERT(__count >= 0, ""); + } + + _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; + + __barrier_base(__barrier_base const&) = delete; + __barrier_base& operator=(__barrier_base const&) = delete; + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) + { + auto const __inc = __arrived_unit * __update; + auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); + if ((__old ^ (__old + __inc)) & __phase_bit) + { + __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); + __phase_arrived_expected.notify_all(); + } + return __old & __phase_bit; + } + _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const + { + __libcpp_thread_poll_with_backoff(__barrier_poll_tester_phase<__barrier_base>(this, _CUDA_VSTD::move(__phase))); + } + _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __parity) const + { + __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() + { + wait(arrive()); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() + { + __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); + (void) arrive(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept + { + return numeric_limits::max(); + } +}; + +template +class barrier : public __barrier_base<_CompletionF> +{ +public: + _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) + : __barrier_base<_CompletionF>(__count, __completion) + {} +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___BARRIER_BARRIER_H diff --git a/libcudacxx/include/cuda/std/__barrier/empty_completion.h b/libcudacxx/include/cuda/std/__barrier/empty_completion.h new file mode 100644 index 00000000000..7205748ccc7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__barrier/empty_completion.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___BARRIER_EMPTY_COMPLETION_H +#define __LIBCUDACXX___BARRIER_EMPTY_COMPLETION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +struct __empty_completion +{ + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void operator()() noexcept {} +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___BARRIER_EMPTY_COMPLETION_H diff --git a/libcudacxx/include/cuda/std/__barrier/poll_tester.h b/libcudacxx/include/cuda/std/__barrier/poll_tester.h new file mode 100644 index 00000000000..6bcdb17e9ea --- /dev/null +++ b/libcudacxx/include/cuda/std/__barrier/poll_tester.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___BARRIER_POLL_TESTER_H +#define __LIBCUDACXX___BARRIER_POLL_TESTER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class __barrier_poll_tester_phase +{ + _Barrier const* __this; + typename _Barrier::arrival_token __phase; + +public: + _LIBCUDACXX_HIDE_FROM_ABI + __barrier_poll_tester_phase(_Barrier const* __this_, typename _Barrier::arrival_token&& __phase_) + : __this(__this_) + , __phase(_CUDA_VSTD::move(__phase_)) + {} + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const + { + return __this->__try_wait(__phase); + } +}; + +template +class __barrier_poll_tester_parity +{ + _Barrier const* __this; + bool __parity; + +public: + _LIBCUDACXX_HIDE_FROM_ABI __barrier_poll_tester_parity(_Barrier const* __this_, bool __parity_) + : __this(__this_) + , __parity(__parity_) + {} + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const + { + return __this->__try_wait_parity(__parity); + } +}; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool +__call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase) +{ + return __b.__try_wait(_CUDA_VSTD::move(__phase)); +} + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __call_try_wait_parity(const _Barrier& __b, bool __parity) +{ + return __b.__try_wait_parity(__parity); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___BARRIER_POLL_TESTER_H diff --git a/libcudacxx/include/cuda/std/__cuda/barrier.h b/libcudacxx/include/cuda/std/__cuda/barrier.h deleted file mode 100644 index 5f77bec44cd..00000000000 --- a/libcudacxx/include/cuda/std/__cuda/barrier.h +++ /dev/null @@ -1,1301 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX___CUDA_BARRIER_H -#define _LIBCUDACXX___CUDA_BARRIER_H - -#include - -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 -# error "CUDA synchronization primitives are only supported for sm_70 and up." -#endif - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include // _CUDA_VSTD::void_t - -#if defined(_CCCL_CUDA_COMPILER) -# include // cuda::ptx::* -#endif // _CCCL_CUDA_COMPILER - -#if defined(_CCCL_COMPILER_NVRTC) -# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type*) 0)->member)) -#else -# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !offsetof(type, member) -#endif - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -// foward declaration required for memcpy_async, pipeline "sync" defined here -template -class pipeline; - -template <_CUDA_VSTD::size_t _Alignment> -struct aligned_size_t -{ - static constexpr _CUDA_VSTD::size_t align = _Alignment; - _CUDA_VSTD::size_t value; - _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s) - : value(__s) - {} - _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const - { - return value; - } -}; - -// Type only used for logging purpose -enum async_contract_fulfillment -{ - none, - async -}; - -// __completion_mechanism allows memcpy_async to report back what completion -// mechanism it used. This is necessary to determine in which way to synchronize -// the memcpy_async with a sync object (barrier or pipeline). -// -// In addition, we use this enum to create bit flags so that calling functions -// can specify which completion mechanisms can be used (__sync is always -// allowed). 
-enum class __completion_mechanism -{ - __sync = 0, - __mbarrier_complete_tx = 1 << 0, // Use powers of two here to support the - __async_group = 1 << 1, // bit flag use case - __async_bulk_group = 1 << 2, -}; - -template -class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> -{ -public: - _CCCL_HIDE_FROM_ABI barrier() = default; - - barrier(const barrier&) = delete; - barrier& operator=(const barrier&) = delete; - - _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected, - _CompletionF __completion = _CompletionF()) - : _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion) - {} - - _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected) - { - _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); - new (__b) barrier(__expected); - } - - _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion) - { - _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); - new (__b) barrier(__expected, __completion); - } -}; - -struct __block_scope_barrier_base -{}; - -_LIBCUDACXX_END_NAMESPACE_CUDA - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - -_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& b); - -_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -template <> -class barrier : public __block_scope_barrier_base -{ - using __barrier_base = _CUDA_VSTD::__barrier_base<_CUDA_VSTD::__empty_completion, thread_scope_block>; - __barrier_base __barrier; - - _CCCL_DEVICE friend inline _CUDA_VSTD::uint64_t* - device::_LIBCUDACXX_ABI_NAMESPACE::barrier_native_handle(barrier& b); - - template - friend class _CUDA_VSTD::__barrier_poll_tester_phase; - template - friend class _CUDA_VSTD::__barrier_poll_tester_parity; - -public: - using arrival_token = typename __barrier_base::arrival_token; - _CCCL_HIDE_FROM_ABI barrier() = default; - - barrier(const barrier&) = delete; - barrier& operator=(const barrier&) = delete; - - _LIBCUDACXX_HIDE_FROM_ABI barrier(_CUDA_VSTD::ptrdiff_t __expected, - _CUDA_VSTD::__empty_completion __completion = _CUDA_VSTD::__empty_completion()) - { - static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier, __barrier), - "fatal error: bad barrier layout"); - init(this, __expected, __completion); - } - - _LIBCUDACXX_HIDE_FROM_ABI ~barrier() - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (__isShared(&__barrier)) { - asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory"); - } else if (__isClusterShared(&__barrier)) { __trap(); }), - NV_PROVIDES_SM_80, - (if (__isShared(&__barrier)) { - asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory"); - })) - } - - _LIBCUDACXX_HIDE_FROM_ABI friend void init( - barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CUDA_VSTD::__empty_completion = _CUDA_VSTD::__empty_completion()) - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (__isShared(&__b->__barrier)) { - asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( - static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) - : "memory"); - } else if (__isClusterShared(&__b->__barrier)) { __trap(); } else { - new (&__b->__barrier) __barrier_base(__expected); - }), - NV_PROVIDES_SM_80, - ( - if 
(__isShared(&__b->__barrier)) { - asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( - static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) - : "memory"); - } else { new (&__b->__barrier) __barrier_base(__expected); }), - NV_ANY_TARGET, - (new (&__b->__barrier) __barrier_base(__expected);)) - } - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(_CUDA_VSTD::ptrdiff_t __update = 1) - { - _CCCL_ASSERT(__update >= 0, "Arrival count update must be non-negative."); - arrival_token __token = {}; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (!__isClusterShared(&__barrier)) { return __barrier.arrive(__update); } else if (!__isShared(&__barrier)) { - __trap(); - } - // Cannot use cuda::device::barrier_native_handle here, as it is - // only defined for block-scope barriers. This barrier may be a - // non-block scoped barrier. - auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); - __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);), - NV_PROVIDES_SM_80, - ( - if (!__isShared(&__barrier)) { - return __barrier.arrive(__update); - } auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); - // Need 2 instructions, can't finish barrier with arrive > 1 - if (__update > 1) { _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } __token = - _CUDA_VPTX::mbarrier_arrive(__bh);), - NV_IS_DEVICE, - ( - if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } - - unsigned int __mask = __activemask(); - unsigned int __activeA = __match_any_sync(__mask, __update); - unsigned int __activeB = __match_any_sync(__mask, reinterpret_cast<_CUDA_VSTD::uintptr_t>(&__barrier)); - unsigned int __active = __activeA & __activeB; - int __inc = __popc(__active) * __update; - - unsigned __laneid; - asm("mov.u32 %0, %%laneid;" - : "=r"(__laneid)); - int __leader = __ffs(__active) - 1; - // All threads in mask synchronize here, establishing cummulativity to the __leader: - __syncwarp(__mask); - if (__leader == static_cast(__laneid)) { - __token = __barrier.arrive(__inc); - } __token = __shfl_sync(__active, __token, __leader);), - NV_IS_HOST, - (__token = __barrier.arrive(__update);)) - return __token; - } - -private: - _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_sm_80(arrival_token __token) const - { - (void) __token; - int32_t __ready = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_80, - (asm volatile("{\n\t" - ".reg .pred p;\n\t" - "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) - : "memory");)) - return __ready; - } - - // Document de drop > uint32_t for __nanosec on public for APIs - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token) const - { - (void) __token; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); - } else if (!__isShared(&__barrier)) { - __trap(); - } asm volatile("{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) - : "memory"); - return __ready;), - NV_PROVIDES_SM_80, - (if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); - } return 
__test_wait_sm_80(__token);), - NV_ANY_TARGET, - (return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token));)) - } - - // Document de drop > uint32_t for __nanosec on public for APIs - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token, _CUDA_VSTD::chrono::nanoseconds __nanosec) const - { - if (__nanosec.count() < 1) - { - return __try_wait(_CUDA_VSTD::move(__token)); - } - - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - int32_t __ready = 0; - if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); - } else if (!__isShared(&__barrier)) { __trap(); } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - _CUDA_VSTD::chrono::nanoseconds __elapsed; - do { - const _CUDA_VSTD::uint32_t __wait_nsec = static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); - asm volatile( - "{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "l"(__token), - "r"(__wait_nsec) - : "memory"); - __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; - } while (!__ready && (__nanosec > __elapsed)); - return __ready;), - NV_PROVIDES_SM_80, - ( - bool __ready = 0; - if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); - } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - do { - __ready = __test_wait_sm_80(__token); - } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); - return __ready;), - NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), - _CUDA_VSTD::chrono::nanoseconds(__nanosec));)) - } - - _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_parity_sm_80(bool __phase_parity) const - { - (void) __phase_parity; - uint16_t __ready = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_80, - (asm volatile( - "{" - ".reg .pred %%p;" - "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;" - "selp.u16 %0, 1, 0, %%p;" - "}" - : "=h"(__ready) - : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__phase_parity)) - : "memory");)) - return __ready; - } - - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity) const - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); - } else if (!__isShared(&__barrier)) { __trap(); } int32_t __ready = 0; - - asm volatile( - "{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.parity.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)) - :); - - return __ready;), - NV_PROVIDES_SM_80, - (if (!__isShared(&__barrier)) { return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); } - - return __test_wait_parity_sm_80(__phase_parity);), - NV_ANY_TARGET, - (return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity);)) - } - - 
_LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity, _CUDA_VSTD::chrono::nanoseconds __nanosec) const - { - if (__nanosec.count() < 1) - { - return __try_wait_parity(__phase_parity); - } - - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - int32_t __ready = 0; - if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); - } else if (!__isShared(&__barrier)) { __trap(); } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - _CUDA_VSTD::chrono::nanoseconds __elapsed; - do { - const _CUDA_VSTD::uint32_t __wait_nsec = static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); - asm volatile( - "{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)), - "r"(__wait_nsec) - : "memory"); - __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; - } while (!__ready && (__nanosec > __elapsed)); - - return __ready;), - NV_PROVIDES_SM_80, - ( - bool __ready = 0; - if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); - } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - do { - __ready = __test_wait_parity_sm_80(__phase_parity); - } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); - - return __ready;), - NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec);)) - } - -public: - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const - { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__phase))); - } - - _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __phase_parity) const - { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity)); - } - - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (!__isClusterShared(&__barrier)) { return __barrier.arrive_and_drop(); } else if (!__isShared(&__barrier)) { - __trap(); - } - - asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory");), - NV_PROVIDES_SM_80, - ( - // Fallback to slowpath on device - if (!__isShared(&__barrier)) { - __barrier.arrive_and_drop(); - return; - } - - asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory");), - NV_ANY_TARGET, - ( - // Fallback to slowpath on device - __barrier.arrive_and_drop();)) - } - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept - { - return (1 << 20) - 1; - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_for(arrival_token&& __token, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) - { - auto __nanosec = 
_CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); - - return __try_wait(_CUDA_VSTD::move(__token), __nanosec); - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_until(arrival_token&& __token, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) - { - return try_wait_for(_CUDA_VSTD::move(__token), (__time - _Clock::now())); - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_parity_for(bool __phase_parity, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) - { - auto __nanosec = _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); - - return __try_wait_parity(__phase_parity, __nanosec); - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_parity_until(bool __phase_parity, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) - { - return try_wait_parity_for(__phase_parity, (__time - _Clock::now())); - } -}; - -_LIBCUDACXX_END_NAMESPACE_CUDA - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - -_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& b) -{ - return reinterpret_cast<_CUDA_VSTD::uint64_t*>(&b.__barrier); -} - -#if defined(_CCCL_CUDA_COMPILER) - -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); -_CCCL_NODISCARD _CCCL_DEVICE inline barrier::arrival_token barrier_arrive_tx( - barrier& __b, - _CUDA_VSTD::ptrdiff_t __arrive_count_update, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) -{ - _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one."); - _CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1."); - _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - barrier::arrival_token __token = {}; - // On architectures pre-sm90, arrive_tx is not supported. - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. 
- NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - - auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); - if (__arrive_count_update == 1) { - __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( - _CUDA_VPTX::sem_release, - _CUDA_VPTX::scope_cta, - _CUDA_VPTX::space_shared, - __native_handle, - __transaction_count_update); - } else { - asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); - __token = _CUDA_VPTX::mbarrier_arrive( - _CUDA_VPTX::sem_release, - _CUDA_VPTX::scope_cta, - _CUDA_VPTX::space_shared, - __native_handle, - __arrive_count_update); - }), - (__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();)); - return __token; -} - -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); -_CCCL_DEVICE inline void -barrier_expect_tx(barrier& __b, _CUDA_VSTD::ptrdiff_t __transaction_count_update) -{ - _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - // On architectures pre-sm90, arrive_tx is not supported. - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory");), - (__cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();)); -} - -extern "C" _CCCL_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( - _Tp* __dest, - const _Tp* __src, - ::cuda::aligned_size_t<_Alignment> __size, - ::cuda::barrier<::cuda::thread_scope_block>& __b) -{ - // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are - // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with - // memcpy_async when compiling with GCC 4.8. - // FIXME: remove the #if once GCC 4.8 is no longer supported. 
-# if !defined(_CCCL_COMPILER_GCC) || _GNUC_VER > 408 - static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type"); -# endif - static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); - - _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); - _CCCL_ASSERT(__isGlobal(__src), "src must point to global memory."); - - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - if (__isShared(__dest) && __isGlobal(__src)) { - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_cluster, - _CUDA_VPTX::space_global, - __dest, - __src, - static_cast(__size), - barrier_native_handle(__b)); - } else { - // memcpy_async_tx only supports copying from global to shared - // or from shared to remote cluster dsmem. To copy to remote - // dsmem, we need to arrive on a cluster-scoped barrier, which - // is not yet implemented. So we trap in this case as well. - _CCCL_UNREACHABLE(); - }), - (__cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();)); - - return async_contract_fulfillment::async; -} -# endif // __cccl_ptx_isa >= 800 -#endif // _CCCL_CUDA_COMPILER - -_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE - -#if defined(_CCCL_CUDA_COMPILER) - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -template <> -class barrier : private barrier -{ - using __base = barrier; - -public: - using __base::__base; - - _LIBCUDACXX_HIDE_FROM_ABI friend void - init(barrier* __b, - _CUDA_VSTD::ptrdiff_t __expected, - _CUDA_VSTD::__empty_completion __completion = _CUDA_VSTD::__empty_completion()) - { - init(static_cast<__base*>(__b), __expected, __completion); - } - - using __base::arrive; - using __base::arrive_and_drop; - using __base::arrive_and_wait; - using __base::max; - using __base::wait; -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __unused(_Ty...) -{ - return true; -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __unused(_Ty&) -{ - return true; -} - -// __is_local_smem_barrier returns true if barrier is (1) block-scoped and (2) located in shared memory. -template ::value> -_LIBCUDACXX_HIDE_FROM_ABI bool __is_local_smem_barrier(barrier<_Sco, _CompF>& __barrier) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return _Is_mbarrier && __isShared(&__barrier);), (return false;)); -} - -// __try_get_barrier_handle returns barrier handle of block-scoped barriers and a nullptr otherwise. -template -_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* __try_get_barrier_handle(barrier<_Sco, _CompF>& __barrier) -{ - return nullptr; -} - -template <> -_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* -__try_get_barrier_handle<::cuda::thread_scope_block, _CUDA_VSTD::__empty_completion>( - barrier<::cuda::thread_scope_block>& __barrier) -{ - (void) __barrier; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, (return ::cuda::device::barrier_native_handle(__barrier);), NV_ANY_TARGET, (return nullptr;)); -} - -// This struct contains functions to defer the completion of a barrier phase -// or pipeline stage until a specific memcpy_async operation *initiated by -// this thread* has completed. - -// The user is still responsible for arriving and waiting on (or otherwise -// synchronizing with) the barrier or pipeline barrier to see the results of -// copies from other threads participating in the synchronization object. 
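// ---------------------------------------------------------------------------
// Illustration (usage sketch, not part of this header): how the device-side API
// above (barrier_arrive_tx, memcpy_async_tx) is typically driven, assuming
// sm_90 and roughly following the pattern of the libcudacxx documentation.
// The kernel and array names (example_kernel, gmem_x, smem_x) are placeholders.

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

__device__ alignas(16) int gmem_x[2048];

__global__ void example_kernel()
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int smem_x[1024];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    // One thread kicks off the bulk copy and tells the barrier how many bytes to expect.
    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
  }
  else
  {
    token = bar.arrive(1);
  }
  // Every thread must still wait on the barrier to observe the copied data.
  bar.wait(cuda::std::move(token));
  smem_x[threadIdx.x] += 1;
}
// ---------------------------------------------------------------------------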
-struct __memcpy_completion_impl -{ - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment - __defer(__completion_mechanism __cm, - _Group const& __group, - _CUDA_VSTD::size_t __size, - barrier<::cuda::thread_scope_block>& __barrier) - { - // In principle, this is the overload for shared memory barriers. However, a - // block-scope barrier may also be located in global memory. Therefore, we - // check if the barrier is a non-smem barrier and handle that separately. - if (!__is_local_smem_barrier(__barrier)) - { - return __defer_non_smem_barrier(__cm, __group, __size, __barrier); - } - - switch (__cm) - { - case __completion_mechanism::__async_group: - // Pre-SM80, the async_group mechanism is not available. - NV_IF_TARGET( - NV_PROVIDES_SM_80, - ( - // Non-Blocking: unbalance barrier by 1, barrier will be - // rebalanced when all thread-local cp.async instructions - // have completed writing to shared memory. - _CUDA_VSTD::uint64_t* __bh = __try_get_barrier_handle(__barrier); - - asm volatile("cp.async.mbarrier.arrive.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(__bh))) - : "memory");)); - return async_contract_fulfillment::async; - case __completion_mechanism::__async_bulk_group: - // This completion mechanism should not be used with a shared - // memory barrier. Or at least, we do not currently envision - // bulk group to be used with shared memory barriers. - _CCCL_UNREACHABLE(); - case __completion_mechanism::__mbarrier_complete_tx: -# if __cccl_ptx_isa >= 800 - // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // Only perform the expect_tx operation with the leader thread - if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); })); -# endif // __cccl_ptx_isa >= 800 - return async_contract_fulfillment::async; - case __completion_mechanism::__sync: - // sync: In this case, we do not need to do anything. The user will have - // to issue `bar.arrive_wait();` to see the effect of the transaction. - return async_contract_fulfillment::none; - default: - // Get rid of "control reaches end of non-void function": - _CCCL_UNREACHABLE(); - } - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer( - __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) - { - return __defer_non_smem_barrier(__cm, __group, __size, __barrier); - } - - template - _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer_non_smem_barrier( - __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) - { - // Overload for non-smem barriers. - - switch (__cm) - { - case __completion_mechanism::__async_group: - // Pre-SM80, the async_group mechanism is not available. - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // Blocking: wait for all thread-local cp.async instructions to have - // completed writing to shared memory. - asm volatile("cp.async.wait_all;" :: - : "memory");)); - return async_contract_fulfillment::async; - case __completion_mechanism::__mbarrier_complete_tx: - // Non-smem barriers do not have an mbarrier_complete_tx mechanism.. - _CCCL_UNREACHABLE(); - case __completion_mechanism::__async_bulk_group: - // This completion mechanism is currently not expected to be used with barriers. 
- _CCCL_UNREACHABLE(); - case __completion_mechanism::__sync: - // sync: In this case, we do not need to do anything. - return async_contract_fulfillment::none; - default: - // Get rid of "control reaches end of non-void function": - _CCCL_UNREACHABLE(); - } - } - - template - _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment - __defer(__completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, pipeline<_Sco>& __pipeline) - { - // pipeline does not sync on memcpy_async, defeat pipeline purpose otherwise - __unused(__pipeline); - __unused(__size); - __unused(__group); - - switch (__cm) - { - case __completion_mechanism::__async_group: - return async_contract_fulfillment::async; - case __completion_mechanism::__async_bulk_group: - return async_contract_fulfillment::async; - case __completion_mechanism::__mbarrier_complete_tx: - return async_contract_fulfillment::async; - case __completion_mechanism::__sync: - return async_contract_fulfillment::none; - default: - // Get rid of "control reaches end of non-void function": - _CCCL_UNREACHABLE(); - } - } -}; - -/*********************************************************************** - * memcpy_async code: - * - * A call to cuda::memcpy_async(dest, src, size, barrier) can dispatch to any of - * these PTX instructions: - * - * 1. normal synchronous copy (fallback) - * 2. cp.async: shared <- global - * 3. cp.async.bulk: shared <- global - * 4. TODO: cp.async.bulk: global <- shared - * 5. TODO: cp.async.bulk: cluster <- shared - * - * Which of these options is chosen, depends on: - * - * 1. The alignment of dest, src, and size; - * 2. The direction of the copy - * 3. The current compute capability - * 4. The requested completion mechanism - * - * PTX has 3 asynchronous completion mechanisms: - * - * 1. Async group - local to a thread. Used by cp.async - * 2. Bulk async group - local to a thread. Used by cp.async.bulk (shared -> global) - * 3. mbarrier::complete_tx - shared memory barier. Used by cp.async.bulk (other directions) - * - * The code is organized as follows: - * - * 1. Asynchronous copy mechanisms that wrap the PTX instructions - * - * 2. Device memcpy_async implementation per copy direction (global to shared, - * shared to global, etc). Dispatches to fastest mechanism based on requested - * completion mechanism(s), pointer alignment, and architecture. - * - * 3. Host and device memcpy_async implementations. Host implementation is - * basically a memcpy wrapper; device implementation dispatches based on the - * direction of the copy. - * - * 4. __memcpy_async_barrier: - * a) Sets the allowed completion mechanisms based on the barrier location - * b) Calls the host or device memcpy_async implementation - * c) If necessary, synchronizes with the barrier based on the returned - * completion mechanism. - * - * 5. The public memcpy_async function overloads. Call into - * __memcpy_async_barrier. - * - ***********************************************************************/ - -/*********************************************************************** - * Asynchronous copy mechanisms: - * - * 1. cp.async.bulk: shared <- global - * 2. TODO: cp.async.bulk: cluster <- shared - * 3. TODO: cp.async.bulk: global <- shared - * 4. cp.async: shared <- global - * 5. 
normal synchronous copy (fallback) - ***********************************************************************/ - -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); -template -inline __device__ void -__cp_async_bulk_shared_global(const _Group& __g, char* __dest, const char* __src, size_t __size, uint64_t* __bar_handle) -{ - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, - (if (__g.thread_rank() == 0) { - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __src, __size, __bar_handle); - }), - (__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); -template -inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) -{ - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async - - // If `if constexpr` is not available, this function gets instantiated even - // if is not called. Do not static_assert in that case. -# if _CCCL_STD_VER >= 2017 - static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, - "cp.async.shared.global requires a copy size of 4, 8, or 16."); -# endif // _CCCL_STD_VER >= 2017 - - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %2;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), - "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), - "n"(_Copy_size) - : "memory");), - (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); -} - -template <> -inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src) -{ - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async - // When copying 16 bytes, it is possible to skip L1 cache (.cg). - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), - "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), - "n"(16) - : "memory");), - (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); -} - -template -inline __device__ void -__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) -{ - // If `if constexpr` is not available, this function gets instantiated even - // if is not called. Do not static_assert in that case. -# if _CCCL_STD_VER >= 2017 - static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment"); -# endif // _CCCL_STD_VER >= 2017 - - // Maximal copy size is 16. - constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment; - // We use an int offset here, because we are copying to shared memory, - // which is easily addressable using int. 
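// For intuition (example values added for illustration, not from this header):
// the loop below is a plain group-strided copy. With a 128-thread group and
// 16-byte alignment, __copy_size is 16, so on the first iteration thread r
// issues a cp.async for bytes [16*r, 16*r + 16) and then advances by
// __stride = 128 * 16 = 2048 bytes until __offset runs past __size.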
- const int __group_size = __g.size(); - const int __group_rank = __g.thread_rank(); - const int __stride = __group_size * __copy_size; - for (int __offset = __group_rank * __copy_size; __offset < static_cast(__size); __offset += __stride) - { - __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); - } -} - -template -struct __copy_chunk -{ - _CCCL_ALIGNAS(_Copy_size) char data[_Copy_size]; -}; - -template -inline __host__ __device__ void -__cp_async_fallback_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) -{ - // Maximal copy size is 16 bytes - constexpr _CUDA_VSTD::size_t __copy_size = (_Alignment > 16) ? 16 : _Alignment; - using __chunk_t = __copy_chunk<__copy_size>; - - // "Group"-strided loop over memory - const size_t __stride = __g.size() * __copy_size; - - // An unroll factor of 64 ought to be enough for anybody. This unroll pragma - // is mainly intended to place an upper bound on loop unrolling. The number - // is more than high enough for the intended use case: an unroll factor of - // 64 allows moving 4 * 64 * 256 = 64kb in one unrolled loop with 256 - // threads (copying ints). On the other hand, in the unfortunate case that - // we have to move 1024 bytes / thread with char width, then we prevent - // fully unrolling the loop to 1024 copy instructions. This prevents the - // compile times from increasing unreasonably, and also has neglibible - // impact on runtime performance. - _LIBCUDACXX_PRAGMA_UNROLL(64) - for (_CUDA_VSTD::size_t __offset = __g.thread_rank() * __copy_size; __offset < __size; __offset += __stride) - { - __chunk_t tmp = *reinterpret_cast(__src + __offset); - *reinterpret_cast<__chunk_t*>(__dest + __offset) = tmp; - } -} - -/*********************************************************************** - * cuda::memcpy_async dispatch helper functions - * - * - __get_size_align struct to determine the alignment from a size type. - ***********************************************************************/ - -// The __get_size_align struct provides a way to query the guaranteed -// "alignment" of a provided size. In this case, an n-byte aligned size means -// that the size is a multiple of n. -// -// Use as follows: -// static_assert(__get_size_align::align == 1) -// static_assert(__get_size_align>::align == n) - -// Default impl: always returns 1. -template -struct __get_size_align -{ - static constexpr int align = 1; -}; - -// aligned_size_t overload: return n. -template -struct __get_size_align> -{ - static constexpr int align = T::align; -}; - -/*********************************************************************** - * cuda::memcpy_async dispatch - * - * The dispatch mechanism takes all the arguments and dispatches to the - * fastest asynchronous copy mechanism available. - * - * It returns a __completion_mechanism that indicates which completion mechanism - * was used by the copy mechanism. This value can be used by the sync object to - * further synchronize if necessary. 
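 * Illustration (not from this header; `block`, `smem_dest`, `gmem_src`,
 * `num_bytes`, `bar` are placeholder names): besides the pointers, the main
 * user-visible input to this dispatch is the size type. A plain size only
 * lets the dispatch assume alignof(T); wrapping it in cuda::aligned_size_t
 * additionally promises that the pointers and the size are aligned, which is
 * what makes the 16-byte cp.async.cg / cp.async.bulk paths eligible:
 *
 *   // int data, plain size: alignment deduced as alignof(int) == 4
 *   cuda::memcpy_async(block, smem_dest, gmem_src, num_bytes, bar);
 *
 *   // caller guarantees 16-byte alignment of pointers and size
 *   cuda::memcpy_async(block, smem_dest, gmem_src, cuda::aligned_size_t<16>(num_bytes), bar);
 *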
- * - ***********************************************************************/ - -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_any_to_any( - _Group const& __group, - char* __dest_char, - char const* __src_char, - _CUDA_VSTD::size_t __size, - uint32_t __allowed_completions, - uint64_t* __bar_handle) -{ - __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); - return __completion_mechanism::__sync; -} - -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared( - _Group const& __group, - char* __dest_char, - char const* __src_char, - _CUDA_VSTD::size_t __size, - uint32_t __allowed_completions, - uint64_t* __bar_handle) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - (const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); - _LIBCUDACXX_UNUSED_VAR(__can_use_complete_tx); - _CCCL_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), - "Pass non-null bar_handle if and only if can_use_complete_tx."); - _CCCL_IF_CONSTEXPR (_Align >= 16) { - if (__can_use_complete_tx && __isShared(__bar_handle)) - { - __cp_async_bulk_shared_global(__group, __dest_char, __src_char, __size, __bar_handle); - return __completion_mechanism::__mbarrier_complete_tx; - } - } - // Fallthrough to SM 80.. - )); -# endif // __cccl_ptx_isa >= 800 - - NV_IF_TARGET( - NV_PROVIDES_SM_80, - (_CCCL_IF_CONSTEXPR (_Align >= 4) { - const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); - if (__can_use_async_group) - { - __cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size); - return __completion_mechanism::__async_group; - } - } - // Fallthrough.. - )); - - __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); - return __completion_mechanism::__sync; -} - -// __dispatch_memcpy_async is the internal entry point for dispatching to the correct memcpy_async implementation. -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( - _Group const& __group, - char* __dest_char, - char const* __src_char, - size_t __size, - _CUDA_VSTD::uint32_t __allowed_completions, - uint64_t* __bar_handle) -{ - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, - ( - // Dispatch based on direction of the copy: global to shared, shared to - // global, etc. - - // CUDA compilers <= 12.2 may not propagate assumptions about the state space - // of pointers correctly. Therefore, we - // 1) put the code for each copy direction in a separate function, and - // 2) make sure none of the code paths can reach each other by "falling through". - // - // See nvbug 4074679 and also PR #478. 
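// Summary added for illustration: the global-to-shared helper above tries, in
// order, cp.async.bulk (sm_90, 16-byte alignment, shared-memory barrier
// handle), then cp.async (sm_80, at least 4-byte alignment, async-group
// completion), and finally the synchronous element-wise fallback; the
// any-to-any helper always uses the synchronous fallback. The branch below
// picks between them based on the state spaces of the two pointers.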
- if (__isGlobal(__src_char) && __isShared(__dest_char)) { - return __dispatch_memcpy_async_global_to_shared<_Align>( - __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); - } else { - return __dispatch_memcpy_async_any_to_any<_Align>( - __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); - }), - ( - // Host code path: - if (__group.thread_rank() == 0) { - memcpy(__dest_char, __src_char, __size); - } return __completion_mechanism::__sync;)); -} - -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( - _Group const& __group, - char* __dest_char, - char const* __src_char, - _CUDA_VSTD::size_t __size, - _CUDA_VSTD::uint32_t __allowed_completions) -{ - _CCCL_ASSERT(!(__allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx)), - "Cannot allow mbarrier_complete_tx completion mechanism when not passing a barrier. "); - return __dispatch_memcpy_async<_Align>(__group, __dest_char, __src_char, __size, __allowed_completions, nullptr); -} - -//////////////////////////////////////////////////////////////////////////////// - -struct __single_thread_group -{ - _LIBCUDACXX_HIDE_FROM_ABI void sync() const {} - _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t size() const - { - return 1; - }; - _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t thread_rank() const - { - return 0; - }; -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier( - _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) -{ - static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type"); - - // 1. Determine which completion mechanisms can be used with the current - // barrier. A local shared memory barrier, i.e., block-scope barrier in local - // shared memory, supports the mbarrier_complete_tx mechanism in addition to - // the async group mechanism. - _CUDA_VSTD::uint32_t __allowed_completions = - __is_local_smem_barrier(__barrier) - ? (_CUDA_VSTD::uint32_t(__completion_mechanism::__async_group) - | _CUDA_VSTD::uint32_t(__completion_mechanism::__mbarrier_complete_tx)) - : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); - - // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; - constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); - // Cast to char pointers. We don't need the type for alignment anymore and - // erasing the types reduces the number of instantiations of down-stream - // functions. - char* __dest_char = reinterpret_cast(__destination); - char const* __src_char = reinterpret_cast(__source); - - // 2. Issue actual copy instructions. - auto __bh = __try_get_barrier_handle(__barrier); - auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions, __bh); - - // 3. Synchronize barrier with copy instructions. 
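// Illustration (not from this header; block, smem, gmem, bytes and bar are
// placeholder names): the async_contract_fulfillment produced by the __defer
// call below surfaces to users as the return value of cuda::memcpy_async, e.g.
//
//   auto acf = cuda::memcpy_async(block, smem, gmem, cuda::aligned_size_t<16>(bytes), bar);
//   bar.arrive_and_wait(); // required either way; acf only records whether the copy was issued asynchronously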
- return __memcpy_completion_impl::__defer(__cm, __group, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - _Tp* __destination, - _Tp const* __source, - aligned_size_t<_Alignment> __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment -memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - _Tp* __destination, - _Tp const* __source, - _CUDA_VSTD::size_t __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - void* __destination, - void const* __source, - _CUDA_VSTD::size_t __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier( - __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - void* __destination, - void const* __source, - aligned_size_t<_Alignment> __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier( - __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment -memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier( - __single_thread_group{}, - reinterpret_cast(__destination), - reinterpret_cast(__source), - __size, - __barrier); -} - -_LIBCUDACXX_END_NAMESPACE_CUDA - -#endif // _CCCL_CUDA_COMPILER - -#endif // _LIBCUDACXX___CUDA_BARRIER_H diff --git a/libcudacxx/include/cuda/std/barrier b/libcudacxx/include/cuda/std/barrier index 3eb61978768..f5b7f2c07e6 100644 --- a/libcudacxx/include/cuda/std/barrier +++ b/libcudacxx/include/cuda/std/barrier @@ -11,10 +11,6 @@ #ifndef _CUDA_STD_BARRIER #define _CUDA_STD_BARRIER -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 -# error "CUDA synchronization primitives are only supported for sm_70 and up." -#endif - #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ -25,10 +21,27 @@ # pragma system_header #endif // no system header -_CCCL_PUSH_MACROS - -#include - -_CCCL_POP_MACROS +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +# error "CUDA synchronization primitives are only supported for sm_70 and up." +#endif // __CUDA_ARCH__ < 700 + +#ifdef _LIBCUDACXX_HAS_NO_THREADS +# error is not supported on this single threaded system +#endif // _LIBCUDACXX_HAS_NO_THREADS + +#include +#include +#include + +//! 
TODO: Drop cuda only features +#include +#include +#include +#include +#include +#include +#include +#include +#include #endif // _CUDA_STD_BARRIER diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index 69610aae6b7..ee90af58e63 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -538,10 +538,6 @@ typedef __char32_t char32_t; # define _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE # endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -# ifndef _LIBCUDACXX_HAS_NO_TREE_BARRIER -# define _LIBCUDACXX_HAS_NO_TREE_BARRIER -# endif // _LIBCUDACXX_HAS_NO_TREE_BARRIER - # ifndef _LIBCUDACXX_HAS_NO_WCHAR_H # define _LIBCUDACXX_HAS_NO_WCHAR_H # endif // _LIBCUDACXX_HAS_NO_WCHAR_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support b/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support index a56bfa94b12..5240ff7702e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support @@ -685,12 +685,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __libcpp_contention_t* __libcpp_contention_state(void # endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -# if !defined(_LIBCUDACXX_HAS_NO_TREE_BARRIER) && !defined(_LIBCUDACXX_HAS_NO_THREAD_FAVORITE_BARRIER_INDEX) - -_CCCL_VISIBILITY_DEFAULT extern thread_local ptrdiff_t __libcpp_thread_favorite_barrier_index; - -# endif - # ifndef __cuda_std__ class _CCCL_TYPE_VISIBILITY_DEFAULT thread; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier deleted file mode 100644 index d7b3cda99af..00000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier +++ /dev/null @@ -1,459 +0,0 @@ -// -*- C++ -*- -//===--------------------------- barrier ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_BARRIER -#define _LIBCUDACXX_BARRIER - -/* - barrier synopsis - -namespace std -{ - - template - class barrier - { - public: - using arrival_token = see below; - - constexpr explicit barrier(ptrdiff_t phase_count, - CompletionFunction f = CompletionFunction()); - ~barrier(); - - barrier(const barrier&) = delete; - barrier& operator=(const barrier&) = delete; - - [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); - void wait(arrival_token&& arrival) const; - - void arrive_and_wait(); - void arrive_and_drop(); - - private: - CompletionFunction __completion; // exposition only - }; - -} - -*/ - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include - -_CCCL_PUSH_MACROS - -#ifdef _LIBCUDACXX_HAS_NO_THREADS -# error is not supported on this single threaded system -#endif - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -struct __empty_completion -{ - _LIBCUDACXX_HIDE_FROM_ABI void operator()() noexcept {} -}; - -#ifndef _LIBCUDACXX_HAS_NO_TREE_BARRIER - -template -class alignas(64) __barrier_base -{ - ptrdiff_t __expected; - __atomic_impl __expected_adjustment; - _CompletionF __completion; - - using __phase_t = uint8_t; - __atomic_impl<__phase_t, _Sco> __phase; - - struct alignas(64) __state_t - { - struct - { - __atomic_impl<__phase_t, _Sco> __phase = LIBCUDACXX_ATOMIC_VAR_INIT(0); - } __tickets[64]; - }; - ::std::vector<__state_t> __state; - - _LIBCUDACXX_HIDE_FROM_ABI bool __arrive(__phase_t const __old_phase) - { - __phase_t const __half_step = __old_phase + 1, __full_step = __old_phase + 2; -# ifndef _LIBCUDACXX_HAS_NO_THREAD_FAVORITE_BARRIER_INDEX - ptrdiff_t __current = __libcpp_thread_favorite_barrier_index, -# else - ptrdiff_t __current = 0, -# endif - __current_expected = __expected, __last_node = (__current_expected >> 1); - for (size_t __round = 0;; ++__round) - { - _CCCL_ASSERT(__round <= 63, ""); - if (__current_expected == 1) - { - return true; - } - for (;; ++__current) - { -# ifndef _LIBCUDACXX_HAS_NO_THREAD_FAVORITE_BARRIER_INDEX - if (0 == __round) - { - if (__current >= __current_expected) - { - __current = 0; - } - __libcpp_thread_favorite_barrier_index = __current; - } -# endif - _CCCL_ASSERT(__current <= __last_node, ""); - __phase_t expect = __old_phase; - if (__current == __last_node && (__current_expected & 1)) - { - if (__state[__current].__tickets[__round].__phase.compare_exchange_strong( - expect, __full_step, memory_order_acq_rel)) - { - break; // I'm 1 in 1, go to next __round - } - _CCCL_ASSERT(expect == __full_step, ""); - } - else if (__state[__current].__tickets[__round].__phase.compare_exchange_strong( - expect, __half_step, memory_order_acq_rel)) - { - return false; // I'm 1 in 2, done with arrival - } - else if (expect == __half_step) - { - if (__state[__current].__tickets[__round].__phase.compare_exchange_strong( - expect, __full_step, memory_order_acq_rel)) - { - break; // I'm 2 in 2, go to next __round - } - _CCCL_ASSERT(expect == __full_step, ""); - } - _CCCL_ASSERT(__round == 0 && expect == __full_step, ""); - } - __current_expected = (__current_expected >> 1) + (__current_expected & 1); - __current &= ~(1 << __round); - __last_node &= ~(1 
<< __round); - } - } - -public: - using arrival_token = __phase_t; - - _LIBCUDACXX_HIDE_FROM_ABI __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) - : __expected(__expected) - , __expected_adjustment(0) - , __completion(__completion) - , __phase(0) - , __state((__expected + 1) >> 1) - { - _CCCL_ASSERT(__expected >= 0, ""); - } - - _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; - - __barrier_base(__barrier_base const&) = delete; - __barrier_base& operator=(__barrier_base const&) = delete; - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t update = 1) - { - _CCCL_ASSERT(update > 0, ""); - auto __old_phase = __phase.load(memory_order_relaxed); - for (; update; --update) - { - if (__arrive(__old_phase)) - { - __completion(); - __expected += __expected_adjustment.load(memory_order_relaxed); - __expected_adjustment.store(0, memory_order_relaxed); - __phase.store(__old_phase + 2, memory_order_release); - } - } - return __old_phase; - } - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __old_phase) const - { - __libcpp_thread_poll_with_backoff([=]() -> bool { - return __phase.load(memory_order_acquire) != __old_phase; - }); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - __expected_adjustment.fetch_sub(1, memory_order_relaxed); - (void) arrive(); - } -}; - -#else - -# if _LIBCUDACXX_CUDA_ABI_VERSION < 3 -# define _LIBCUDACXX_BARRIER_ALIGNMENTS alignas(64) -# else -# define _LIBCUDACXX_BARRIER_ALIGNMENTS -# endif - -template -class __barrier_poll_tester_phase -{ - _Barrier const* __this; - typename _Barrier::arrival_token __phase; - -public: - _LIBCUDACXX_HIDE_FROM_ABI - __barrier_poll_tester_phase(_Barrier const* __this_, typename _Barrier::arrival_token&& __phase_) - : __this(__this_) - , __phase(_CUDA_VSTD::move(__phase_)) - {} - - _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const - { - return __this->__try_wait(__phase); - } -}; - -template -class __barrier_poll_tester_parity -{ - _Barrier const* __this; - bool __parity; - -public: - _LIBCUDACXX_HIDE_FROM_ABI __barrier_poll_tester_parity(_Barrier const* __this_, bool __parity_) - : __this(__this_) - , __parity(__parity_) - {} - - _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const - { - return __this->__try_wait_parity(__parity); - } -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase) -{ - return __b.__try_wait(_CUDA_VSTD::move(__phase)); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI bool __call_try_wait_parity(const _Barrier& __b, bool __parity) -{ - return __b.__try_wait_parity(__parity); -} - -template -class __barrier_base -{ - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __expected, __arrived; - _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase; - -public: - using arrival_token = bool; - -private: - template - friend class __barrier_poll_tester_phase; - template - friend class __barrier_poll_tester_parity; - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); - - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const - { - return __phase.load(memory_order_acquire) != __old; - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const - { - 
return __try_wait(__parity); - } - -public: - _CCCL_HIDE_FROM_ABI __barrier_base() = default; - - _LIBCUDACXX_HIDE_FROM_ABI __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) - : __expected(__expected) - , __arrived(__expected) - , __completion(__completion) - , __phase(false) - {} - - _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; - - __barrier_base(__barrier_base const&) = delete; - __barrier_base& operator=(__barrier_base const&) = delete; - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) - { - auto const __old_phase = __phase.load(memory_order_relaxed); - auto const __result = __arrived.fetch_sub(__update, memory_order_acq_rel) - __update; - auto const __new_expected = __expected.load(memory_order_relaxed); - - _CCCL_ASSERT(__result >= 0, ""); - - if (0 == __result) - { - __completion(); - __arrived.store(__new_expected, memory_order_relaxed); - __phase.store(!__old_phase, memory_order_release); - __atomic_notify_all(&__phase.__a, __scope_to_tag<_Sco>{}); - } - return __old_phase; - } - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __old_phase) const - { - __phase.wait(__old_phase, memory_order_acquire); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - __expected.fetch_sub(1, memory_order_relaxed); - (void) arrive(); - } - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept - { - return numeric_limits::max(); - } -}; - -template -class __barrier_base<__empty_completion, _Sco> -{ - static constexpr uint64_t __expected_unit = 1ull; - static constexpr uint64_t __arrived_unit = 1ull << 32; - static constexpr uint64_t __expected_mask = __arrived_unit - 1; - static constexpr uint64_t __phase_bit = 1ull << 63; - static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; - - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase_arrived_expected; - -public: - using arrival_token = uint64_t; - -private: - template - friend class __barrier_poll_tester_phase; - template - friend class __barrier_poll_tester_parity; - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); - - static _LIBCUDACXX_HIDE_FROM_ABI constexpr uint64_t __init(ptrdiff_t __count) noexcept - { -# if _CCCL_STD_VER > 2011 - // This debug assert is not supported in C++11 due to resulting in a - // multi-statement constexpr function. - _CCCL_ASSERT(__count >= 0, "Count must be non-negative."); -# endif // _CCCL_STD_VER > 2011 - return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_phase(uint64_t __phase) const - { - uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); - return ((__current & __phase_bit) != __phase); - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const - { - return __try_wait_phase(__old & __phase_bit); - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const - { - return __try_wait_phase(__parity ? 
__phase_bit : 0); - } - -public: - _CCCL_HIDE_FROM_ABI __barrier_base() = default; - - _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 - __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) - : __phase_arrived_expected(__init(__count)) - { - _CCCL_ASSERT(__count >= 0, ""); - } - - _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; - - __barrier_base(__barrier_base const&) = delete; - __barrier_base& operator=(__barrier_base const&) = delete; - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) - { - auto const __inc = __arrived_unit * __update; - auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); - if ((__old ^ (__old + __inc)) & __phase_bit) - { - __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); - __phase_arrived_expected.notify_all(); - } - return __old & __phase_bit; - } - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const - { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester_phase<__barrier_base>(this, _CUDA_VSTD::move(__phase))); - } - _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __parity) const - { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); - (void) arrive(); - } - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept - { - return numeric_limits::max(); - } -}; - -#endif //_LIBCUDACXX_HAS_NO_TREE_BARRIER - -template -class barrier : public __barrier_base<_CompletionF> -{ -public: - _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) - : __barrier_base<_CompletionF>(__count, __completion) - {} -}; - -_LIBCUDACXX_END_NAMESPACE_STD - -#include - -_CCCL_POP_MACROS - -#endif //_LIBCUDACXX_BARRIER From 8aaeb2959b77fd8e27172737b21dd9247904b511 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Mon, 7 Oct 2024 18:37:22 -0400 Subject: [PATCH 4/9] Consolidate header testing infra. 
(#2460) --- CMakeLists.txt | 1 + cmake/CCCLGenerateHeaderTests.cmake | 111 ++++++++++++++++++ .../header_test.in => cmake/header_test.cu.in | 59 +++++----- cub/cmake/CubHeaderTesting.cmake | 24 ++-- cub/cmake/header_test.in | 72 ------------ cudax/cmake/cudaxHeaderTesting.cmake | 38 +++--- cudax/cmake/header_test.in.cu | 20 +++- thrust/cmake/ThrustHeaderTesting.cmake | 94 +++++++-------- 8 files changed, 226 insertions(+), 193 deletions(-) create mode 100644 cmake/CCCLGenerateHeaderTests.cmake rename thrust/cmake/header_test.in => cmake/header_test.cu.in (52%) delete mode 100644 cub/cmake/header_test.in diff --git a/CMakeLists.txt b/CMakeLists.txt index f87ad146174..5e9f68c14eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,7 @@ if (CCCL_TOPLEVEL_PROJECT) include(cmake/CCCLBuildCompilerTargets.cmake) include(cmake/CCCLClangdCompileInfo.cmake) include(cmake/CCCLConfigureTarget.cmake) + include(cmake/CCCLGenerateHeaderTests.cmake) include(cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() diff --git a/cmake/CCCLGenerateHeaderTests.cmake b/cmake/CCCLGenerateHeaderTests.cmake new file mode 100644 index 00000000000..e483b194513 --- /dev/null +++ b/cmake/CCCLGenerateHeaderTests.cmake @@ -0,0 +1,111 @@ +# Usage: +# cccl_generate_header_tests( +# [cccl_configure_target options] +# [LANGUAGE ] +# [HEADER_TEMPLATE