From 502daa656b1426e4377ccc9dcd80526fae0bd67d Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 06:49:49 -0500
Subject: [PATCH 01/27] Make thread_store.cuh NVRTC compilable

---
 cub/cub/thread/thread_store.cuh | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh
index 0fb29526cad..d4859c2c174 100644
--- a/cub/cub/thread/thread_store.cuh
+++ b/cub/cub/thread/thread_store.cuh
@@ -46,6 +46,8 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 //-----------------------------------------------------------------------------

From 3022bd9bd310c492fa548570db05a2ca38d02674 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 06:51:20 -0500
Subject: [PATCH 02/27] Get TileState from KernelSource

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 31 ++++++++++++-----------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 18b4c4f61c7..93f36dc3b7f 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -83,7 +83,7 @@ struct DeviceScanKernelSource
 {
   using ScanTileStateT = typename cub::ScanTileState<AccumT>;
 
-  CUB_DEFINE_KERNEL_GETTER(ScanInitKernel, DeviceScanInitKernel<ScanTileStateT>)
+  CUB_DEFINE_KERNEL_GETTER(InitKernel, DeviceScanInitKernel<ScanTileStateT>)
 
   CUB_DEFINE_KERNEL_GETTER(
     ScanKernel,
@@ -101,6 +101,11 @@ struct DeviceScanKernelSource
   {
     return sizeof(AccumT);
   }
+
+  CUB_RUNTIME_FUNCTION ScanTileStateT TileState()
+  {
+    return ScanTileStateT();
+  }
 };
 
 } // namespace detail::scan
@@ -165,9 +170,6 @@ struct DispatchScan
 
   static constexpr int INIT_KERNEL_THREADS = 128;
 
-  // The input value type
-  using InputT = cub::detail::value_t<InputIteratorT>;
-
   /// Device-accessible allocation of temporary storage.  When nullptr, the
   /// required allocation size is written to \p temp_storage_bytes and no work
   /// is done.
@@ -258,13 +260,12 @@ struct DispatchScan
   CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t
   Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel, ActivePolicyT policy = {})
   {
-    using ScanTileStateT = typename KernelSource::ScanTileStateT;
-
+    // TODO(ashwin): Does this now need to be a runtime check?
     // `LOAD_LDG` makes in-place execution UB and doesn't lead to better
     // performance.
-    static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG,
-                  "The memory consistency model does not apply to texture "
-                  "accesses");
+    // static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG,
+    //               "The memory consistency model does not apply to texture "
+    //               "accesses");
 
     cudaError error = cudaSuccess;
     do
@@ -281,9 +282,12 @@ struct DispatchScan
       int tile_size = policy.Scan().BlockThreads() * policy.Scan().ItemsPerThread();
       int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
 
+      auto tile_state = kernel_source.TileState();
+
       // Specify temporary storage allocation requirements
       size_t allocation_sizes[1];
-      error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]));
+
+      error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0]));
       if (cudaSuccess != error)
       {
         break; // bytes needed for tile status descriptors
@@ -313,7 +317,6 @@ struct DispatchScan
       }
 
       // Construct the tile status interface
-      ScanTileStateT tile_state;
       error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]));
       if (cudaSuccess != error)
       {
@@ -397,17 +400,15 @@ struct DispatchScan
         }
       }
     } while (0);
-
     return error;
   }
 
   template <typename ActivePolicyT>
   CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {})
   {
-    using ScanTileStateT = typename KernelSource::ScanTileStateT;
-    auto wrapped_policy  = detail::scan::MakeScanPolicyWrapper(active_policy);
+    auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy);
     // Ensure kernels are instantiated.
-    return Invoke(kernel_source.ScanInitKernel(), kernel_source.ScanKernel(), wrapped_policy);
+    return Invoke(kernel_source.InitKernel(), kernel_source.ScanKernel(), wrapped_policy);
   }
 
   /**

From 418a76340d35c1cb5219a44ff0742a52a5154e3d Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 06:51:52 -0500
Subject: [PATCH 03/27] Use launcher_factory to get the SM occupancy, PTX
 version. Implement MaxgridDimX

---
 cub/cub/detail/launcher/cuda_driver.cuh   |  5 +++++
 cub/cub/device/dispatch/dispatch_scan.cuh | 18 ++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/cub/cub/detail/launcher/cuda_driver.cuh b/cub/cub/detail/launcher/cuda_driver.cuh
index 52c643f9707..66e2e8a87a9 100644
--- a/cub/cub/detail/launcher/cuda_driver.cuh
+++ b/cub/cub/detail/launcher/cuda_driver.cuh
@@ -103,6 +103,11 @@ struct CudaDriverLauncherFactory
       cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, kernel_fn, block_size, dynamic_smem_bytes));
   }
 
+  _CCCL_HIDE_FROM_ABI cudaError_t MaxGridDimX(int& max_grid_dim_x) const
+  {
+    return static_cast<cudaError_t>(cuDeviceGetAttribute(&max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device));
+  }
+
   CUdevice device;
   int cc;
 };
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 93f36dc3b7f..e56f3f74dcd 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -349,9 +349,10 @@ struct DispatchScan
 
       // Get SM occupancy for scan_kernel
       int scan_sm_occupancy;
-      error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out
-                                      scan_kernel,
-                                      policy.Scan().BlockThreads()));
+      error = CubDebug(launcher_factory.MaxSmOccupancy(
+        scan_sm_occupancy, // out
+        scan_kernel,
+        policy.Scan().BlockThreads()));
       if (cudaSuccess != error)
       {
         break;
@@ -452,20 +453,20 @@ struct DispatchScan
     InitValueT init_value,
     OffsetT num_items,
     cudaStream_t stream,
-    KernelSource kernel_source = {},
-    MaxPolicyT max_policy      = {})
+    KernelSource kernel_source             = {},
+    KernelLauncherFactory launcher_factory = {},
+    MaxPolicyT max_policy                  = {})
   {
     cudaError_t error;
     do
     {
       // Get PTX version
       int ptx_version = 0;
-      error           = CubDebug(PtxVersion(ptx_version));
+      error           = CubDebug(launcher_factory.PtxVersion(ptx_version));
       if (cudaSuccess != error)
       {
         break;
       }
-
       // Create dispatch functor
       DispatchScan dispatch(
         d_temp_storage,
@@ -477,7 +478,8 @@ struct DispatchScan
         init_value,
         stream,
         ptx_version,
-        kernel_source);
+        kernel_source,
+        launcher_factory);
 
       // Dispatch to chained policy
       error = CubDebug(max_policy.Invoke(ptx_version, dispatch));

From 6b386058a6c7a53d23ca541fc62e8e8b05935a5f Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 06:54:02 -0500
Subject: [PATCH 04/27] Put reduce stuff inside `reduce` namespace

---
 c/parallel/src/reduce.cu | 110 +++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 52 deletions(-)

diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu
index da866bd6a76..f39c8a9f944 100644
--- a/c/parallel/src/reduce.cu
+++ b/c/parallel/src/reduce.cu
@@ -40,6 +40,12 @@ static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "O
 struct nothing_t
 {};
 
+struct input_iterator_state_t;
+struct output_iterator_t;
+
+namespace reduce
+{
+
 struct reduce_runtime_tuning_policy
 {
   int block_size;
@@ -107,9 +113,6 @@ static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*i
   return init.type;
 }
 
-struct input_iterator_state_t;
-struct output_iterator_t;
-
 std::string get_input_iterator_name()
 {
   std::string iterator_t;
@@ -201,6 +204,44 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it
     transform_op_t);
 }
 
+template <auto* GetPolicy>
+struct dynamic_reduce_policy_t
+{
+  using MaxPolicy = dynamic_reduce_policy_t;
+
+  template <typename F>
+  cudaError_t Invoke(int device_ptx_version, F& op)
+  {
+    return op.template Invoke<reduce_runtime_tuning_policy>(GetPolicy(device_ptx_version, accumulator_type));
+  }
+
+  cccl_type_info accumulator_type;
+};
+
+struct reduce_kernel_source
+{
+  cccl_device_reduce_build_result_t& build;
+
+  std::size_t AccumSize() const
+  {
+    return build.accumulator_size;
+  }
+  CUkernel SingleTileKernel() const
+  {
+    return build.single_tile_kernel;
+  }
+  CUkernel SingleTileSecondKernel() const
+  {
+    return build.single_tile_second_kernel;
+  }
+  CUkernel ReductionKernel() const
+  {
+    return build.reduction_kernel;
+  }
+};
+
+} // namespace reduce
+
 extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
   cccl_device_reduce_build_result_t* build,
   cccl_iterator_t input_it,
@@ -220,12 +261,12 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
   {
     const char* name = "test";
 
-    const int cc                              = cc_major * 10 + cc_minor;
-    const cccl_type_info accum_t              = get_accumulator_type(op, input_it, init);
-    const reduce_runtime_tuning_policy policy = get_policy(cc, accum_t);
-    const auto accum_cpp                      = cccl_type_enum_to_string(accum_t.type);
-    const auto input_it_value_t               = cccl_type_enum_to_string(input_it.value_type.type);
-    const auto offset_t                       = cccl_type_enum_to_string(cccl_type_enum::UINT64);
+    const int cc                 = cc_major * 10 + cc_minor;
+    const cccl_type_info accum_t = reduce::get_accumulator_type(op, input_it, init);
+    const auto policy            = reduce::get_policy(cc, accum_t);
+    const auto accum_cpp         = cccl_type_enum_to_string(accum_t.type);
+    const auto input_it_value_t  = cccl_type_enum_to_string(input_it.value_type.type);
+    const auto offset_t          = cccl_type_enum_to_string(cccl_type_enum::UINT64);
 
     const std::string input_iterator_src =
       make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it);
@@ -271,9 +312,10 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
     fflush(stdout);
 #endif
 
-    std::string single_tile_kernel_name        = get_single_tile_kernel_name(input_it, output_it, op, init, false);
-    std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true);
-    std::string reduction_kernel_name          = get_device_reduce_kernel_name(op, input_it, init);
+    std::string single_tile_kernel_name = reduce::get_single_tile_kernel_name(input_it, output_it, op, init, false);
+    std::string single_tile_second_kernel_name =
+      reduce::get_single_tile_kernel_name(input_it, output_it, op, init, true);
+    std::string reduction_kernel_name = reduce::get_device_reduce_kernel_name(op, input_it, init);
     std::string single_tile_kernel_lowered_name;
     std::string single_tile_second_kernel_lowered_name;
     std::string reduction_kernel_lowered_name;
@@ -342,42 +384,6 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
   return error;
 }
 
-template <auto* GetPolicy>
-struct dynamic_reduce_policy_t
-{
-  using MaxPolicy = dynamic_reduce_policy_t;
-
-  template <typename F>
-  cudaError_t Invoke(int device_ptx_version, F& op)
-  {
-    return op.template Invoke<reduce_runtime_tuning_policy>(GetPolicy(device_ptx_version, accumulator_type));
-  }
-
-  cccl_type_info accumulator_type;
-};
-
-struct reduce_kernel_source
-{
-  cccl_device_reduce_build_result_t& build;
-
-  std::size_t AccumSize() const
-  {
-    return build.accumulator_size;
-  }
-  CUkernel SingleTileKernel() const
-  {
-    return build.single_tile_kernel;
-  }
-  CUkernel SingleTileSecondKernel() const
-  {
-    return build.single_tile_second_kernel;
-  }
-  CUkernel ReductionKernel() const
-  {
-    return build.reduction_kernel;
-  }
-};
-
 extern "C" CCCL_C_API CUresult cccl_device_reduce(
   cccl_device_reduce_build_result_t build,
   void* d_temp_storage,
@@ -404,9 +410,9 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
                         indirect_arg_t,
                         indirect_arg_t,
                         void,
-                        dynamic_reduce_policy_t<&get_policy>,
+                        reduce::dynamic_reduce_policy_t<&reduce::get_policy>,
                         ::cuda::std::__identity,
-                        reduce_kernel_source,
+                        reduce::reduce_kernel_source,
                         cub::detail::CudaDriverLauncherFactory>::
       Dispatch(
         d_temp_storage,
@@ -420,7 +426,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
         {},
         {build},
         cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
-        {get_accumulator_type(op, d_in, init)});
+        {reduce::get_accumulator_type(op, d_in, init)});
   }
   catch (const std::exception& exc)
   {
@@ -439,7 +445,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
   return error;
 }
 
-extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr)
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept
 {
   try
   {

From 6365d5ca5a53ff11136553ea0cbf925f9b95e5aa Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 07:25:09 -0500
Subject: [PATCH 05/27] Handle PTX compilation in command list

---
 c/parallel/src/nvrtc/command_list.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h
index 77d83e86d47..719d9c5bf88 100644
--- a/c/parallel/src/nvrtc/command_list.h
+++ b/c/parallel/src/nvrtc/command_list.h
@@ -156,9 +156,23 @@ struct nvrtc_command_list_visitor
 
     check(jitlink_error);
 
-    check(nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size));
+    bool ptx    = false;
+    auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size);
+    if (result != NVJITLINK_SUCCESS)
+    {
+      ptx    = true;
+      result = nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size);
+    }
     cleanup.cubin_ref.cubin = std::unique_ptr<char[]>(new char[cleanup.cubin_ref.size]);
-    check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get()));
+
+    if (ptx)
+    {
+      check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.cubin_ref.cubin.get()));
+    }
+    else
+    {
+      check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get()));
+    }
   }
 };
 

From f3454c58a50f7eb7857ad53455bd5bbf3e06fb57 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 07:25:36 -0500
Subject: [PATCH 06/27] Missing noexcept

---
 c/parallel/include/cccl/c/reduce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c/parallel/include/cccl/c/reduce.h b/c/parallel/include/cccl/c/reduce.h
index f1e875087a7..2366024a532 100644
--- a/c/parallel/include/cccl/c/reduce.h
+++ b/c/parallel/include/cccl/c/reduce.h
@@ -55,4 +55,4 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
   cccl_value_t init,
   CUstream stream) noexcept;
 
-extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept;

From 3fe0dedeecd2bc13fce0664ed6ee5646ed105907 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 11:21:00 -0500
Subject: [PATCH 07/27] Allow passing InitValueT without wrapping in InputValue

---
 cub/cub/device/dispatch/kernels/scan.cuh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh
index cc3034638bc..7cb71ca49e7 100644
--- a/cub/cub/device/dispatch/kernels/scan.cuh
+++ b/cub/cub/device/dispatch/kernels/scan.cuh
@@ -40,6 +40,8 @@
 #include <cub/agent/agent_scan.cuh>
 #include <cub/util_macro.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail
@@ -159,7 +161,9 @@ template <typename ChainedPolicyT,
           typename InitValueT,
           typename OffsetT,
           typename AccumT,
-          bool ForceInclusive>
+          bool ForceInclusive,
+          typename RealInitValueT = ::cuda::std::
+            _If<::cuda::std::is_void_v<typename InitValueT::value_type>, InitValueT, typename InitValueT::value_type>>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
     InputIteratorT d_in,
@@ -170,8 +174,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
     InitValueT init_value,
     OffsetT num_items)
 {
-  using RealInitValueT = typename InitValueT::value_type;
-  using ScanPolicyT    = typename ChainedPolicyT::ActivePolicy::ScanPolicyT;
+  using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT;
 
   // Thread block type for scanning input tiles
   using AgentScanT = detail::scan::

From e4a1aeaf3bb975a5168de95fa09bf05e1f13ec09 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 07:26:17 -0500
Subject: [PATCH 08/27] Add scan c.parallel API

---
 c/parallel/include/cccl/c/scan.h |  57 +++
 c/parallel/src/scan.cu           | 585 +++++++++++++++++++++++++++++++
 2 files changed, 642 insertions(+)
 create mode 100644 c/parallel/include/cccl/c/scan.h
 create mode 100644 c/parallel/src/scan.cu

diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h
new file mode 100644
index 00000000000..df9f09588a5
--- /dev/null
+++ b/c/parallel/include/cccl/c/scan.h
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA Core Compute Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+#  error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
+#endif // !CCCL_C_EXPERIMENTAL
+
+#include <cuda.h>
+
+#include <cccl/c/types.h>
+
+struct cccl_device_scan_build_result_t
+{
+  int cc;
+  void* cubin;
+  size_t cubin_size;
+  CUlibrary library;
+  cccl_type_info accumulator_type;
+  CUkernel init_kernel;
+  CUkernel scan_kernel;
+  void* tile_state;
+};
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_build(
+  cccl_device_scan_build_result_t* build,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_scan(
+  cccl_device_scan_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept;
diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
new file mode 100644
index 00000000000..2c259cbcd27
--- /dev/null
+++ b/c/parallel/src/scan.cu
@@ -0,0 +1,585 @@
+#include <cub/agent/single_pass_scan_operators.cuh>
+#include <cub/detail/choose_offset.cuh>
+#include <cub/detail/launcher/cuda_driver.cuh>
+#include <cub/device/dispatch/dispatch_scan.cuh>
+#include <cub/thread/thread_load.cuh>
+#include <cub/util_arch.cuh>
+#include <cub/util_temporary_storage.cuh>
+#include <cub/util_type.cuh>
+
+#include <format>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <type_traits>
+
+#include "cub/util_device.cuh"
+#include "kernels/iterators.h"
+#include "kernels/operators.h"
+#include "util/context.h"
+#include "util/errors.h"
+#include "util/indirect_arg.h"
+#include "util/types.h"
+#include <cccl/c/scan.h>
+#include <nvrtc.h>
+#include <nvrtc/command_list.h>
+
+struct op_wrapper;
+struct device_scan_policy;
+using OffsetT = unsigned long long;
+static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "OffsetT must be size_t");
+
+struct input_iterator_state_t;
+struct output_iterator_t;
+
+namespace scan
+{
+
+struct scan_runtime_tuning_policy
+{
+  int block_size;
+  int items_per_thread;
+  cub::CacheLoadModifier load_modifier;
+
+  scan_runtime_tuning_policy Scan() const
+  {
+    return *this;
+  }
+
+  int ItemsPerThread() const
+  {
+    return items_per_thread;
+  }
+
+  int BlockThreads() const
+  {
+    return block_size;
+  }
+
+  cub::CacheLoadModifier LoadModifier() const
+  {
+    return load_modifier;
+  }
+};
+
+template <typename Tuning, int N>
+Tuning find_tuning(int cc, const Tuning (&tunings)[N])
+{
+  for (const Tuning& tuning : tunings)
+  {
+    if (cc >= tuning.cc)
+    {
+      return tuning;
+    }
+  }
+
+  return tunings[N - 1];
+}
+
+scan_runtime_tuning_policy get_policy(int /*cc*/, cccl_type_info /*accumulator_type*/)
+{
+  // TODO(ashwin): returning some default for now:
+  return {128, 15, cub::LOAD_CA};
+}
+
+static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init)
+{
+  // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
+  //      so switching back to the old accumulator type logic for now
+  return init.type;
+}
+
+std::string get_input_iterator_name()
+{
+  std::string iterator_t;
+  check(nvrtcGetTypeName<input_iterator_state_t>(&iterator_t));
+  return iterator_t;
+}
+
+std::string get_output_iterator_name()
+{
+  std::string iterator_t;
+  check(nvrtcGetTypeName<output_iterator_t>(&iterator_t));
+  return iterator_t;
+}
+
+std::string
+get_init_kernel_name(cccl_iterator_t input_it, cccl_iterator_t /*output_it*/, cccl_op_t op, cccl_value_t init)
+{
+  const cccl_type_info accum_t  = scan::get_accumulator_type(op, input_it, init);
+  const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type);
+  return std::format("cub::detail::scan::DeviceScanInitKernel<cub::ScanTileState<{0}>>", accum_cpp_t);
+}
+
+std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init)
+{
+  std::string chained_policy_t;
+  check(nvrtcGetTypeName<device_scan_policy>(&chained_policy_t));
+
+  const cccl_type_info accum_t  = scan::get_accumulator_type(op, input_it, init);
+  const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type);
+  const std::string input_iterator_t =
+    (input_it.type == cccl_iterator_kind_t::pointer //
+       ? cccl_type_enum_to_name(input_it.value_type.type, true) //
+       : scan::get_input_iterator_name());
+  const std::string output_iterator_t =
+    output_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(output_it.value_type.type, true) //
+      : scan::get_output_iterator_name();
+  const std::string init_t = cccl_type_enum_to_name(init.type.type);
+
+  std::string offset_t;
+  check(nvrtcGetTypeName<OffsetT>(&offset_t));
+
+  std::string scan_op_t;
+  check(nvrtcGetTypeName<op_wrapper>(&scan_op_t));
+
+  auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t);
+  return std::format(
+    "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>",
+    chained_policy_t,
+    input_iterator_t,
+    output_iterator_t,
+    tile_state_t,
+    scan_op_t,
+    init_t,
+    offset_t,
+    accum_cpp_t,
+    "false", // for now, always exclusive
+    init_t);
+}
+
+size_t find_size_t(char* ptx, std::string_view name)
+{
+  std::regex regex(std::format(R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)", name));
+  std::cmatch match;
+  if (std::regex_search(ptx, match, regex))
+  {
+    auto result = std::stoi(match[1].str());
+    return result;
+  }
+  else
+  {
+    throw std::runtime_error(std::format("Could not find {} in PTX code", name));
+  }
+}
+
+struct scan_tile_state
+{
+  // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except
+  // that it accepts the acummulator type as a runtime parameter rather than being
+  // templated on it.
+  //
+  // Both specializations ScanTileStateT<T, true> and ScanTileStateT<T, false> - where the
+  // bool parameter indicates whether `T` is primitive - are combined into a single type.
+
+  void* d_tile_status; // d_tile_descriptors
+  void* d_tile_partial;
+  void* d_tile_inclusive;
+
+  bool is_primitive;
+  size_t status_size;
+  size_t uninitialized_size;
+
+  scan_tile_state(bool is_primitive, size_t status_size, size_t uninitialized_size)
+      : d_tile_status(nullptr)
+      , d_tile_partial(nullptr)
+      , d_tile_inclusive(nullptr)
+      , is_primitive(is_primitive)
+      , status_size(status_size)
+      , uninitialized_size(uninitialized_size)
+  {}
+
+  cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
+  {
+    return is_primitive ? InitPrimitive(num_tiles, d_temp_storage, temp_storage_bytes)
+                        : InitStorage(num_tiles, d_temp_storage, temp_storage_bytes);
+  }
+
+  cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const
+  {
+    return is_primitive ? AllocationSizePrimitive(num_tiles, temp_storage_bytes)
+                        : AllocationSizeStorage(num_tiles, temp_storage_bytes);
+  }
+
+  cudaError_t InitPrimitive(int, void* d_temp_storage, size_t)
+  {
+    d_tile_status = d_temp_storage;
+    return cudaSuccess;
+  }
+
+  cudaError_t AllocationSizePrimitive(int num_tiles, size_t& d_temp_storage_bytes) const
+  {
+    d_temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * status_size;
+    return cudaSuccess;
+  }
+
+  cudaError_t InitStorage(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
+  {
+    cudaError_t error = cudaSuccess;
+    do
+    {
+      void* allocations[3] = {};
+      size_t allocation_sizes[3];
+
+      // bytes needed for tile status descriptors
+      allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size;
+
+      // bytes needed for partials
+      allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
+
+      // bytes needed for inclusives
+      allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
+
+      // Compute allocation pointers into the single storage blob
+      error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      // Alias the offsets
+      d_tile_status    = allocations[0];
+      d_tile_partial   = allocations[1];
+      d_tile_inclusive = allocations[2];
+    } while (0);
+
+    return error;
+  }
+
+  cudaError_t AllocationSizeStorage(int num_tiles, size_t& d_temp_storage_bytes) const
+  {
+    // Specify storage allocation requirements
+    size_t allocation_sizes[3];
+
+    // bytes needed for tile status descriptors
+    allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size;
+
+    // bytes needed for partials
+    allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
+
+    // bytes needed for inclusives
+    allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
+
+    // Set the necessary size of the blob
+    void* allocations[3] = {};
+    return CubDebug(cub::AliasTemporaries(nullptr, d_temp_storage_bytes, allocations, allocation_sizes));
+  }
+};
+
+template <auto* GetPolicy>
+struct dynamic_scan_policy_t
+{
+  using MaxPolicy = dynamic_scan_policy_t;
+
+  template <typename F>
+  cudaError_t Invoke(int device_ptx_version, F& op)
+  {
+    return op.template Invoke<scan_runtime_tuning_policy>(GetPolicy(device_ptx_version, accumulator_type));
+  }
+
+  cccl_type_info accumulator_type;
+};
+
+struct scan_kernel_source
+{
+  cccl_device_scan_build_result_t& build;
+
+  std::size_t AccumSize() const
+  {
+    return build.accumulator_type.size;
+  }
+  CUkernel InitKernel() const
+  {
+    return build.init_kernel;
+  }
+  CUkernel ScanKernel() const
+  {
+    return build.scan_kernel;
+  }
+  scan_tile_state TileState()
+  {
+    auto result = (reinterpret_cast<scan_tile_state*>(build.tile_state));
+    return *result;
+  }
+};
+
+} // namespace scan
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_build(
+  cccl_device_scan_build_result_t* build,
+  cccl_iterator_t input_it,
+  cccl_iterator_t output_it,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept
+{
+  CUresult error = CUDA_SUCCESS;
+
+  try
+  {
+    const char* name = "test";
+
+    const int cc                 = cc_major * 10 + cc_minor;
+    const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init);
+    const auto policy            = scan::get_policy(cc, accum_t);
+    const auto accum_cpp         = cccl_type_enum_to_string(accum_t.type);
+    const auto input_it_value_t  = cccl_type_enum_to_string(input_it.value_type.type);
+    const auto offset_t          = cccl_type_enum_to_string(cccl_type_enum::UINT64);
+
+    const std::string input_iterator_src  = make_kernel_input_iterator(offset_t, input_it_value_t, input_it);
+    const std::string output_iterator_src = make_kernel_output_iterator(offset_t, accum_cpp, output_it);
+
+    const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op);
+
+    const std::string src = std::format(
+      "#include <cub/block/block_scan.cuh>\n"
+      "#include <cub/device/dispatch/kernels/scan.cuh>\n"
+      "#include <cub/agent/single_pass_scan_operators.cuh>\n"
+      "struct __align__({1}) storage_t {{\n"
+      "  char data[{0}];\n"
+      "}};\n"
+      "{4}\n"
+      "{5}\n"
+      "struct agent_policy_t {{\n"
+      "  static constexpr int ITEMS_PER_THREAD = {2};\n"
+      "  static constexpr int BLOCK_THREADS = {3};\n"
+      "  static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_WARP_TRANSPOSE;\n"
+      "  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;\n"
+      "  static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = cub::BLOCK_STORE_WARP_TRANSPOSE;\n"
+      "  static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS;\n"
+      "  struct detail {{\n"
+      "    using delay_constructor_t = cub::detail::default_delay_constructor_t<{7}>;\n"
+      "  }};\n"
+      "}};\n"
+      "struct device_scan_policy {{\n"
+      "  struct ActivePolicy {{\n"
+      "    using ScanPolicyT = agent_policy_t;\n"
+      "  }};\n"
+      "}};\n"
+      "{6};\n",
+      input_it.value_type.size, // 0
+      input_it.value_type.alignment, // 1
+      policy.items_per_thread, // 2
+      policy.block_size, // 3
+      input_iterator_src, // 4
+      output_iterator_src, // 5
+      op_src, // 6
+      accum_cpp); // 7
+
+#if false // CCCL_DEBUGGING_SWITCH
+    fflush(stderr);
+    printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", src.c_str());
+    fflush(stdout);
+#endif
+
+    std::string init_kernel_name = scan::get_init_kernel_name(input_it, output_it, op, init);
+    std::string scan_kernel_name = scan::get_scan_kernel_name(input_it, output_it, op, init);
+    std::string init_kernel_lowered_name;
+    std::string scan_kernel_lowered_name;
+
+    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
+
+    constexpr size_t num_args  = 7;
+    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
+
+    constexpr size_t num_lto_args   = 2;
+    const char* lopts[num_lto_args] = {"-lto", arch.c_str()};
+
+    // Collect all LTO-IRs to be linked.
+    nvrtc_ltoir_list ltoir_list;
+    auto ltoir_list_append = [&ltoir_list](nvrtc_ltoir lto) {
+      if (lto.ltsz)
+      {
+        ltoir_list.push_back(std::move(lto));
+      }
+    };
+    ltoir_list_append({op.ltoir, op.ltoir_size});
+    if (cccl_iterator_kind_t::iterator == input_it.type)
+    {
+      ltoir_list_append({input_it.advance.ltoir, input_it.advance.ltoir_size});
+      ltoir_list_append({input_it.dereference.ltoir, input_it.dereference.ltoir_size});
+    }
+    if (cccl_iterator_kind_t::iterator == output_it.type)
+    {
+      ltoir_list_append({output_it.advance.ltoir, output_it.advance.ltoir_size});
+      ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size});
+    }
+
+    nvrtc_cubin result =
+      make_nvrtc_command_list()
+        .add_program(nvrtc_translation_unit{src.c_str(), name})
+        .add_expression({init_kernel_name})
+        .add_expression({scan_kernel_name})
+        .compile_program({args, num_args})
+        .get_name({init_kernel_name, init_kernel_lowered_name})
+        .get_name({scan_kernel_name, scan_kernel_lowered_name})
+        .cleanup_program()
+        .add_link_list(ltoir_list)
+        .finalize_program(num_lto_args, lopts);
+
+    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str()));
+    check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str()));
+
+    constexpr size_t num_ptx_args           = 5;
+    const char* ptx_args[num_ptx_args]      = {arch.c_str(), cub_path, libcudacxx_path, "-rdc=true", "-dlto"};
+    constexpr size_t num_ptx_lto_args       = 3;
+    const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"};
+
+    size_t status_size{0};
+    size_t uninitialized_size{0};
+    if (accum_t.type == cccl_type_enum::STORAGE)
+    {
+      std::string src = std::format(
+        "#include <cub/agent/single_pass_scan_operators.cuh>\n"
+        "#include <cub/util_type.cuh>\n"
+        "struct __align__({1}) storage_t {{\n"
+        "  char data[{0}];\n"
+        "}};\n"
+        "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{2}>::StatusWord);\n"
+        "__device__ size_t uninitialized_size = sizeof(cub::Uninitialized<{2}>);\n",
+        accum_t.size,
+        accum_t.alignment,
+        accum_cpp);
+      auto compile_result =
+        make_nvrtc_command_list()
+          .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"})
+          .compile_program({ptx_args, num_ptx_args})
+          .cleanup_program()
+          .finalize_program(num_ptx_lto_args, ptx_lopts);
+      auto ptx_code      = compile_result.cubin.get();
+      status_size        = scan::find_size_t(ptx_code, "status_size");
+      uninitialized_size = scan::find_size_t(ptx_code, "uninitialized_size");
+    }
+    else
+    {
+      std::string src = std::format(
+        "#include <cub/agent/single_pass_scan_operators.cuh>\n"
+        "#include <cub/util_type.cuh>\n"
+        "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{0}>::TxnWord);\n",
+        accum_cpp);
+      auto compile_result =
+        make_nvrtc_command_list()
+          .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"})
+          .compile_program({ptx_args, num_ptx_args})
+          .cleanup_program()
+          .finalize_program(num_ptx_lto_args, ptx_lopts);
+      auto ptx_code = compile_result.cubin.get();
+      status_size   = scan::find_size_t(ptx_code, "status_size");
+    }
+
+    bool is_primitive = not(accum_t.type == cccl_type_enum::STORAGE);
+    auto tile_state   = std::make_unique<scan::scan_tile_state>(is_primitive, status_size, uninitialized_size);
+
+    build->cc               = cc;
+    build->cubin            = (void*) result.cubin.release();
+    build->cubin_size       = result.size;
+    build->accumulator_type = accum_t;
+    build->tile_state       = (void*) tile_state.release();
+  }
+  catch (const std::exception& exc)
+  {
+    fflush(stderr);
+    printf("\nEXCEPTION in cccl_device_scan_build(): %s\n", exc.what());
+    fflush(stdout);
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_scan(
+  cccl_device_scan_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept
+{
+  bool pushed    = false;
+  CUresult error = CUDA_SUCCESS;
+  try
+  {
+    pushed = try_push_context();
+
+    CUdevice cu_device;
+    check(cuCtxGetDevice(&cu_device));
+    auto cuda_error = cub::DispatchScan<
+      indirect_arg_t,
+      indirect_arg_t,
+      indirect_arg_t,
+      indirect_arg_t,
+      ::cuda::std::size_t,
+      void,
+      scan::dynamic_scan_policy_t<&scan::get_policy>,
+      false,
+      scan::scan_kernel_source,
+      cub::detail::CudaDriverLauncherFactory>::
+      Dispatch(
+        d_temp_storage,
+        *temp_storage_bytes,
+        d_in,
+        d_out,
+        op,
+        init,
+        num_items,
+        stream,
+        {build},
+        cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
+        {scan::get_accumulator_type(op, d_in, init)});
+    if (cuda_error != cudaSuccess)
+    {
+      const char* errorString = cudaGetErrorString(cuda_error); // Get the error string
+      std::cerr << "CUDA error: " << errorString << std::endl;
+    }
+  }
+  catch (const std::exception& exc)
+  {
+    fflush(stderr);
+    printf("\nEXCEPTION in cccl_device_scan(): %s\n", exc.what());
+    fflush(stdout);
+    error = CUDA_ERROR_UNKNOWN;
+  }
+  if (pushed)
+  {
+    CUcontext cu_context;
+    cuCtxPopCurrent(&cu_context);
+  }
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept
+{
+  try
+  {
+    if (bld_ptr == nullptr)
+    {
+      return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(bld_ptr->cubin));
+    check(cuLibraryUnload(bld_ptr->library));
+
+    std::unique_ptr<scan::scan_tile_state> tile_state(reinterpret_cast<scan::scan_tile_state*>(bld_ptr->tile_state));
+    tile_state.reset();
+  }
+  catch (const std::exception& exc)
+  {
+    fflush(stderr);
+    printf("\nEXCEPTION in cccl_device_scan_cleanup(): %s\n", exc.what());
+    fflush(stdout);
+    return CUDA_ERROR_UNKNOWN;
+  }
+
+  return CUDA_SUCCESS;
+}

From c953d6ed9dc4909798e7323741e8e6220fef6ba0 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 27 Jan 2025 07:26:34 -0500
Subject: [PATCH 09/27] Add tests for scan c.parallel API

---
 c/parallel/test/test_scan.cpp | 237 ++++++++++++++++++++++++++++++++++
 c/parallel/test/test_util.h   |   1 +
 2 files changed, 238 insertions(+)
 create mode 100644 c/parallel/test/test_scan.cpp

diff --git a/c/parallel/test/test_scan.cpp b/c/parallel/test/test_scan.cpp
new file mode 100644
index 00000000000..f7373416d74
--- /dev/null
+++ b/c/parallel/test/test_scan.cpp
@@ -0,0 +1,237 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+#include "test_util.h"
+
+void scan(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init)
+{
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+
+  const int cc_major = deviceProp.major;
+  const int cc_minor = deviceProp.minor;
+
+  const char* cub_path        = TEST_CUB_PATH;
+  const char* thrust_path     = TEST_THRUST_PATH;
+  const char* libcudacxx_path = TEST_LIBCUDACXX_PATH;
+  const char* ctk_path        = TEST_CTK_PATH;
+
+  cccl_device_scan_build_result_t build;
+  REQUIRE(CUDA_SUCCESS
+          == cccl_device_scan_build(
+            &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path));
+
+  const std::string sass = inspect_sass(build.cubin, build.cubin_size);
+  // TODO(ashwin): do we need the below?
+  // REQUIRE(sass.find("LDL") == std::string::npos);
+  // REQUIRE(sass.find("STL") == std::string::npos);
+
+  size_t temp_storage_bytes = 0;
+  REQUIRE(CUDA_SUCCESS == cccl_device_scan(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0));
+
+  pointer_t<uint8_t> temp_storage(temp_storage_bytes);
+
+  REQUIRE(CUDA_SUCCESS
+          == cccl_device_scan(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0));
+  REQUIRE(CUDA_SUCCESS == cccl_device_scan_cleanup(&build));
+}
+
+using integral_types = std::tuple<int32_t, uint32_t, int64_t, uint64_t>;
+TEMPLATE_LIST_TEST_CASE("Scan works with integral types", "[scan]", integral_types)
+{
+  const std::size_t num_items       = GENERATE(0, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op                    = make_operation("op", get_reduce_op(get_type_info<TestType>().type));
+  const std::vector<TestType> input = generate<TestType>(num_items);
+  const std::vector<TestType> output(num_items, 0);
+  pointer_t<TestType> input_ptr(input);
+  pointer_t<TestType> output_ptr(output);
+  value_t<TestType> init{TestType{42}};
+
+  scan(input_ptr, output_ptr, num_items, op, init);
+
+  std::vector<TestType> expected(num_items, 0);
+  std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value);
+  if (num_items > 0)
+  {
+    REQUIRE(expected == std::vector<TestType>(output_ptr));
+  }
+}
+
+struct pair
+{
+  short a;
+  size_t b;
+
+  bool operator==(const pair& other) const
+  {
+    return a == other.a && b == other.b;
+  }
+};
+
+TEST_CASE("Scan works with custom types", "[scan]")
+{
+  const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
+
+  operation_t op = make_operation(
+    "op",
+    "struct pair { short a; size_t b; };\n"
+    "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n"
+    "  return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n"
+    "}");
+  const std::vector<short> a  = generate<short>(num_items);
+  const std::vector<size_t> b = generate<size_t>(num_items);
+  std::vector<pair> input(num_items);
+  std::vector<pair> output(num_items);
+  for (std::size_t i = 0; i < num_items; ++i)
+  {
+    input[i] = pair{a[i], b[i]};
+  }
+  pointer_t<pair> input_ptr(input);
+  pointer_t<pair> output_ptr(output);
+  value_t<pair> init{pair{4, 2}};
+
+  scan(input_ptr, output_ptr, num_items, op, init);
+
+  std::vector<pair> expected(num_items, {0, 0});
+  std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value, [](const pair& lhs, const pair& rhs) {
+    return pair{short(lhs.a + rhs.a), lhs.b + rhs.b};
+  });
+  if (num_items > 0)
+  {
+    REQUIRE(expected == std::vector<pair>(output_ptr));
+  }
+}
+
+struct counting_iterator_state_t
+{
+  int value;
+};
+
+TEST_CASE("Scan works with input iterators", "[scan]")
+{
+  const std::size_t num_items                         = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op                                      = make_operation("op", get_reduce_op(get_type_info<int>().type));
+  iterator_t<int, counting_iterator_state_t> input_it = make_iterator<int, counting_iterator_state_t>(
+    "struct counting_iterator_state_t { int value; };\n",
+    {"advance",
+     "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->value += offset;\n"
+     "}"},
+    {"dereference",
+     "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 0;
+  pointer_t<int> output_it(num_items);
+  value_t<int> init{42};
+
+  scan(input_it, output_it, num_items, op, init);
+
+  // vector storing a sequence of values 0, 1, 2, ..., num_items - 1
+  std::vector<int> input(num_items);
+  std::iota(input.begin(), input.end(), 0);
+
+  std::vector<int> expected(num_items);
+  std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value);
+  if (num_items > 0)
+  {
+    REQUIRE(expected == std::vector<int>(output_it));
+  }
+}
+
+struct transform_output_iterator_state_t
+{
+  int* d_output;
+};
+
+TEST_CASE("Scan works with output iterators", "[scan]")
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op      = make_operation("op", get_reduce_op(get_type_info<int>().type));
+  iterator_t<int, transform_output_iterator_state_t> output_it = make_iterator<int, transform_output_iterator_state_t>(
+    "struct transform_output_iterator_state_t { int* d_output; };\n",
+    {"advance",
+     "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->d_output += offset;\n"
+     "}"},
+    {"dereference",
+     "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n"
+     "  *state->d_output = 2 * x;\n"
+     "}"});
+  const std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_it(input);
+  pointer_t<int> inner_output_it(num_items);
+  output_it.state.d_output = inner_output_it.ptr;
+  value_t<int> init{42};
+
+  scan(input_it, output_it, num_items, op, init);
+
+  std::vector<int> expected(num_items);
+  std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value);
+
+  std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) {
+    return x * 2;
+  });
+  if (num_items > 0)
+  {
+    REQUIRE(expected == std::vector<int>(inner_output_it));
+  }
+}
+
+template <class T>
+struct constant_iterator_state_t
+{
+  T value;
+};
+
+TEST_CASE("Scan works with input and output iterators", "[scan]")
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op      = make_operation("op", get_reduce_op(get_type_info<int>().type));
+  iterator_t<int, constant_iterator_state_t<int>> input_it = make_iterator<int, constant_iterator_state_t<int>>(
+    "struct constant_iterator_state_t { int value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value                                         = 1;
+  iterator_t<int, transform_output_iterator_state_t> output_it = make_iterator<int, transform_output_iterator_state_t>(
+    "struct transform_output_iterator_state_t { int* d_output; };\n",
+    {"out_advance",
+     "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->d_output += offset;\n"
+     "}"},
+    {"out_dereference",
+     "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n"
+     "  *state->d_output = 2 * x;\n"
+     "}"});
+  pointer_t<int> inner_output_it(num_items);
+  output_it.state.d_output = inner_output_it.ptr;
+  value_t<int> init{42};
+
+  scan(input_it, output_it, num_items, op, init);
+
+  std::vector<int> expected(num_items, 1);
+  std::exclusive_scan(expected.begin(), expected.end(), expected.begin(), init.value);
+  std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) {
+    return x * 2;
+  });
+  if (num_items > 0)
+  {
+    REQUIRE(expected == std::vector<int>(inner_output_it));
+  }
+}
diff --git a/c/parallel/test/test_util.h b/c/parallel/test/test_util.h
index 3f9a25dc43e..cf7fb2851a3 100644
--- a/c/parallel/test/test_util.h
+++ b/c/parallel/test/test_util.h
@@ -27,6 +27,7 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators_all.hpp>
 #include <cccl/c/reduce.h>
+#include <cccl/c/scan.h>
 #include <nvrtc.h>
 
 static std::string inspect_sass(const void* cubin, size_t cubin_size)

From 59a125abc775a269a9f86a8566a101b21040e468 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 30 Jan 2025 10:41:25 -0800
Subject: [PATCH 10/27] Use fewer items per thread and reinstate LDL/STL check

---
 c/parallel/src/scan.cu        | 6 ++++--
 c/parallel/test/test_scan.cpp | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 2c259cbcd27..7317dcd377d 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -78,8 +78,10 @@ Tuning find_tuning(int cc, const Tuning (&tunings)[N])
 
 scan_runtime_tuning_policy get_policy(int /*cc*/, cccl_type_info /*accumulator_type*/)
 {
-  // TODO(ashwin): returning some default for now:
-  return {128, 15, cub::LOAD_CA};
+  // TODO: we should update this once we figure out a way to reuse
+  // tuning logic from C++. Alternately, we should implement
+  // something better than a hardcoded default:
+  return {128, 4, cub::LOAD_DEFAULT};
 }
 
 static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init)
diff --git a/c/parallel/test/test_scan.cpp b/c/parallel/test/test_scan.cpp
index f7373416d74..9ce88835caf 100644
--- a/c/parallel/test/test_scan.cpp
+++ b/c/parallel/test/test_scan.cpp
@@ -33,9 +33,9 @@ void scan(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_
             &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path));
 
   const std::string sass = inspect_sass(build.cubin, build.cubin_size);
-  // TODO(ashwin): do we need the below?
-  // REQUIRE(sass.find("LDL") == std::string::npos);
-  // REQUIRE(sass.find("STL") == std::string::npos);
+
+  REQUIRE(sass.find("LDL") == std::string::npos);
+  REQUIRE(sass.find("STL") == std::string::npos);
 
   size_t temp_storage_bytes = 0;
   REQUIRE(CUDA_SUCCESS == cccl_device_scan(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0));

From 9925e113e6e71e253c5f6dae125edcd71c8e709b Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 30 Jan 2025 10:58:04 -0800
Subject: [PATCH 11/27] Move load modifier check to policy

---
 c/parallel/src/scan.cu                         | 9 +++++++++
 cub/cub/device/dispatch/dispatch_scan.cuh      | 5 +----
 cub/cub/device/dispatch/tuning/tuning_scan.cuh | 9 ++++++++-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 7317dcd377d..33e5ea9cbb9 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -60,6 +60,15 @@ struct scan_runtime_tuning_policy
   {
     return load_modifier;
   }
+
+  void CheckLoadModifier() const
+  {
+    if (LoadModifier() == cub::CacheLoadModifier::LOAD_LDG)
+    {
+      throw std::runtime_error("The memory consistency model does not apply to texture "
+                               "accesses");
+    }
+  }
 };
 
 template <typename Tuning, int N>
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index e56f3f74dcd..85834b10a35 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -260,12 +260,9 @@ struct DispatchScan
   CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t
   Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel, ActivePolicyT policy = {})
   {
-    // TODO(ashwin): Does this now need to be a runtime check?
     // `LOAD_LDG` makes in-place execution UB and doesn't lead to better
     // performance.
-    // static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG,
-    //               "The memory consistency model does not apply to texture "
-    //               "accesses");
+    policy.CheckLoadModifier();
 
     cudaError error = cudaSuccess;
     do
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 81c462bf7fa..1d4969714e8 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -433,10 +433,17 @@ struct ScanPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(StaticPolic
     return cub::detail::MakePolicyWrapper(typename StaticPolicyT::ScanPolicyT());
   }
 
-  CUB_RUNTIME_FUNCTION constexpr CacheLoadModifier LoadModifier()
+  CUB_RUNTIME_FUNCTION static constexpr CacheLoadModifier LoadModifier()
   {
     return StaticPolicyT::ScanPolicyT::LOAD_MODIFIER;
   }
+
+  CUB_RUNTIME_FUNCTION constexpr void CheckLoadModifier()
+  {
+    static_assert(LoadModifier() != CacheLoadModifier::LOAD_LDG,
+                  "The memory consistency model does not apply to texture "
+                  "accesses");
+  }
 };
 
 template <typename PolicyT>

From c71bebe256b3778e291fc00640c9f178b6b6c3e5 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 31 Jan 2025 11:24:08 -0800
Subject: [PATCH 12/27] Introduce detail functions to allocate/initialize tile
 state

---
 cub/cub/agent/single_pass_scan_operators.cuh | 111 +++++++++++++------
 1 file changed, 76 insertions(+), 35 deletions(-)

diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index e605e4082f3..7c3bbc3677e 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -540,6 +540,65 @@ struct tile_state_with_memory_order
     return tile_state.template LoadValid<Order>(tile_idx);
   }
 };
+
+_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(int num_tiles)
+{
+  return CUB_PTX_WARP_THREADS + num_tiles;
+}
+
+_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t
+tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles)
+{
+  // Specify storage allocation requirements
+  size_t allocation_sizes[3];
+
+  int num_tile_states = num_tiles_to_num_tile_states(num_tiles);
+
+  // bytes needed for tile status descriptors
+  allocation_sizes[0] = num_tile_states * bytes_per_description;
+
+  // bytes needed for partials
+  allocation_sizes[1] = num_tile_states * bytes_per_payload;
+
+  // bytes needed for inclusives
+  allocation_sizes[2] = num_tile_states * bytes_per_payload;
+
+  // Set the necessary size of the blob
+  size_t temp_storage_bytes = 0;
+  void* allocations[3]      = {};
+  AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes);
+
+  return temp_storage_bytes;
+};
+
+_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init(
+  int bytes_per_description,
+  int bytes_per_payload,
+  int num_tiles,
+  void* d_temp_storage,
+  size_t temp_storage_bytes,
+  void* (&allocations)[3])
+{
+  // Specify storage allocation requirements
+  size_t allocation_sizes[3];
+
+  int num_tile_states = num_tiles_to_num_tile_states(num_tiles);
+
+  // bytes needed for tile status descriptors
+  allocation_sizes[0] = num_tile_states * bytes_per_description;
+
+  // bytes needed for partials
+  allocation_sizes[1] = num_tile_states * bytes_per_payload;
+
+  // bytes needed for inclusives
+  allocation_sizes[2] = num_tile_states * bytes_per_payload;
+
+  // Set the necessary size of the blob
+  AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
+
+  return cudaSuccess;
+}
+
 } // namespace detail
 
 /**
@@ -583,6 +642,9 @@ struct ScanTileState<T, true>
   // Device storage
   TxnWord* d_tile_descriptors;
 
+  static constexpr size_t description_bytes_per_tile = sizeof(TxnWord);
+  static constexpr size_t payload_bytes_per_tile     = 0;
+
   /// Constructor
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState()
       : d_tile_descriptors(nullptr)
@@ -618,10 +680,11 @@ struct ScanTileState<T, true>
    * @param[out] temp_storage_bytes
    *   Size in bytes of \t d_temp_storage allocation
    */
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t
+  AllocationSize(int num_tiles, size_t& temp_storage_bytes)
   {
-    // bytes needed for tile status descriptors
-    temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord);
+    temp_storage_bytes =
+      detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
     return cudaSuccess;
   }
 
@@ -782,6 +845,9 @@ struct ScanTileState<T, false>
   T* d_tile_partial;
   T* d_tile_inclusive;
 
+  static constexpr size_t description_bytes_per_tile = sizeof(StatusWord);
+  static constexpr size_t payload_bytes_per_tile     = sizeof(Uninitialized<T>);
+
   /// Constructor
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState()
       : d_tile_status(nullptr)
@@ -810,25 +876,12 @@ struct ScanTileState<T, false>
     do
     {
       void* allocations[3] = {};
-      size_t allocation_sizes[3];
-
-      // bytes needed for tile status descriptors
-      allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);
-
-      // bytes needed for partials
-      allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);
-
-      // bytes needed for inclusives
-      allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);
-
-      // Compute allocation pointers into the single storage blob
-      error = CubDebug(detail::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-
+      error                = detail::tile_state_init(
+        description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations);
       if (cudaSuccess != error)
       {
         break;
       }
-
       // Alias the offsets
       d_tile_status    = reinterpret_cast<StatusWord*>(allocations[0]);
       d_tile_partial   = reinterpret_cast<T*>(allocations[1]);
@@ -847,25 +900,13 @@ struct ScanTileState<T, false>
    * @param[out] temp_storage_bytes
    *   Size in bytes of \t d_temp_storage allocation
    */
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t
+  AllocationSize(int num_tiles, size_t& temp_storage_bytes)
   {
-    // Specify storage allocation requirements
-    size_t allocation_sizes[3];
-
-    // bytes needed for tile status descriptors
-    allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);
-
-    // bytes needed for partials
-    allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);
-
-    // bytes needed for inclusives
-    allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);
-
-    // Set the necessary size of the blob
-    void* allocations[3] = {};
-    return CubDebug(detail::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes));
+    temp_storage_bytes =
+      detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
+    return cudaSuccess;
   }
-
   /**
    * Initialize (from device)
    */

From 08d443eb3ccd45d75f5830a0591b66becb17cf7b Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 1 Feb 2025 05:18:59 -0800
Subject: [PATCH 13/27] Update c.parallel scan_tile_state following c++
 refactor

---
 c/parallel/src/scan.cu | 178 ++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 120 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 33e5ea9cbb9..e4e23cd0f09 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -160,9 +160,11 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu
     init_t);
 }
 
+static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)";
+
 size_t find_size_t(char* ptx, std::string_view name)
 {
-  std::regex regex(std::format(R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)", name));
+  std::regex regex(std::format(ptx_u64_assignment_regex, name));
   std::cmatch match;
   if (std::regex_search(ptx, match, regex))
   {
@@ -175,6 +177,20 @@ size_t find_size_t(char* ptx, std::string_view name)
   }
 }
 
+size_t find_size_t(char* ptx, std::string_view name, size_t default_value)
+{
+  std::regex regex(std::format(ptx_u64_assignment_regex, name));
+  std::cmatch match;
+  if (std::regex_search(ptx, match, regex))
+  {
+    return std::stoi(match[1].str());
+  }
+  else
+  {
+    return default_value;
+  }
+}
+
 struct scan_tile_state
 {
   // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except
@@ -188,95 +204,38 @@ struct scan_tile_state
   void* d_tile_partial;
   void* d_tile_inclusive;
 
-  bool is_primitive;
-  size_t status_size;
-  size_t uninitialized_size;
+  size_t description_bytes_per_tile;
+  size_t payload_bytes_per_tile;
 
-  scan_tile_state(bool is_primitive, size_t status_size, size_t uninitialized_size)
+  scan_tile_state(size_t description_bytes_per_tile, size_t payload_bytes_per_tile)
       : d_tile_status(nullptr)
       , d_tile_partial(nullptr)
       , d_tile_inclusive(nullptr)
-      , is_primitive(is_primitive)
-      , status_size(status_size)
-      , uninitialized_size(uninitialized_size)
+      , description_bytes_per_tile(description_bytes_per_tile)
+      , payload_bytes_per_tile(payload_bytes_per_tile)
   {}
 
   cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
   {
-    return is_primitive ? InitPrimitive(num_tiles, d_temp_storage, temp_storage_bytes)
-                        : InitStorage(num_tiles, d_temp_storage, temp_storage_bytes);
-  }
-
-  cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const
-  {
-    return is_primitive ? AllocationSizePrimitive(num_tiles, temp_storage_bytes)
-                        : AllocationSizeStorage(num_tiles, temp_storage_bytes);
-  }
-
-  cudaError_t InitPrimitive(int, void* d_temp_storage, size_t)
-  {
-    d_tile_status = d_temp_storage;
+    void* allocations[3] = {};
+    auto status          = cub::detail::tile_state_init(
+      description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations);
+    if (status != cudaSuccess)
+    {
+      return status;
+    }
+    d_tile_status    = allocations[0];
+    d_tile_partial   = allocations[1];
+    d_tile_inclusive = allocations[2];
     return cudaSuccess;
   }
 
-  cudaError_t AllocationSizePrimitive(int num_tiles, size_t& d_temp_storage_bytes) const
+  cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const
   {
-    d_temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * status_size;
+    temp_storage_bytes =
+      cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
     return cudaSuccess;
   }
-
-  cudaError_t InitStorage(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
-  {
-    cudaError_t error = cudaSuccess;
-    do
-    {
-      void* allocations[3] = {};
-      size_t allocation_sizes[3];
-
-      // bytes needed for tile status descriptors
-      allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size;
-
-      // bytes needed for partials
-      allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
-
-      // bytes needed for inclusives
-      allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
-
-      // Compute allocation pointers into the single storage blob
-      error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-
-      if (cudaSuccess != error)
-      {
-        break;
-      }
-
-      // Alias the offsets
-      d_tile_status    = allocations[0];
-      d_tile_partial   = allocations[1];
-      d_tile_inclusive = allocations[2];
-    } while (0);
-
-    return error;
-  }
-
-  cudaError_t AllocationSizeStorage(int num_tiles, size_t& d_temp_storage_bytes) const
-  {
-    // Specify storage allocation requirements
-    size_t allocation_sizes[3];
-
-    // bytes needed for tile status descriptors
-    allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size;
-
-    // bytes needed for partials
-    allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
-
-    // bytes needed for inclusives
-    allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size;
-
-    // Set the necessary size of the blob
-    void* allocations[3] = {};
-    return CubDebug(cub::AliasTemporaries(nullptr, d_temp_storage_bytes, allocations, allocation_sizes));
-  }
 };
 
 template <auto* GetPolicy>
@@ -444,50 +403,29 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
     constexpr size_t num_ptx_lto_args       = 3;
     const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"};
 
-    size_t status_size{0};
-    size_t uninitialized_size{0};
-    if (accum_t.type == cccl_type_enum::STORAGE)
-    {
-      std::string src = std::format(
-        "#include <cub/agent/single_pass_scan_operators.cuh>\n"
-        "#include <cub/util_type.cuh>\n"
-        "struct __align__({1}) storage_t {{\n"
-        "  char data[{0}];\n"
-        "}};\n"
-        "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{2}>::StatusWord);\n"
-        "__device__ size_t uninitialized_size = sizeof(cub::Uninitialized<{2}>);\n",
-        accum_t.size,
-        accum_t.alignment,
-        accum_cpp);
-      auto compile_result =
-        make_nvrtc_command_list()
-          .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"})
-          .compile_program({ptx_args, num_ptx_args})
-          .cleanup_program()
-          .finalize_program(num_ptx_lto_args, ptx_lopts);
-      auto ptx_code      = compile_result.cubin.get();
-      status_size        = scan::find_size_t(ptx_code, "status_size");
-      uninitialized_size = scan::find_size_t(ptx_code, "uninitialized_size");
-    }
-    else
-    {
-      std::string src = std::format(
-        "#include <cub/agent/single_pass_scan_operators.cuh>\n"
-        "#include <cub/util_type.cuh>\n"
-        "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{0}>::TxnWord);\n",
-        accum_cpp);
-      auto compile_result =
-        make_nvrtc_command_list()
-          .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"})
-          .compile_program({ptx_args, num_ptx_args})
-          .cleanup_program()
-          .finalize_program(num_ptx_lto_args, ptx_lopts);
-      auto ptx_code = compile_result.cubin.get();
-      status_size   = scan::find_size_t(ptx_code, "status_size");
-    }
-
-    bool is_primitive = not(accum_t.type == cccl_type_enum::STORAGE);
-    auto tile_state   = std::make_unique<scan::scan_tile_state>(is_primitive, status_size, uninitialized_size);
+    size_t description_bytes_per_tile;
+    size_t payload_bytes_per_tile;
+    std::string ptx_src = std::format(
+      "#include <cub/agent/single_pass_scan_operators.cuh>\n"
+      "#include <cub/util_type.cuh>\n"
+      "struct __align__({1}) storage_t {{\n"
+      "  char data[{0}];\n"
+      "}};\n"
+      "__device__ size_t description_bytes_per_tile = cub::ScanTileState<{2}>::description_bytes_per_tile;\n"
+      "__device__ size_t payload_bytes_per_tile = cub::ScanTileState<{2}>::payload_bytes_per_tile;\n",
+      accum_t.size,
+      accum_t.alignment,
+      accum_cpp);
+    auto compile_result =
+      make_nvrtc_command_list()
+        .add_program(nvrtc_translation_unit{ptx_src.c_str(), "tile_state_info"})
+        .compile_program({ptx_args, num_ptx_args})
+        .cleanup_program()
+        .finalize_program(num_ptx_lto_args, ptx_lopts);
+    auto ptx_code              = compile_result.cubin.get();
+    description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile");
+    payload_bytes_per_tile     = scan::find_size_t(ptx_code, "payload_bytes_per_tile", 0);
+    auto tile_state = std::make_unique<scan::scan_tile_state>(description_bytes_per_tile, payload_bytes_per_tile);
 
     build->cc               = cc;
     build->cubin            = (void*) result.cubin.release();

From 4060f59581a26dd08219489f1f901dac3747c7bd Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Mon, 3 Feb 2025 06:56:27 -0500
Subject: [PATCH 14/27] Update cub/cub/thread/thread_store.cuh

Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
---
 cub/cub/thread/thread_store.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh
index d4859c2c174..176032df891 100644
--- a/cub/cub/thread/thread_store.cuh
+++ b/cub/cub/thread/thread_store.cuh
@@ -358,8 +358,7 @@ ThreadStore(T* ptr, T val, detail::constant_t<MODIFIER> /*modifier*/, ::cuda::st
 template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
 _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val)
 {
-  ThreadStore(
-    itr, val, detail::constant_v<MODIFIER>, detail::bool_constant_v<::cuda::std::is_pointer_v<OutputIteratorT>>);
+  ThreadStore(itr, val, Int2Type<MODIFIER>(), Int2Type<::cuda::std::is_pointer<OutputIteratorT>::value>());
 }
 
 #endif // _CCCL_DOXYGEN_INVOKED

From 0a9b9b51955ef1b2a1d23f7ed7c23a0b0ea1d17d Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 3 Feb 2025 10:25:02 -0500
Subject: [PATCH 15/27] No initialize-then-modify

---
 cub/cub/agent/single_pass_scan_operators.cuh | 39 +++++++-------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index 7c3bbc3677e..a23487aed97 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -549,20 +549,14 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(i
 _CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t
 tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles)
 {
-  // Specify storage allocation requirements
-  size_t allocation_sizes[3];
-
   int num_tile_states = num_tiles_to_num_tile_states(num_tiles);
-
-  // bytes needed for tile status descriptors
-  allocation_sizes[0] = num_tile_states * bytes_per_description;
-
-  // bytes needed for partials
-  allocation_sizes[1] = num_tile_states * bytes_per_payload;
-
-  // bytes needed for inclusives
-  allocation_sizes[2] = num_tile_states * bytes_per_payload;
-
+  size_t allocation_sizes[]{
+    // bytes needed for tile status descriptors
+    static_cast<size_t>(num_tile_states * bytes_per_description),
+    // bytes needed for partials
+    static_cast<size_t>(num_tile_states * bytes_per_payload),
+    // bytes needed for inclusives
+    static_cast<size_t>(num_tile_states * bytes_per_payload)};
   // Set the necessary size of the blob
   size_t temp_storage_bytes = 0;
   void* allocations[3]      = {};
@@ -579,19 +573,14 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init(
   size_t temp_storage_bytes,
   void* (&allocations)[3])
 {
-  // Specify storage allocation requirements
-  size_t allocation_sizes[3];
-
   int num_tile_states = num_tiles_to_num_tile_states(num_tiles);
-
-  // bytes needed for tile status descriptors
-  allocation_sizes[0] = num_tile_states * bytes_per_description;
-
-  // bytes needed for partials
-  allocation_sizes[1] = num_tile_states * bytes_per_payload;
-
-  // bytes needed for inclusives
-  allocation_sizes[2] = num_tile_states * bytes_per_payload;
+  size_t allocation_sizes[]{
+    // bytes needed for tile status descriptors
+    static_cast<size_t>(num_tile_states * bytes_per_description),
+    // bytes needed for partials
+    static_cast<size_t>(num_tile_states * bytes_per_payload),
+    // bytes needed for inclusives
+    static_cast<size_t>(num_tile_states * bytes_per_payload)};
 
   // Set the necessary size of the blob
   AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

From 39d6a1637d43b3eac0c513b886f5cb6e4e3f4bf5 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 4 Feb 2025 11:28:57 -0500
Subject: [PATCH 16/27] Use enum rather than bool

---
 c/parallel/src/scan.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index e4e23cd0f09..1eb86eb64ab 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -471,7 +471,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan(
       ::cuda::std::size_t,
       void,
       scan::dynamic_scan_policy_t<&scan::get_policy>,
-      false,
+      cub::ForceInclusive::No,
       scan::scan_kernel_source,
       cub::detail::CudaDriverLauncherFactory>::
       Dispatch(

From 02f6512ff4a764a75c3354e52f3b3b2dbf817f69 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:51:40 -0500
Subject: [PATCH 17/27] Return a std::optional from find_size_t

---
 c/parallel/src/scan.cu | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 1eb86eb64ab..529364d4226 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -9,6 +9,7 @@
 
 #include <format>
 #include <iostream>
+#include <optional>
 #include <regex>
 #include <string>
 #include <type_traits>
@@ -162,7 +163,7 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu
 
 static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)";
 
-size_t find_size_t(char* ptx, std::string_view name)
+std::optional<size_t> find_size_t(char* ptx, std::string_view name)
 {
   std::regex regex(std::format(ptx_u64_assignment_regex, name));
   std::cmatch match;
@@ -171,24 +172,7 @@ size_t find_size_t(char* ptx, std::string_view name)
     auto result = std::stoi(match[1].str());
     return result;
   }
-  else
-  {
-    throw std::runtime_error(std::format("Could not find {} in PTX code", name));
-  }
-}
-
-size_t find_size_t(char* ptx, std::string_view name, size_t default_value)
-{
-  std::regex regex(std::format(ptx_u64_assignment_regex, name));
-  std::cmatch match;
-  if (std::regex_search(ptx, match, regex))
-  {
-    return std::stoi(match[1].str());
-  }
-  else
-  {
-    return default_value;
-  }
+  return std::nullopt;
 }
 
 struct scan_tile_state
@@ -403,8 +387,6 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
     constexpr size_t num_ptx_lto_args       = 3;
     const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"};
 
-    size_t description_bytes_per_tile;
-    size_t payload_bytes_per_tile;
     std::string ptx_src = std::format(
       "#include <cub/agent/single_pass_scan_operators.cuh>\n"
       "#include <cub/util_type.cuh>\n"
@@ -422,9 +404,21 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
         .compile_program({ptx_args, num_ptx_args})
         .cleanup_program()
         .finalize_program(num_ptx_lto_args, ptx_lopts);
-    auto ptx_code              = compile_result.cubin.get();
-    description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile");
-    payload_bytes_per_tile     = scan::find_size_t(ptx_code, "payload_bytes_per_tile", 0);
+    auto ptx_code = compile_result.cubin.get();
+
+    size_t description_bytes_per_tile;
+    size_t payload_bytes_per_tile;
+    auto maybe_description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile");
+    if (maybe_description_bytes_per_tile)
+    {
+      description_bytes_per_tile = maybe_description_bytes_per_tile.value();
+    }
+    else
+    {
+      throw std::runtime_error("Failed to find description_bytes_per_tile in PTX");
+    }
+    payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0);
+
     auto tile_state = std::make_unique<scan::scan_tile_state>(description_bytes_per_tile, payload_bytes_per_tile);
 
     build->cc               = cc;

From 2f2cea1f6d54488c102e604bc1e54df1cf34d715 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:52:43 -0500
Subject: [PATCH 18/27] Annotate arguments with their positions

---
 c/parallel/src/scan.cu | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 529364d4226..8632aef9664 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -149,16 +149,16 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu
   auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t);
   return std::format(
     "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>",
-    chained_policy_t,
-    input_iterator_t,
-    output_iterator_t,
-    tile_state_t,
-    scan_op_t,
-    init_t,
-    offset_t,
-    accum_cpp_t,
-    "false", // for now, always exclusive
-    init_t);
+    chained_policy_t, // 0
+    input_iterator_t, // 1
+    output_iterator_t, // 2
+    tile_state_t, // 3
+    scan_op_t, // 4
+    init_t, // 5
+    offset_t, // 6
+    accum_cpp_t, // 7
+    "false", // 8 - for now, always exclusive
+    init_t); // 9
 }
 
 static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)";

From 6ab362875f473a051cf00d41196040f017b90238 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:54:17 -0500
Subject: [PATCH 19/27] Minor improvements to command_list

---
 c/parallel/src/nvrtc/command_list.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h
index 719d9c5bf88..f1003dcd18c 100644
--- a/c/parallel/src/nvrtc/command_list.h
+++ b/c/parallel/src/nvrtc/command_list.h
@@ -156,16 +156,16 @@ struct nvrtc_command_list_visitor
 
     check(jitlink_error);
 
-    bool ptx    = false;
-    auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size);
+    bool output_ptx = false;
+    auto result     = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size);
     if (result != NVJITLINK_SUCCESS)
     {
-      ptx    = true;
-      result = nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size);
+      output_ptx = true;
+      check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size));
     }
     cleanup.cubin_ref.cubin = std::unique_ptr<char[]>(new char[cleanup.cubin_ref.size]);
 
-    if (ptx)
+    if (output_ptx)
     {
       check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.cubin_ref.cubin.get()));
     }

From ff416380eafce0d565eb5a04e5ab701ef451c91e Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:59:15 -0500
Subject: [PATCH 20/27] Rename cubin->link_result

---
 c/parallel/src/for.cu               |  6 +++---
 c/parallel/src/nvrtc/command_list.h | 24 ++++++++++++------------
 c/parallel/src/reduce.cu            |  6 +++---
 c/parallel/src/scan.cu              |  8 ++++----
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/c/parallel/src/for.cu b/c/parallel/src/for.cu
index 0abfb2a522c..2a5f412d43b 100644
--- a/c/parallel/src/for.cu
+++ b/c/parallel/src/for.cu
@@ -111,7 +111,7 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
         .cleanup_program()
         .add_link({op.ltoir, op.ltoir_size});
 
-    nvrtc_cubin result{};
+    nvrtc_link_result result{};
 
     if (cccl_iterator_kind_t::iterator == d_data.type)
     {
@@ -124,11 +124,11 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
       result = cl.finalize_program(num_lto_args, lopts);
     }
 
-    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
     check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));
 
     build->cc         = cc;
-    build->cubin      = (void*) result.cubin.release();
+    build->cubin      = (void*) result.data.release();
     build->cubin_size = result.size;
   }
   catch (...)
diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h
index f1003dcd18c..a3d3724b35f 100644
--- a/c/parallel/src/nvrtc/command_list.h
+++ b/c/parallel/src/nvrtc/command_list.h
@@ -22,9 +22,9 @@
 #include <nvrtc.h>
 #include <util/errors.h>
 
-struct nvrtc_cubin
+struct nvrtc_link_result
 {
-  std::unique_ptr<char[]> cubin{};
+  std::unique_ptr<char[]> data{};
   size_t size;
 };
 
@@ -57,7 +57,7 @@ struct nvrtc_ltoir
 using nvrtc_ltoir_list = std::vector<nvrtc_ltoir>;
 struct nvrtc_jitlink_cleanup
 {
-  nvrtc_cubin& cubin_ref;
+  nvrtc_link_result& link_result_ref;
 };
 
 struct nvrtc_jitlink
@@ -157,21 +157,21 @@ struct nvrtc_command_list_visitor
     check(jitlink_error);
 
     bool output_ptx = false;
-    auto result     = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size);
+    auto result     = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.link_result_ref.size);
     if (result != NVJITLINK_SUCCESS)
     {
       output_ptx = true;
-      check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size));
+      check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.link_result_ref.size));
     }
-    cleanup.cubin_ref.cubin = std::unique_ptr<char[]>(new char[cleanup.cubin_ref.size]);
+    cleanup.link_result_ref.data = std::unique_ptr<char[]>(new char[cleanup.link_result_ref.size]);
 
     if (output_ptx)
     {
-      check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.cubin_ref.cubin.get()));
+      check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.link_result_ref.data.get()));
     }
     else
     {
-      check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get()));
+      check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.link_result_ref.data.get()));
     }
   }
 };
@@ -245,13 +245,13 @@ struct nvrtc_sm_top_level
   }
 
   // Execute steps and link unit
-  nvrtc_cubin finalize_program(uint32_t numLtoOpts, const char** ltoOpts)
+  nvrtc_link_result finalize_program(uint32_t numLtoOpts, const char** ltoOpts)
   {
-    nvrtc_cubin cubin{};
-    nvrtc_jitlink_cleanup cleanup{cubin};
+    nvrtc_link_result link_result{};
+    nvrtc_jitlink_cleanup cleanup{link_result};
     nvrtc_jitlink jl(numLtoOpts, ltoOpts);
     std::apply(nvrtc_command_list_visitor{jl}, nvrtc_command_list_append(std::move(cl), std::move(cleanup)));
-    return cubin;
+    return link_result;
   }
 };
 
diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu
index f39c8a9f944..bc38090bcc2 100644
--- a/c/parallel/src/reduce.cu
+++ b/c/parallel/src/reduce.cu
@@ -348,7 +348,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
       ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size});
     }
 
-    nvrtc_cubin result =
+    nvrtc_link_result result =
       make_nvrtc_command_list()
         .add_program(nvrtc_translation_unit{src.c_str(), name})
         .add_expression({single_tile_kernel_name})
@@ -362,14 +362,14 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
         .add_link_list(ltoir_list)
         .finalize_program(num_lto_args, lopts);
 
-    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
     check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(
       &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name.c_str()));
 
     build->cc               = cc;
-    build->cubin            = (void*) result.cubin.release();
+    build->cubin            = (void*) result.data.release();
     build->cubin_size       = result.size;
     build->accumulator_size = accum_t.size;
   }
diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 8632aef9664..45201f4b760 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -366,7 +366,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
       ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size});
     }
 
-    nvrtc_cubin result =
+    nvrtc_link_result result =
       make_nvrtc_command_list()
         .add_program(nvrtc_translation_unit{src.c_str(), name})
         .add_expression({init_kernel_name})
@@ -378,7 +378,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
         .add_link_list(ltoir_list)
         .finalize_program(num_lto_args, lopts);
 
-    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
     check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str()));
 
@@ -404,7 +404,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
         .compile_program({ptx_args, num_ptx_args})
         .cleanup_program()
         .finalize_program(num_ptx_lto_args, ptx_lopts);
-    auto ptx_code = compile_result.cubin.get();
+    auto ptx_code = compile_result.data.get();
 
     size_t description_bytes_per_tile;
     size_t payload_bytes_per_tile;
@@ -422,7 +422,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
     auto tile_state = std::make_unique<scan::scan_tile_state>(description_bytes_per_tile, payload_bytes_per_tile);
 
     build->cc               = cc;
-    build->cubin            = (void*) result.cubin.release();
+    build->cubin            = (void*) result.data.release();
     build->cubin_size       = result.size;
     build->accumulator_type = accum_t;
     build->tile_state       = (void*) tile_state.release();

From c4102fdfeddb6800e6faed589ac69a41a30d9e1a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 4 Feb 2025 14:08:51 -0500
Subject: [PATCH 21/27] Add a TODO for removing extra compile step

---
 c/parallel/src/scan.cu | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 45201f4b760..a81e428439a 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -161,6 +161,13 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu
     init_t); // 9
 }
 
+// TODO: NVRTC doesn't currently support extracting basic type
+// information (e.g., type sizes and alignments) from compiled
+// LTO-IR. So we separately compile a small PTX file that defines the
+// necessary types and constants and grep it for the required
+// information. If/when NVRTC adds these features, we can remove this
+// extra compilation step and get the information directly from the
+// LTO-IR.
 static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)";
 
 std::optional<size_t> find_size_t(char* ptx, std::string_view name)

From e3a2e7596ab6e228297c68d73a9c56bef4dcfb46 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 7 Feb 2025 06:58:16 -0500
Subject: [PATCH 22/27] Bad merge

---
 cub/cub/thread/thread_store.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh
index 176032df891..d4859c2c174 100644
--- a/cub/cub/thread/thread_store.cuh
+++ b/cub/cub/thread/thread_store.cuh
@@ -358,7 +358,8 @@ ThreadStore(T* ptr, T val, detail::constant_t<MODIFIER> /*modifier*/, ::cuda::st
 template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
 _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val)
 {
-  ThreadStore(itr, val, Int2Type<MODIFIER>(), Int2Type<::cuda::std::is_pointer<OutputIteratorT>::value>());
+  ThreadStore(
+    itr, val, detail::constant_v<MODIFIER>, detail::bool_constant_v<::cuda::std::is_pointer_v<OutputIteratorT>>);
 }
 
 #endif // _CCCL_DOXYGEN_INVOKED

From fd63f70abe633faf2c37b477e622fd6d07746306 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 7 Feb 2025 07:28:51 -0500
Subject: [PATCH 23/27] Pass thrust path to PTX compile step

---
 c/parallel/src/scan.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index a81e428439a..1777876182f 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -389,8 +389,9 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
     check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str()));
 
-    constexpr size_t num_ptx_args           = 5;
-    const char* ptx_args[num_ptx_args]      = {arch.c_str(), cub_path, libcudacxx_path, "-rdc=true", "-dlto"};
+    constexpr size_t num_ptx_args      = 7;
+    const char* ptx_args[num_ptx_args] = {
+      arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
     constexpr size_t num_ptx_lto_args       = 3;
     const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"};
 

From 9fe8dfed5fa5660f5b0426313fa6aeb10724d28b Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 10 Feb 2025 10:43:45 -0500
Subject: [PATCH 24/27] Fixes following merge from main

---
 c/parallel/src/kernels/operators.cpp | 2 +-
 c/parallel/src/kernels/operators.h   | 2 +-
 c/parallel/src/merge_sort.cu         | 6 +++---
 c/parallel/src/reduce.cu             | 2 +-
 c/parallel/src/scan.cu               | 8 +++++---
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/c/parallel/src/kernels/operators.cpp b/c/parallel/src/kernels/operators.cpp
index 87dc5ce6d30..bbde349993e 100644
--- a/c/parallel/src/kernels/operators.cpp
+++ b/c/parallel/src/kernels/operators.cpp
@@ -71,7 +71,7 @@ make_kernel_binary_operator_full_source(std::string_view input_t, cccl_op_t oper
       : std::format(stateful_binary_op_template, return_type));
 }
 
-std::string make_kernel_user_arithmetic_operator(std::string_view input_t, cccl_op_t operation)
+std::string make_kernel_user_binary_operator(std::string_view input_t, cccl_op_t operation)
 {
   return make_kernel_binary_operator_full_source(input_t, operation, "VALUE_T");
 }
diff --git a/c/parallel/src/kernels/operators.h b/c/parallel/src/kernels/operators.h
index 2e8e11df39e..2e269857572 100644
--- a/c/parallel/src/kernels/operators.h
+++ b/c/parallel/src/kernels/operators.h
@@ -14,6 +14,6 @@
 
 #include <cccl/c/types.h>
 
-std::string make_kernel_user_arithmetic_operator(std::string_view input_value_t, cccl_op_t operation);
+std::string make_kernel_user_binary_operator(std::string_view input_value_t, cccl_op_t operation);
 
 std::string make_kernel_user_comparison_operator(std::string_view input_value_t, cccl_op_t operation);
diff --git a/c/parallel/src/merge_sort.cu b/c/parallel/src/merge_sort.cu
index 4ff17376e4f..edb03ceca17 100644
--- a/c/parallel/src/merge_sort.cu
+++ b/c/parallel/src/merge_sort.cu
@@ -410,7 +410,7 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build(
       ltoir_list_append({output_items_it.dereference.ltoir, output_items_it.dereference.ltoir_size});
     }
 
-    nvrtc_cubin result =
+    nvrtc_link_result result =
       make_nvrtc_command_list()
         .add_program(nvrtc_translation_unit{src.c_str(), name})
         .add_expression({block_sort_kernel_name})
@@ -424,13 +424,13 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build(
         .add_link_list(ltoir_list)
         .finalize_program(num_lto_args, lopts);
 
-    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
     check(cuLibraryGetKernel(&build->block_sort_kernel, build->library, block_sort_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(&build->partition_kernel, build->library, partition_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(&build->merge_kernel, build->library, merge_kernel_lowered_name.c_str()));
 
     build->cc         = cc;
-    build->cubin      = (void*) result.cubin.release();
+    build->cubin      = (void*) result.data.release();
     build->cubin_size = result.size;
   }
   catch (const std::exception& exc)
diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu
index bc38090bcc2..09ee9268e92 100644
--- a/c/parallel/src/reduce.cu
+++ b/c/parallel/src/reduce.cu
@@ -273,7 +273,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
     const std::string output_iterator_src =
       make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it);
 
-    const std::string op_src = make_kernel_user_arithmetic_operator(accum_cpp, op);
+    const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op);
 
     const std::string src = std::format(
       "#include <cub/block/block_reduce.cuh>\n"
diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 1777876182f..844097a4867 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -294,8 +294,10 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
     const auto input_it_value_t  = cccl_type_enum_to_string(input_it.value_type.type);
     const auto offset_t          = cccl_type_enum_to_string(cccl_type_enum::UINT64);
 
-    const std::string input_iterator_src  = make_kernel_input_iterator(offset_t, input_it_value_t, input_it);
-    const std::string output_iterator_src = make_kernel_output_iterator(offset_t, accum_cpp, output_it);
+    const std::string input_iterator_src =
+      make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it);
+    const std::string output_iterator_src =
+      make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it);
 
     const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op);
 
@@ -472,8 +474,8 @@ extern "C" CCCL_C_API CUresult cccl_device_scan(
       indirect_arg_t,
       ::cuda::std::size_t,
       void,
-      scan::dynamic_scan_policy_t<&scan::get_policy>,
       cub::ForceInclusive::No,
+      scan::dynamic_scan_policy_t<&scan::get_policy>,
       scan::scan_kernel_source,
       cub::detail::CudaDriverLauncherFactory>::
       Dispatch(

From f0b1ed8c86ccc9219c301f50f364d844462d77d5 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 10 Feb 2025 19:25:34 -0500
Subject: [PATCH 25/27] Return error from AliasTemporaries

---
 cub/cub/agent/single_pass_scan_operators.cuh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index a23487aed97..2c41f8f6036 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -583,9 +583,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init(
     static_cast<size_t>(num_tile_states * bytes_per_payload)};
 
   // Set the necessary size of the blob
-  AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
-
-  return cudaSuccess;
+  return AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 }
 
 } // namespace detail

From f20219b479501b55066c5ecc2f7767b108fbe53c Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 10 Feb 2025 19:25:42 -0500
Subject: [PATCH 26/27] Fix SFINAE

---
 cub/cub/device/dispatch/kernels/scan.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh
index 7cb71ca49e7..b5b89d68756 100644
--- a/cub/cub/device/dispatch/kernels/scan.cuh
+++ b/cub/cub/device/dispatch/kernels/scan.cuh
@@ -162,8 +162,7 @@ template <typename ChainedPolicyT,
           typename OffsetT,
           typename AccumT,
           bool ForceInclusive,
-          typename RealInitValueT = ::cuda::std::
-            _If<::cuda::std::is_void_v<typename InitValueT::value_type>, InitValueT, typename InitValueT::value_type>>
+          typename RealInitValueT = typename InitValueT::value_type>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
     InputIteratorT d_in,

From 7a83fbc990af0395fde43721a72f31d931d8f0fc Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 10 Feb 2025 19:25:55 -0500
Subject: [PATCH 27/27] Store description/payload bytes_per_tile directly in
 the build obj

---
 c/parallel/include/cccl/c/scan.h |  3 ++-
 c/parallel/src/scan.cu           | 20 +++++++-------------
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h
index df9f09588a5..7dc923df7e0 100644
--- a/c/parallel/include/cccl/c/scan.h
+++ b/c/parallel/include/cccl/c/scan.h
@@ -27,7 +27,8 @@ struct cccl_device_scan_build_result_t
   cccl_type_info accumulator_type;
   CUkernel init_kernel;
   CUkernel scan_kernel;
-  void* tile_state;
+  size_t description_bytes_per_tile;
+  size_t payload_bytes_per_tile;
 };
 
 extern "C" CCCL_C_API CUresult cccl_device_scan_build(
diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 844097a4867..d5209a58046 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -261,8 +261,7 @@ struct scan_kernel_source
   }
   scan_tile_state TileState()
   {
-    auto result = (reinterpret_cast<scan_tile_state*>(build.tile_state));
-    return *result;
+    return {build.description_bytes_per_tile, build.payload_bytes_per_tile};
   }
 };
 
@@ -429,13 +428,12 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build(
     }
     payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0);
 
-    auto tile_state = std::make_unique<scan::scan_tile_state>(description_bytes_per_tile, payload_bytes_per_tile);
-
-    build->cc               = cc;
-    build->cubin            = (void*) result.data.release();
-    build->cubin_size       = result.size;
-    build->accumulator_type = accum_t;
-    build->tile_state       = (void*) tile_state.release();
+    build->cc                         = cc;
+    build->cubin                      = (void*) result.data.release();
+    build->cubin_size                 = result.size;
+    build->accumulator_type           = accum_t;
+    build->description_bytes_per_tile = description_bytes_per_tile;
+    build->payload_bytes_per_tile     = payload_bytes_per_tile;
   }
   catch (const std::exception& exc)
   {
@@ -519,12 +517,8 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_r
     {
       return CUDA_ERROR_INVALID_VALUE;
     }
-
     std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(bld_ptr->cubin));
     check(cuLibraryUnload(bld_ptr->library));
-
-    std::unique_ptr<scan::scan_tile_state> tile_state(reinterpret_cast<scan::scan_tile_state*>(bld_ptr->tile_state));
-    tile_state.reset();
   }
   catch (const std::exception& exc)
   {