diff --git a/c/parallel/include/cccl/c/reduce.h b/c/parallel/include/cccl/c/reduce.h
index f1e875087a7..2366024a532 100644
--- a/c/parallel/include/cccl/c/reduce.h
+++ b/c/parallel/include/cccl/c/reduce.h
@@ -55,4 +55,4 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
   cccl_value_t init,
   CUstream stream) noexcept;
 
-extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept;
diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h
new file mode 100644
index 00000000000..7dc923df7e0
--- /dev/null
+++ b/c/parallel/include/cccl/c/scan.h
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA Core Compute Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
+#endif // !CCCL_C_EXPERIMENTAL
+
+#include <cuda.h>
+
+#include <cccl/c/types.h>
+
+struct cccl_device_scan_build_result_t
+{
+  int cc;
+  void* cubin;
+  size_t cubin_size;
+  CUlibrary library;
+  cccl_type_info accumulator_type;
+  CUkernel init_kernel;
+  CUkernel scan_kernel;
+  size_t description_bytes_per_tile;
+  size_t payload_bytes_per_tile;
+};
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_build(
+  cccl_device_scan_build_result_t* build,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_scan(
+  cccl_device_scan_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept;
diff --git a/c/parallel/src/for.cu b/c/parallel/src/for.cu
index 0abfb2a522c..2a5f412d43b 100644
--- a/c/parallel/src/for.cu
+++ b/c/parallel/src/for.cu
@@ -111,7 +111,7 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
         .cleanup_program()
         .add_link({op.ltoir, op.ltoir_size});
 
-    nvrtc_cubin result{};
+    nvrtc_link_result result{};
 
     if (cccl_iterator_kind_t::iterator == d_data.type)
     {
@@ -124,11 +124,11 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
       result = cl.finalize_program(num_lto_args, lopts);
     }
 
-    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
     check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));
 
     build->cc         = cc;
-    build->cubin      = (void*) result.cubin.release();
+    build->cubin      = (void*) result.data.release();
     build->cubin_size = result.size;
   }
   catch (...)
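For orientation, here is a minimal host-side sketch (not part of this patch) of the intended build → size-query → launch → cleanup lifecycle for the new scan entry points, mirroring the pattern test_scan.cpp uses further down. The wrapper function name and the assumption that the caller has already prepared the cccl_iterator_t / cccl_op_t / cccl_value_t descriptors and the header search paths are illustrative only.

```cpp
// Sketch only: assumes the caller has already built the iterator/op/init
// descriptors and knows the CUB/Thrust/libcu++/CTK include paths.
#define CCCL_C_EXPERIMENTAL 1
#include <cccl/c/scan.h>

#include <cuda.h>

CUresult run_scan(cccl_iterator_t d_in, cccl_iterator_t d_out, unsigned long long num_items,
                  cccl_op_t op, cccl_value_t init, int cc_major, int cc_minor,
                  const char* cub_path, const char* thrust_path,
                  const char* libcudacxx_path, const char* ctk_path, CUstream stream)
{
  // 1. JIT-compile and link the init/scan kernels once per iterator/op/init combination.
  cccl_device_scan_build_result_t build{};
  CUresult status = cccl_device_scan_build(
    &build, d_in, d_out, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path);
  if (status != CUDA_SUCCESS) { return status; }

  // 2. A call with a null temp-storage pointer only reports the required size.
  size_t temp_storage_bytes = 0;
  status = cccl_device_scan(build, nullptr, &temp_storage_bytes, d_in, d_out, num_items, op, init, stream);

  // 3. Allocate the scratch space and run the scan for real.
  CUdeviceptr d_temp_storage = 0;
  if (status == CUDA_SUCCESS) { status = cuMemAlloc(&d_temp_storage, temp_storage_bytes); }
  if (status == CUDA_SUCCESS)
  {
    status = cccl_device_scan(build, reinterpret_cast<void*>(d_temp_storage), &temp_storage_bytes,
                              d_in, d_out, num_items, op, init, stream);
  }

  // 4. Release the scratch buffer and the cubin/library owned by the build result.
  if (d_temp_storage) { cuMemFree(d_temp_storage); }
  cccl_device_scan_cleanup(&build);
  return status;
}
```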
diff --git a/c/parallel/src/kernels/operators.cpp b/c/parallel/src/kernels/operators.cpp index 87dc5ce6d30..bbde349993e 100644 --- a/c/parallel/src/kernels/operators.cpp +++ b/c/parallel/src/kernels/operators.cpp @@ -71,7 +71,7 @@ make_kernel_binary_operator_full_source(std::string_view input_t, cccl_op_t oper : std::format(stateful_binary_op_template, return_type)); } -std::string make_kernel_user_arithmetic_operator(std::string_view input_t, cccl_op_t operation) +std::string make_kernel_user_binary_operator(std::string_view input_t, cccl_op_t operation) { return make_kernel_binary_operator_full_source(input_t, operation, "VALUE_T"); } diff --git a/c/parallel/src/kernels/operators.h b/c/parallel/src/kernels/operators.h index 2e8e11df39e..2e269857572 100644 --- a/c/parallel/src/kernels/operators.h +++ b/c/parallel/src/kernels/operators.h @@ -14,6 +14,6 @@ #include -std::string make_kernel_user_arithmetic_operator(std::string_view input_value_t, cccl_op_t operation); +std::string make_kernel_user_binary_operator(std::string_view input_value_t, cccl_op_t operation); std::string make_kernel_user_comparison_operator(std::string_view input_value_t, cccl_op_t operation); diff --git a/c/parallel/src/merge_sort.cu b/c/parallel/src/merge_sort.cu index 4ff17376e4f..edb03ceca17 100644 --- a/c/parallel/src/merge_sort.cu +++ b/c/parallel/src/merge_sort.cu @@ -410,7 +410,7 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build( ltoir_list_append({output_items_it.dereference.ltoir, output_items_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({block_sort_kernel_name}) @@ -424,13 +424,13 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->block_sort_kernel, build->library, block_sort_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->partition_kernel, build->library, partition_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->merge_kernel, build->library, merge_kernel_lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; } catch (const std::exception& exc) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index 77d83e86d47..a3d3724b35f 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -22,9 +22,9 @@ #include #include -struct nvrtc_cubin +struct nvrtc_link_result { - std::unique_ptr cubin{}; + std::unique_ptr data{}; size_t size; }; @@ -57,7 +57,7 @@ struct nvrtc_ltoir using nvrtc_ltoir_list = std::vector; struct nvrtc_jitlink_cleanup { - nvrtc_cubin& cubin_ref; + nvrtc_link_result& link_result_ref; }; struct nvrtc_jitlink @@ -156,9 +156,23 @@ struct nvrtc_command_list_visitor check(jitlink_error); - check(nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size)); - cleanup.cubin_ref.cubin = std::unique_ptr(new char[cleanup.cubin_ref.size]); - check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get())); + bool output_ptx = false; + auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, 
&cleanup.link_result_ref.size); + if (result != NVJITLINK_SUCCESS) + { + output_ptx = true; + check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.link_result_ref.size)); + } + cleanup.link_result_ref.data = std::unique_ptr(new char[cleanup.link_result_ref.size]); + + if (output_ptx) + { + check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.link_result_ref.data.get())); + } + else + { + check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.link_result_ref.data.get())); + } } }; @@ -231,13 +245,13 @@ struct nvrtc_sm_top_level } // Execute steps and link unit - nvrtc_cubin finalize_program(uint32_t numLtoOpts, const char** ltoOpts) + nvrtc_link_result finalize_program(uint32_t numLtoOpts, const char** ltoOpts) { - nvrtc_cubin cubin{}; - nvrtc_jitlink_cleanup cleanup{cubin}; + nvrtc_link_result link_result{}; + nvrtc_jitlink_cleanup cleanup{link_result}; nvrtc_jitlink jl(numLtoOpts, ltoOpts); std::apply(nvrtc_command_list_visitor{jl}, nvrtc_command_list_append(std::move(cl), std::move(cleanup))); - return cubin; + return link_result; } }; diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index da866bd6a76..09ee9268e92 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -40,6 +40,12 @@ static_assert(std::is_same_v, OffsetT>, "O struct nothing_t {}; +struct input_iterator_state_t; +struct output_iterator_t; + +namespace reduce +{ + struct reduce_runtime_tuning_policy { int block_size; @@ -107,9 +113,6 @@ static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*i return init.type; } -struct input_iterator_state_t; -struct output_iterator_t; - std::string get_input_iterator_name() { std::string iterator_t; @@ -201,6 +204,44 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it transform_op_t); } +template +struct dynamic_reduce_policy_t +{ + using MaxPolicy = dynamic_reduce_policy_t; + + template + cudaError_t Invoke(int device_ptx_version, F& op) + { + return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); + } + + cccl_type_info accumulator_type; +}; + +struct reduce_kernel_source +{ + cccl_device_reduce_build_result_t& build; + + std::size_t AccumSize() const + { + return build.accumulator_size; + } + CUkernel SingleTileKernel() const + { + return build.single_tile_kernel; + } + CUkernel SingleTileSecondKernel() const + { + return build.single_tile_second_kernel; + } + CUkernel ReductionKernel() const + { + return build.reduction_kernel; + } +}; + +} // namespace reduce + extern "C" CCCL_C_API CUresult cccl_device_reduce_build( cccl_device_reduce_build_result_t* build, cccl_iterator_t input_it, @@ -220,19 +261,19 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; - const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); - const reduce_runtime_tuning_policy policy = get_policy(cc, accum_t); - const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); - const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); - const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = reduce::get_accumulator_type(op, input_it, init); + const auto policy = reduce::get_policy(cc, accum_t); + const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); + const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const auto offset_t = 
cccl_type_enum_to_string(cccl_type_enum::UINT64); const std::string input_iterator_src = make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it); const std::string output_iterator_src = make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it); - const std::string op_src = make_kernel_user_arithmetic_operator(accum_cpp, op); + const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); const std::string src = std::format( "#include \n" @@ -271,9 +312,10 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( fflush(stdout); #endif - std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false); - std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true); - std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init); + std::string single_tile_kernel_name = reduce::get_single_tile_kernel_name(input_it, output_it, op, init, false); + std::string single_tile_second_kernel_name = + reduce::get_single_tile_kernel_name(input_it, output_it, op, init, true); + std::string reduction_kernel_name = reduce::get_device_reduce_kernel_name(op, input_it, init); std::string single_tile_kernel_lowered_name; std::string single_tile_second_kernel_lowered_name; std::string reduction_kernel_lowered_name; @@ -306,7 +348,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({single_tile_kernel_name}) @@ -320,14 +362,14 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name.c_str())); check(cuLibraryGetKernel( &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; build->accumulator_size = accum_t.size; } @@ -342,42 +384,6 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( return error; } -template -struct dynamic_reduce_policy_t -{ - using MaxPolicy = dynamic_reduce_policy_t; - - template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); - } - - cccl_type_info accumulator_type; -}; - -struct reduce_kernel_source -{ - cccl_device_reduce_build_result_t& build; - - std::size_t AccumSize() const - { - return build.accumulator_size; - } - CUkernel SingleTileKernel() const - { - return build.single_tile_kernel; - } - CUkernel SingleTileSecondKernel() const - { - return build.single_tile_second_kernel; - } - CUkernel ReductionKernel() const - { - return build.reduction_kernel; - } -}; - extern "C" CCCL_C_API CUresult cccl_device_reduce( cccl_device_reduce_build_result_t build, void* d_temp_storage, @@ -404,9 +410,9 @@ extern "C" 
CCCL_C_API CUresult cccl_device_reduce( indirect_arg_t, indirect_arg_t, void, - dynamic_reduce_policy_t<&get_policy>, + reduce::dynamic_reduce_policy_t<&reduce::get_policy>, ::cuda::std::__identity, - reduce_kernel_source, + reduce::reduce_kernel_source, cub::detail::CudaDriverLauncherFactory>:: Dispatch( d_temp_storage, @@ -420,7 +426,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( {}, {build}, cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - {get_accumulator_type(op, d_in, init)}); + {reduce::get_accumulator_type(op, d_in, init)}); } catch (const std::exception& exc) { @@ -439,7 +445,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( return error; } -extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) +extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept { try { diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu new file mode 100644 index 00000000000..d5209a58046 --- /dev/null +++ b/c/parallel/src/scan.cu @@ -0,0 +1,532 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "cub/util_device.cuh" +#include "kernels/iterators.h" +#include "kernels/operators.h" +#include "util/context.h" +#include "util/errors.h" +#include "util/indirect_arg.h" +#include "util/types.h" +#include +#include +#include + +struct op_wrapper; +struct device_scan_policy; +using OffsetT = unsigned long long; +static_assert(std::is_same_v, OffsetT>, "OffsetT must be size_t"); + +struct input_iterator_state_t; +struct output_iterator_t; + +namespace scan +{ + +struct scan_runtime_tuning_policy +{ + int block_size; + int items_per_thread; + cub::CacheLoadModifier load_modifier; + + scan_runtime_tuning_policy Scan() const + { + return *this; + } + + int ItemsPerThread() const + { + return items_per_thread; + } + + int BlockThreads() const + { + return block_size; + } + + cub::CacheLoadModifier LoadModifier() const + { + return load_modifier; + } + + void CheckLoadModifier() const + { + if (LoadModifier() == cub::CacheLoadModifier::LOAD_LDG) + { + throw std::runtime_error("The memory consistency model does not apply to texture " + "accesses"); + } + } +}; + +template +Tuning find_tuning(int cc, const Tuning (&tunings)[N]) +{ + for (const Tuning& tuning : tunings) + { + if (cc >= tuning.cc) + { + return tuning; + } + } + + return tunings[N - 1]; +} + +scan_runtime_tuning_policy get_policy(int /*cc*/, cccl_type_info /*accumulator_type*/) +{ + // TODO: we should update this once we figure out a way to reuse + // tuning logic from C++. 
Alternately, we should implement + // something better than a hardcoded default: + return {128, 4, cub::LOAD_DEFAULT}; +} + +static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init) +{ + // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet + // so switching back to the old accumulator type logic for now + return init.type; +} + +std::string get_input_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName(&iterator_t)); + return iterator_t; +} + +std::string get_output_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName(&iterator_t)); + return iterator_t; +} + +std::string +get_init_kernel_name(cccl_iterator_t input_it, cccl_iterator_t /*output_it*/, cccl_op_t op, cccl_value_t init) +{ + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + return std::format("cub::detail::scan::DeviceScanInitKernel>", accum_cpp_t); +} + +std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName(&chained_policy_t)); + + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + const std::string input_iterator_t = + (input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : scan::get_input_iterator_name()); + const std::string output_iterator_t = + output_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(output_it.value_type.type, true) // + : scan::get_output_iterator_name(); + const std::string init_t = cccl_type_enum_to_name(init.type.type); + + std::string offset_t; + check(nvrtcGetTypeName(&offset_t)); + + std::string scan_op_t; + check(nvrtcGetTypeName(&scan_op_t)); + + auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); + return std::format( + "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", + chained_policy_t, // 0 + input_iterator_t, // 1 + output_iterator_t, // 2 + tile_state_t, // 3 + scan_op_t, // 4 + init_t, // 5 + offset_t, // 6 + accum_cpp_t, // 7 + "false", // 8 - for now, always exclusive + init_t); // 9 +} + +// TODO: NVRTC doesn't currently support extracting basic type +// information (e.g., type sizes and alignments) from compiled +// LTO-IR. So we separately compile a small PTX file that defines the +// necessary types and constants and grep it for the required +// information. If/when NVRTC adds these features, we can remove this +// extra compilation step and get the information directly from the +// LTO-IR. +static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)"; + +std::optional find_size_t(char* ptx, std::string_view name) +{ + std::regex regex(std::format(ptx_u64_assignment_regex, name)); + std::cmatch match; + if (std::regex_search(ptx, match, regex)) + { + auto result = std::stoi(match[1].str()); + return result; + } + return std::nullopt; +} + +struct scan_tile_state +{ + // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except + // that it accepts the acummulator type as a runtime parameter rather than being + // templated on it. 
+ // + // Both specializations ScanTileStateT and ScanTileStateT - where the + // bool parameter indicates whether `T` is primitive - are combined into a single type. + + void* d_tile_status; // d_tile_descriptors + void* d_tile_partial; + void* d_tile_inclusive; + + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; + + scan_tile_state(size_t description_bytes_per_tile, size_t payload_bytes_per_tile) + : d_tile_status(nullptr) + , d_tile_partial(nullptr) + , d_tile_inclusive(nullptr) + , description_bytes_per_tile(description_bytes_per_tile) + , payload_bytes_per_tile(payload_bytes_per_tile) + {} + + cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) + { + void* allocations[3] = {}; + auto status = cub::detail::tile_state_init( + description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations); + if (status != cudaSuccess) + { + return status; + } + d_tile_status = allocations[0]; + d_tile_partial = allocations[1]; + d_tile_inclusive = allocations[2]; + return cudaSuccess; + } + + cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const + { + temp_storage_bytes = + cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); + return cudaSuccess; + } +}; + +template +struct dynamic_scan_policy_t +{ + using MaxPolicy = dynamic_scan_policy_t; + + template + cudaError_t Invoke(int device_ptx_version, F& op) + { + return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); + } + + cccl_type_info accumulator_type; +}; + +struct scan_kernel_source +{ + cccl_device_scan_build_result_t& build; + + std::size_t AccumSize() const + { + return build.accumulator_type.size; + } + CUkernel InitKernel() const + { + return build.init_kernel; + } + CUkernel ScanKernel() const + { + return build.scan_kernel; + } + scan_tile_state TileState() + { + return {build.description_bytes_per_tile, build.payload_bytes_per_tile}; + } +}; + +} // namespace scan + +extern "C" CCCL_C_API CUresult cccl_device_scan_build( + cccl_device_scan_build_result_t* build, + cccl_iterator_t input_it, + cccl_iterator_t output_it, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept +{ + CUresult error = CUDA_SUCCESS; + + try + { + const char* name = "test"; + + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const auto policy = scan::get_policy(cc, accum_t); + const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); + const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + + const std::string input_iterator_src = + make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it); + const std::string output_iterator_src = + make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it); + + const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); + + const std::string src = std::format( + "#include \n" + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "{4}\n" + "{5}\n" + "struct agent_policy_t {{\n" + " static constexpr int ITEMS_PER_THREAD = {2};\n" + " static constexpr int BLOCK_THREADS = {3};\n" + " static constexpr 
cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_WARP_TRANSPOSE;\n" + " static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;\n" + " static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = cub::BLOCK_STORE_WARP_TRANSPOSE;\n" + " static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS;\n" + " struct detail {{\n" + " using delay_constructor_t = cub::detail::default_delay_constructor_t<{7}>;\n" + " }};\n" + "}};\n" + "struct device_scan_policy {{\n" + " struct ActivePolicy {{\n" + " using ScanPolicyT = agent_policy_t;\n" + " }};\n" + "}};\n" + "{6};\n", + input_it.value_type.size, // 0 + input_it.value_type.alignment, // 1 + policy.items_per_thread, // 2 + policy.block_size, // 3 + input_iterator_src, // 4 + output_iterator_src, // 5 + op_src, // 6 + accum_cpp); // 7 + +#if false // CCCL_DEBUGGING_SWITCH + fflush(stderr); + printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", src.c_str()); + fflush(stdout); +#endif + + std::string init_kernel_name = scan::get_init_kernel_name(input_it, output_it, op, init); + std::string scan_kernel_name = scan::get_scan_kernel_name(input_it, output_it, op, init); + std::string init_kernel_lowered_name; + std::string scan_kernel_lowered_name; + + const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor); + + constexpr size_t num_args = 7; + const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"}; + + constexpr size_t num_lto_args = 2; + const char* lopts[num_lto_args] = {"-lto", arch.c_str()}; + + // Collect all LTO-IRs to be linked. + nvrtc_ltoir_list ltoir_list; + auto ltoir_list_append = [<oir_list](nvrtc_ltoir lto) { + if (lto.ltsz) + { + ltoir_list.push_back(std::move(lto)); + } + }; + ltoir_list_append({op.ltoir, op.ltoir_size}); + if (cccl_iterator_kind_t::iterator == input_it.type) + { + ltoir_list_append({input_it.advance.ltoir, input_it.advance.ltoir_size}); + ltoir_list_append({input_it.dereference.ltoir, input_it.dereference.ltoir_size}); + } + if (cccl_iterator_kind_t::iterator == output_it.type) + { + ltoir_list_append({output_it.advance.ltoir, output_it.advance.ltoir_size}); + ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); + } + + nvrtc_link_result result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{src.c_str(), name}) + .add_expression({init_kernel_name}) + .add_expression({scan_kernel_name}) + .compile_program({args, num_args}) + .get_name({init_kernel_name, init_kernel_lowered_name}) + .get_name({scan_kernel_name, scan_kernel_lowered_name}) + .cleanup_program() + .add_link_list(ltoir_list) + .finalize_program(num_lto_args, lopts); + + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str())); + check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str())); + + constexpr size_t num_ptx_args = 7; + const char* ptx_args[num_ptx_args] = { + arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"}; + constexpr size_t num_ptx_lto_args = 3; + const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"}; + + std::string ptx_src = std::format( + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "__device__ size_t description_bytes_per_tile = 
cub::ScanTileState<{2}>::description_bytes_per_tile;\n" + "__device__ size_t payload_bytes_per_tile = cub::ScanTileState<{2}>::payload_bytes_per_tile;\n", + accum_t.size, + accum_t.alignment, + accum_cpp); + auto compile_result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{ptx_src.c_str(), "tile_state_info"}) + .compile_program({ptx_args, num_ptx_args}) + .cleanup_program() + .finalize_program(num_ptx_lto_args, ptx_lopts); + auto ptx_code = compile_result.data.get(); + + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; + auto maybe_description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile"); + if (maybe_description_bytes_per_tile) + { + description_bytes_per_tile = maybe_description_bytes_per_tile.value(); + } + else + { + throw std::runtime_error("Failed to find description_bytes_per_tile in PTX"); + } + payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0); + + build->cc = cc; + build->cubin = (void*) result.data.release(); + build->cubin_size = result.size; + build->accumulator_type = accum_t; + build->description_bytes_per_tile = description_bytes_per_tile; + build->payload_bytes_per_tile = payload_bytes_per_tile; + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan_build(): %s\n", exc.what()); + fflush(stdout); + error = CUDA_ERROR_UNKNOWN; + } + + return error; +} + +extern "C" CCCL_C_API CUresult cccl_device_scan( + cccl_device_scan_build_result_t build, + void* d_temp_storage, + size_t* temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + CUstream stream) noexcept +{ + bool pushed = false; + CUresult error = CUDA_SUCCESS; + try + { + pushed = try_push_context(); + + CUdevice cu_device; + check(cuCtxGetDevice(&cu_device)); + auto cuda_error = cub::DispatchScan< + indirect_arg_t, + indirect_arg_t, + indirect_arg_t, + indirect_arg_t, + ::cuda::std::size_t, + void, + cub::ForceInclusive::No, + scan::dynamic_scan_policy_t<&scan::get_policy>, + scan::scan_kernel_source, + cub::detail::CudaDriverLauncherFactory>:: + Dispatch( + d_temp_storage, + *temp_storage_bytes, + d_in, + d_out, + op, + init, + num_items, + stream, + {build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, + {scan::get_accumulator_type(op, d_in, init)}); + if (cuda_error != cudaSuccess) + { + const char* errorString = cudaGetErrorString(cuda_error); // Get the error string + std::cerr << "CUDA error: " << errorString << std::endl; + } + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan(): %s\n", exc.what()); + fflush(stdout); + error = CUDA_ERROR_UNKNOWN; + } + if (pushed) + { + CUcontext cu_context; + cuCtxPopCurrent(&cu_context); + } + return error; +} + +extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept +{ + try + { + if (bld_ptr == nullptr) + { + return CUDA_ERROR_INVALID_VALUE; + } + std::unique_ptr cubin(reinterpret_cast(bld_ptr->cubin)); + check(cuLibraryUnload(bld_ptr->library)); + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan_cleanup(): %s\n", exc.what()); + fflush(stdout); + return CUDA_ERROR_UNKNOWN; + } + + return CUDA_SUCCESS; +} diff --git a/c/parallel/test/test_scan.cpp b/c/parallel/test/test_scan.cpp new file mode 100644 index 00000000000..9ce88835caf --- /dev/null +++ 
b/c/parallel/test/test_scan.cpp @@ -0,0 +1,237 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +#include "test_util.h" + +void scan(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int cc_major = deviceProp.major; + const int cc_minor = deviceProp.minor; + + const char* cub_path = TEST_CUB_PATH; + const char* thrust_path = TEST_THRUST_PATH; + const char* libcudacxx_path = TEST_LIBCUDACXX_PATH; + const char* ctk_path = TEST_CTK_PATH; + + cccl_device_scan_build_result_t build; + REQUIRE(CUDA_SUCCESS + == cccl_device_scan_build( + &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path)); + + const std::string sass = inspect_sass(build.cubin, build.cubin_size); + + REQUIRE(sass.find("LDL") == std::string::npos); + REQUIRE(sass.find("STL") == std::string::npos); + + size_t temp_storage_bytes = 0; + REQUIRE(CUDA_SUCCESS == cccl_device_scan(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + + pointer_t temp_storage(temp_storage_bytes); + + REQUIRE(CUDA_SUCCESS + == cccl_device_scan(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + REQUIRE(CUDA_SUCCESS == cccl_device_scan_cleanup(&build)); +} + +using integral_types = std::tuple; +TEMPLATE_LIST_TEST_CASE("Scan works with integral types", "[scan]", integral_types) +{ + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + const std::vector input = generate(num_items); + const std::vector output(num_items, 0); + pointer_t input_ptr(input); + pointer_t output_ptr(output); + value_t init{TestType{42}}; + + scan(input_ptr, output_ptr, num_items, op, init); + + std::vector expected(num_items, 0); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_ptr)); + } +} + +struct pair +{ + short a; + size_t b; + + bool operator==(const pair& other) const + { + return a == other.a && b == other.b; + } +}; + +TEST_CASE("Scan works with custom types", "[scan]") +{ + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + + operation_t op = make_operation( + "op", + "struct pair { short a; size_t b; };\n" + "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n" + " return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n" + "}"); + const std::vector a = generate(num_items); + const std::vector b = generate(num_items); + std::vector input(num_items); + std::vector output(num_items); + for (std::size_t i = 0; i < num_items; ++i) + { + input[i] = pair{a[i], b[i]}; + } + pointer_t input_ptr(input); + pointer_t output_ptr(output); + value_t init{pair{4, 2}}; + + scan(input_ptr, output_ptr, num_items, op, init); + + std::vector expected(num_items, {0, 0}); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value, 
[](const pair& lhs, const pair& rhs) { + return pair{short(lhs.a + rhs.a), lhs.b + rhs.b}; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_ptr)); + } +} + +struct counting_iterator_state_t +{ + int value; +}; + +TEST_CASE("Scan works with input iterators", "[scan]") +{ + const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t input_it = make_iterator( + "struct counting_iterator_state_t { int value; };\n", + {"advance", + "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n" + " state->value += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 0; + pointer_t output_it(num_items); + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + // vector storing a sequence of values 0, 1, 2, ..., num_items - 1 + std::vector input(num_items); + std::iota(input.begin(), input.end(), 0); + + std::vector expected(num_items); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_it)); + } +} + +struct transform_output_iterator_state_t +{ + int* d_output; +}; + +TEST_CASE("Scan works with output iterators", "[scan]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"advance", + "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + const std::vector input = generate(num_items); + pointer_t input_it(input); + pointer_t inner_output_it(num_items); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + std::vector expected(num_items); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + + std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) { + return x * 2; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(inner_output_it)); + } +} + +template +struct constant_iterator_state_t +{ + T value; +}; + +TEST_CASE("Scan works with input and output iterators", "[scan]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { int value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"out_advance", + "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + 
"}"}, + {"out_dereference", + "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + pointer_t inner_output_it(num_items); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + std::vector expected(num_items, 1); + std::exclusive_scan(expected.begin(), expected.end(), expected.begin(), init.value); + std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) { + return x * 2; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(inner_output_it)); + } +} diff --git a/c/parallel/test/test_util.h b/c/parallel/test/test_util.h index 3f9a25dc43e..cf7fb2851a3 100644 --- a/c/parallel/test/test_util.h +++ b/c/parallel/test/test_util.h @@ -27,6 +27,7 @@ #include #include #include +#include #include static std::string inspect_sass(const void* cubin, size_t cubin_size) diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index e605e4082f3..2c41f8f6036 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -540,6 +540,52 @@ struct tile_state_with_memory_order return tile_state.template LoadValid(tile_idx); } }; + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(int num_tiles) +{ + return CUB_PTX_WARP_THREADS + num_tiles; +} + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t +tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles) +{ + int num_tile_states = num_tiles_to_num_tile_states(num_tiles); + size_t allocation_sizes[]{ + // bytes needed for tile status descriptors + static_cast(num_tile_states * bytes_per_description), + // bytes needed for partials + static_cast(num_tile_states * bytes_per_payload), + // bytes needed for inclusives + static_cast(num_tile_states * bytes_per_payload)}; + // Set the necessary size of the blob + size_t temp_storage_bytes = 0; + void* allocations[3] = {}; + AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); + + return temp_storage_bytes; +}; + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( + int bytes_per_description, + int bytes_per_payload, + int num_tiles, + void* d_temp_storage, + size_t temp_storage_bytes, + void* (&allocations)[3]) +{ + int num_tile_states = num_tiles_to_num_tile_states(num_tiles); + size_t allocation_sizes[]{ + // bytes needed for tile status descriptors + static_cast(num_tile_states * bytes_per_description), + // bytes needed for partials + static_cast(num_tile_states * bytes_per_payload), + // bytes needed for inclusives + static_cast(num_tile_states * bytes_per_payload)}; + + // Set the necessary size of the blob + return AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); +} + } // namespace detail /** @@ -583,6 +629,9 @@ struct ScanTileState // Device storage TxnWord* d_tile_descriptors; + static constexpr size_t description_bytes_per_tile = sizeof(TxnWord); + static constexpr size_t payload_bytes_per_tile = 0; + /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_descriptors(nullptr) @@ -618,10 +667,11 @@ struct ScanTileState * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr 
cudaError_t + AllocationSize(int num_tiles, size_t& temp_storage_bytes) { - // bytes needed for tile status descriptors - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); + temp_storage_bytes = + detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); return cudaSuccess; } @@ -782,6 +832,9 @@ struct ScanTileState T* d_tile_partial; T* d_tile_inclusive; + static constexpr size_t description_bytes_per_tile = sizeof(StatusWord); + static constexpr size_t payload_bytes_per_tile = sizeof(Uninitialized); + /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_status(nullptr) @@ -810,25 +863,12 @@ struct ScanTileState do { void* allocations[3] = {}; - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // Compute allocation pointers into the single storage blob - error = CubDebug(detail::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - + error = detail::tile_state_init( + description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations); if (cudaSuccess != error) { break; } - // Alias the offsets d_tile_status = reinterpret_cast(allocations[0]); d_tile_partial = reinterpret_cast(allocations[1]); @@ -847,25 +887,13 @@ struct ScanTileState * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t + AllocationSize(int num_tiles, size_t& temp_storage_bytes) { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // Set the necessary size of the blob - void* allocations[3] = {}; - return CubDebug(detail::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes)); + temp_storage_bytes = + detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); + return cudaSuccess; } - /** * Initialize (from device) */ diff --git a/cub/cub/detail/launcher/cuda_driver.cuh b/cub/cub/detail/launcher/cuda_driver.cuh index 52c643f9707..66e2e8a87a9 100644 --- a/cub/cub/detail/launcher/cuda_driver.cuh +++ b/cub/cub/detail/launcher/cuda_driver.cuh @@ -103,6 +103,11 @@ struct CudaDriverLauncherFactory cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, kernel_fn, block_size, dynamic_smem_bytes)); } + _CCCL_HIDE_FROM_ABI cudaError_t MaxGridDimX(int& max_grid_dim_x) const + { + return static_cast(cuDeviceGetAttribute(&max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device)); + } + CUdevice device; int cc; }; diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 18b4c4f61c7..85834b10a35 100644 --- 
a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -83,7 +83,7 @@ struct DeviceScanKernelSource { using ScanTileStateT = typename cub::ScanTileState; - CUB_DEFINE_KERNEL_GETTER(ScanInitKernel, DeviceScanInitKernel) + CUB_DEFINE_KERNEL_GETTER(InitKernel, DeviceScanInitKernel) CUB_DEFINE_KERNEL_GETTER( ScanKernel, @@ -101,6 +101,11 @@ struct DeviceScanKernelSource { return sizeof(AccumT); } + + CUB_RUNTIME_FUNCTION ScanTileStateT TileState() + { + return ScanTileStateT(); + } }; } // namespace detail::scan @@ -165,9 +170,6 @@ struct DispatchScan static constexpr int INIT_KERNEL_THREADS = 128; - // The input value type - using InputT = cub::detail::value_t; - /// Device-accessible allocation of temporary storage. When nullptr, the /// required allocation size is written to \p temp_storage_bytes and no work /// is done. @@ -258,13 +260,9 @@ struct DispatchScan CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel, ActivePolicyT policy = {}) { - using ScanTileStateT = typename KernelSource::ScanTileStateT; - // `LOAD_LDG` makes in-place execution UB and doesn't lead to better // performance. - static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG, - "The memory consistency model does not apply to texture " - "accesses"); + policy.CheckLoadModifier(); cudaError error = cudaSuccess; do @@ -281,9 +279,12 @@ struct DispatchScan int tile_size = policy.Scan().BlockThreads() * policy.Scan().ItemsPerThread(); int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + auto tile_state = kernel_source.TileState(); + // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + + error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors @@ -313,7 +314,6 @@ struct DispatchScan } // Construct the tile status interface - ScanTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { @@ -346,9 +346,10 @@ struct DispatchScan // Get SM occupancy for scan_kernel int scan_sm_occupancy; - error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - policy.Scan().BlockThreads())); + error = CubDebug(launcher_factory.MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + policy.Scan().BlockThreads())); if (cudaSuccess != error) { break; @@ -397,17 +398,15 @@ struct DispatchScan } } } while (0); - return error; } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) { - using ScanTileStateT = typename KernelSource::ScanTileStateT; - auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy); + auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy); // Ensure kernels are instantiated. 
- return Invoke(kernel_source.ScanInitKernel(), kernel_source.ScanKernel(), wrapped_policy); + return Invoke(kernel_source.InitKernel(), kernel_source.ScanKernel(), wrapped_policy); } /** @@ -451,20 +450,20 @@ struct DispatchScan InitValueT init_value, OffsetT num_items, cudaStream_t stream, - KernelSource kernel_source = {}, - MaxPolicyT max_policy = {}) + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + MaxPolicyT max_policy = {}) { cudaError_t error; do { // Get PTX version int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); + error = CubDebug(launcher_factory.PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } - // Create dispatch functor DispatchScan dispatch( d_temp_storage, @@ -476,7 +475,8 @@ struct DispatchScan init_value, stream, ptx_version, - kernel_source); + kernel_source, + launcher_factory); // Dispatch to chained policy error = CubDebug(max_policy.Invoke(ptx_version, dispatch)); diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh index cc3034638bc..b5b89d68756 100644 --- a/cub/cub/device/dispatch/kernels/scan.cuh +++ b/cub/cub/device/dispatch/kernels/scan.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -159,7 +161,8 @@ template + bool ForceInclusive, + typename RealInitValueT = typename InitValueT::value_type> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( InputIteratorT d_in, @@ -170,8 +173,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) InitValueT init_value, OffsetT num_items) { - using RealInitValueT = typename InitValueT::value_type; - using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; + using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; // Thread block type for scanning input tiles using AgentScanT = detail::scan:: diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 81c462bf7fa..1d4969714e8 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -433,10 +433,17 @@ struct ScanPolicyWrapper diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index 0fb29526cad..d4859c2c174 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //-----------------------------------------------------------------------------
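Looping back to the single_pass_scan_operators.cuh changes above: exposing description_bytes_per_tile / payload_bytes_per_tile and the detail::tile_state_allocation_size / tile_state_init helpers means tile-state storage can now be sized and partitioned from plain byte counts, without naming cub::ScanTileState<AccumT> as a type at the call site — which is exactly what c/parallel's runtime scan_tile_state relies on. A rough sketch for CUDA host code follows; the wrapper function is illustrative, only the cub::detail helpers and the ScanTileState constants come from this patch.

```cpp
// Sketch: size the scan tile-state blob from runtime per-tile byte counts.
#include <cub/agent/single_pass_scan_operators.cuh>

#include <cstddef>

inline std::size_t scan_tile_state_bytes(int description_bytes_per_tile, int payload_bytes_per_tile, int num_tiles)
{
  // One warp of padding plus num_tiles tile states; one slot of status
  // descriptors plus two payload slots (partials and inclusives), aliased
  // into a single temp-storage blob.
  return cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
}

// For a statically known accumulator the byte counts come straight from the
// new ScanTileState constants, e.g.:
//   auto bytes = scan_tile_state_bytes(cub::ScanTileState<int>::description_bytes_per_tile,
//                                      cub::ScanTileState<int>::payload_bytes_per_tile,
//                                      num_tiles);
// and detail::tile_state_init(...) later carves a d_temp_storage allocation of
// that size into the three sub-allocations that ScanTileState::Init() aliases.
```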