diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index 96d6ee31fb2..606a6151dc7 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -74,28 +74,24 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
   asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory");
 }
 
-template <int BlockDim, typename T>
-_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(const T* addr, int tile_size)
+template <int BlockDim, typename It>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int tile_size)
 {
-  constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to be
-                                            // architecture dependent?
-  const int tile_size_bytes = tile_size * sizeof(T);
-  // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
-#pragma unroll 1
-  for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
-       offset += BlockDim * prefetch_byte_stride)
+  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>)
   {
-    prefetch(reinterpret_cast<const char*>(addr) + offset);
+    constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to
+                                              // be architecture dependent?
+    const int tile_size_bytes = tile_size * sizeof(value_t<It>);
+    // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
+#pragma unroll 1
+    for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
+         offset += BlockDim * prefetch_byte_stride)
+    {
+      prefetch(reinterpret_cast<const char*>(::cuda::std::to_address(begin)) + offset);
+    }
   }
 }
 
-// TODO(miscco): we should probably constrain It to not be a contiguous iterator in C++17 (and change the overload
-// above to accept any contiguous iterator)
-// overload for any iterator that is not a pointer, do nothing
-template <int BlockDim, typename It, ::cuda::std::enable_if_t<!::cuda::std::is_pointer_v<It>, int> = 0>
-_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It, int)
-{}
-
 // This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in
 // global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the
 // parameter yields a global memory address.
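// [Illustrative aside, not part of the patch] The hunk above folds an enable_if-constrained overload
// pair into a single function template: with C++17 `if constexpr`, the prefetch body is simply
// discarded for iterators without contiguous storage, so the separate do-nothing overload disappears.
// A minimal host-only sketch of the same shape, using std::is_pointer_v as a stand-in for
// THRUST_NS_QUALIFIER::is_contiguous_iterator_v (all names below are invented for the sketch):
#include <type_traits>
#include <vector>

template <typename It>
int prefetchable_bytes(It begin, int tile_size)
{
  if constexpr (std::is_pointer_v<It>) // stand-in trait for "may I touch raw addresses?"
  {
    using value_t = std::remove_pointer_t<It>;
    (void) begin; // a real implementation would issue prefetch instructions per cache line
    return tile_size * static_cast<int>(sizeof(value_t));
  }
  else
  {
    return 0; // non-contiguous iterators: nothing to prefetch, no second overload required
  }
}

int main()
{
  std::vector<float> v(32);
  return prefetchable_bytes(v.data(), 32) == 128 ? 0 : 1; // pointer path: 32 * sizeof(float)
}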
@@ -119,20 +115,13 @@ _CCCL_DEVICE void transform_kernel_impl(
   // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
   {
-    int dummy[] = {(ins += offset, 0)..., 0};
-    (void) &dummy;
+    (..., (ins += offset));
     out += offset;
   }
 
-  {
-    // TODO(bgruber): replace by fold over comma in C++17
-    // extra zero at the end handles empty packs
-    int dummy[] = {(prefetch_tile<block_dim>(THRUST_NS_QUALIFIER::raw_reference_cast(ins), tile_size), 0)..., 0};
-    (void) &dummy; // nvcc 11.1 needs extra strong unused warning suppression
-  }
+  (..., prefetch_tile<block_dim>(THRUST_NS_QUALIFIER::raw_reference_cast(ins), tile_size));
 
-  // TODO(bgruber): use `auto full_tile` and pass true_type/false_type in C++14 to strengthen the compile-time intent
-  auto process_tile = [&](bool full_tile) {
+  auto process_tile = [&](auto full_tile, auto... ins2 /* nvcc fails to compile when just using the captured ins */) {
     // ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost
     // bgruber: but A6000 and H100 show small gains without pragma
     //_Pragma("unroll 1")
@@ -142,41 +131,20 @@ _CCCL_DEVICE void transform_kernel_impl(
       if (full_tile || idx < tile_size)
       {
         // we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test)
-        out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...);
+        out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins2[idx])...);
       }
     }
   };
 
-  // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
   if (tile_stride == tile_size)
   {
-    process_tile(true);
+    process_tile(::cuda::std::true_type{}, ins...);
   }
   else
   {
-    process_tile(false);
+    process_tile(::cuda::std::false_type{}, ins...);
  }
 }
 
-// TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17.
-template <typename F, typename Tuple, ::cuda::std::size_t... Is>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence<Is...>)
-  -> decltype(::cuda::std::forward<F>(f)(::cuda::std::get<Is>(::cuda::std::forward<Tuple>(t))...))
-{
-  return ::cuda::std::forward<F>(f)(::cuda::std::get<Is>(::cuda::std::forward<Tuple>(t))...);
-}
-
-template <typename F, typename Tuple>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl(
-  ::cuda::std::forward<F>(f),
-  ::cuda::std::forward<Tuple>(t),
-  ::cuda::std::make_index_sequence<::cuda::std::tuple_size_v<::cuda::std::remove_reference_t<Tuple>>>{}))
-{
-  return poor_apply_impl(
-    ::cuda::std::forward<F>(f),
-    ::cuda::std::forward<Tuple>(t),
-    ::cuda::std::make_index_sequence<::cuda::std::tuple_size_v<::cuda::std::remove_reference_t<Tuple>>>{});
-}
-
 // Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding
 //
 // For performance considerations of memcpy_async:
@@ -254,35 +222,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one()
   return threadIdx.x < 32 && static_cast<bool>(is_elected);
 }
 
-// TODO(bgruber): inline this as lambda in C++14
-template <typename Offset, typename T>
-_CCCL_DEVICE void bulk_copy_tile(
-  ::cuda::std::uint64_t& bar,
-  int tile_stride,
-  char* smem,
-  int& smem_offset,
-  ::cuda::std::uint32_t& total_bytes_bulk_copied,
-  Offset global_offset,
-  const aligned_base_ptr<T>& aligned_ptr)
-{
-  static_assert(alignof(T) <= bulk_copy_alignment, "");
-
-  const char* src = aligned_ptr.ptr + global_offset * sizeof(T);
-  char* dst       = smem + smem_offset;
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
-
-  // TODO(bgruber): we could precompute bytes_to_copy on the host
-  const int bytes_to_copy = round_up_to_po2_multiple(
-    aligned_ptr.head_padding + static_cast<int>(sizeof(T)) * tile_stride, bulk_copy_size_multiple);
-
-  ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar);
-  total_bytes_bulk_copied += bytes_to_copy;
-
-  // add bulk_copy_alignment to make space for the next tile's head padding
-  smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
-}
-
 template <typename Offset, typename T>
 _CCCL_DEVICE void bulk_copy_tile_fallback(
   int tile_size,
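// [Illustrative aside, not part of the patch] The C++14 array-expander idiom removed above and the
// C++17 fold expression that replaces it, shown side by side on a toy pack. Both apply the operation
// once per pack element, left to right; the fold simply drops the dummy array, the trailing zero for
// empty packs, and the unused-variable suppression. Names are invented for the sketch.
#include <cstdio>

template <typename... Its>
void advance_all_cxx14(int offset, Its&... its)
{
  int dummy[] = {(its += offset, 0)..., 0}; // extra 0 handles empty packs
  (void) dummy;
}

template <typename... Its>
void advance_all_cxx17(int offset, Its&... its)
{
  (..., (its += offset)); // left fold over the comma operator, evaluated left to right
}

int main()
{
  int a = 1, b = 2;
  advance_all_cxx14(10, a, b);
  advance_all_cxx17(10, a, b);
  std::printf("%d %d\n", a, b); // prints: 21 22
}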
@@ -304,16 +243,6 @@ _CCCL_DEVICE void bulk_copy_tile_fallback(
   smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
 }
 
-// TODO(bgruber): inline this as lambda in C++14
-template <typename T>
-_CCCL_DEVICE _CCCL_FORCEINLINE const T&
-fetch_operand(int tile_stride, const char* smem, int& smem_offset, int smem_idx, const aligned_base_ptr<T>& aligned_ptr)
-{
-  const T* smem_operand_tile_base = reinterpret_cast<const T*>(smem + smem_offset + aligned_ptr.head_padding);
-  smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment;
-  return smem_operand_tile_base[smem_idx];
-}
-
 template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs>
 _CCCL_DEVICE void transform_kernel_ublkcp(
   Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr<InTs>... aligned_ptrs)
@@ -340,11 +269,28 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
       int smem_offset = 0;
       ::cuda::std::uint32_t total_copied = 0;
 
-      // TODO(bgruber): use a fold over comma in C++17
+      auto bulk_copy_tile = [&](auto aligned_ptr) {
+        using T = typename decltype(aligned_ptr)::value_type;
+        static_assert(alignof(T) <= bulk_copy_alignment, "");
+
+        const char* src = aligned_ptr.ptr + offset * sizeof(T);
+        char* dst       = smem + smem_offset;
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
+
+        // TODO(bgruber): we could precompute bytes_to_copy on the host
+        const int bytes_to_copy = round_up_to_po2_multiple(
+          aligned_ptr.head_padding + static_cast<int>(sizeof(T)) * tile_stride, bulk_copy_size_multiple);
+
+        ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar);
+        total_copied += bytes_to_copy;
+
+        // add bulk_copy_alignment to make space for the next tile's head padding
+        smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
+      };
+
       // Order of evaluation is left-to-right
-      int dummy[] = {(bulk_copy_tile(bar, tile_stride, smem, smem_offset, total_copied, offset, aligned_ptrs), 0)...,
-                     0};
-      (void) dummy;
+      (..., bulk_copy_tile(aligned_ptrs));
 
       // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet
       ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied);
@@ -360,10 +306,22 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
       // use all threads to schedule an async_memcpy
       int smem_offset = 0;
 
-      // TODO(bgruber): use a fold over comma in C++17
+      auto bulk_copy_tile_fallback = [&](auto aligned_ptr) {
+        using T      = typename decltype(aligned_ptr)::value_type;
+        const T* src = aligned_ptr.ptr_to_elements() + offset;
+        T* dst       = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding);
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");
+
+        const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size;
+        cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy);
+
+        // add bulk_copy_alignment to make space for the next tile's head padding
+        smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
+      };
+
       // Order of evaluation is left-to-right
-      int dummy[] = {(bulk_copy_tile_fallback(tile_size, tile_stride, smem, smem_offset, offset, aligned_ptrs), 0)..., 0};
-      (void) dummy;
+      (..., bulk_copy_tile_fallback(aligned_ptrs));
 
       cooperative_groups::wait(cooperative_groups::this_thread_block());
     }
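// [Illustrative aside, not part of the patch] Sketch of the refactoring applied above: a file-scope
// helper that threads `smem_offset` through a reference parameter becomes a local lambda that
// captures the running offset, and the parameter pack is walked with a comma fold, which guarantees
// left-to-right evaluation (matching the "Order of evaluation is left-to-right" comment).
// The `tile` type, byte counts, and function names are invented for the sketch.
#include <cstdio>

struct tile
{
  int bytes;
};

template <typename... Ts>
void stage_tiles(Ts... tiles)
{
  int smem_offset = 0;
  auto copy_bytes = [&](auto t) {
    std::printf("copying %d bytes at offset %d\n", t.bytes, smem_offset);
    smem_offset += t.bytes; // the capture replaces the `int& smem_offset` out-parameter
  };
  (..., copy_bytes(tiles)); // one call per kernel argument, strictly left to right
}

int main()
{
  stage_tiles(tile{128}, tile{256}); // copies 128 at offset 0, then 256 at offset 128
}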
@@ -371,8 +329,7 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
   // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
   out += offset;
 
-  // TODO(bgruber): use `auto full_tile` and pass true_type/false_type in C++14 to strengthen the compile-time intent
-  auto process_tile = [&](bool full_tile) {
+  auto process_tile = [&](auto full_tile) {
     // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
     _CCCL_PRAGMA(unroll 1)
     for (int j = 0; j < num_elem_per_thread; ++j)
@@ -380,24 +337,31 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
       const int idx = j * block_dim + threadIdx.x;
       if (full_tile || idx < tile_size)
       {
-        int smem_offset = 0;
+        int smem_offset    = 0;
+        auto fetch_operand = [&](auto aligned_ptr) {
+          using T                         = typename decltype(aligned_ptr)::value_type;
+          const T* smem_operand_tile_base = reinterpret_cast<const T*>(smem + smem_offset + aligned_ptr.head_padding);
+          smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment;
+          return smem_operand_tile_base[idx];
+        };
+
         // need to expand into a tuple for guaranteed order of evaluation
-        out[idx] = poor_apply(
-          [&](const InTs&... values) {
+        out[idx] = ::cuda::std::apply(
+          [&](auto... values) {
             return f(values...);
           },
-          ::cuda::std::tuple<const InTs&...>{fetch_operand(tile_stride, smem, smem_offset, idx, aligned_ptrs)...});
+          ::cuda::std::tuple{fetch_operand(aligned_ptrs)...});
       }
     }
   };
 
   // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
   if (tile_stride == tile_size)
   {
-    process_tile(true);
+    process_tile(::cuda::std::true_type{});
   }
   else
   {
-    process_tile(false);
+    process_tile(::cuda::std::false_type{});
   }
 }
@@ -458,30 +422,26 @@ _CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg<It>
   return arg;
 }
 
-// TODO(bgruber): make a variable template in C++14
 template <Algorithm Alg>
-using needs_aligned_ptr_t =
-  ::cuda::std::bool_constant<false
+constexpr bool needs_aligned_ptr_v =
+  false
 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
-                             || Alg == Algorithm::ublkcp
+  || Alg == Algorithm::ublkcp
 #endif // _CUB_HAS_TRANSFORM_UBLKCP
-                             >;
+  ;
 
-#ifdef _CUB_HAS_TRANSFORM_UBLKCP
-template <Algorithm Alg, typename It, ::cuda::std::enable_if_t<needs_aligned_ptr_t<Alg>::value, int> = 0>
+template <Algorithm Alg, typename It>
 _CCCL_DEVICE _CCCL_FORCEINLINE auto
 select_kernel_arg(::cuda::std::integral_constant<Algorithm, Alg>, kernel_arg<It>&& arg)
-  -> aligned_base_ptr<value_t<It>>&&
 {
-  return ::cuda::std::move(arg.aligned_ptr);
-}
+#ifdef _CUB_HAS_TRANSFORM_UBLKCP
+  if constexpr (needs_aligned_ptr_v<Alg>)
+  {
+    return ::cuda::std::move(arg.aligned_ptr);
+  }
+  else
 #endif // _CUB_HAS_TRANSFORM_UBLKCP
-
-template <Algorithm Alg, typename It, ::cuda::std::enable_if_t<!needs_aligned_ptr_t<Alg>::value, int> = 0>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto
-select_kernel_arg(::cuda::std::integral_constant<Algorithm, Alg>, kernel_arg<It>&& arg) -> It&&
-{
-  return ::cuda::std::move(arg.iterator);
+    return ::cuda::std::move(arg.iterator);
 }
 
 // There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated
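// [Illustrative aside, not part of the patch] Why the operands above are expanded into a tuple before
// calling f: function-argument evaluation order is unspecified, but the elements of a braced tuple are
// constructed left to right. Since each fetch advances a shared shared-memory offset, the tuple plus
// ::cuda::std::apply (std::apply in this host-only sketch) pins the order. Names are invented here.
#include <cstdio>
#include <tuple>

int main()
{
  int offset     = 0;
  auto fetch_one = [&](int size) {
    const int my_offset = offset; // this call's slot depends on the calls made before it
    offset += size;
    return my_offset;
  };

  // braced tuple construction evaluates fetch_one(4), fetch_one(8), fetch_one(2) left to right
  auto offsets = std::tuple{fetch_one(4), fetch_one(8), fetch_one(2)};
  std::apply([](auto... o) { std::printf("%d %d %d\n", o...); }, offsets); // prints: 0 4 12
}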
@@ -510,57 +470,11 @@ __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads)
       select_kernel_arg(alg, ::cuda::std::move(ins))...);
 }
 
-// TODO(bgruber): replace by ::cuda::std::expected in C++14
 template <typename T>
-struct PoorExpected
-{
-  alignas(T) char storage[sizeof(T)];
-  cudaError_t error;
-
-  _CCCL_HOST_DEVICE PoorExpected(T value)
-      : error(cudaSuccess)
-  {
-    new (storage) T(::cuda::std::move(value));
-  }
-
-  _CCCL_HOST_DEVICE PoorExpected(cudaError_t error)
-      : error(error)
-  {}
-
-  _CCCL_HOST_DEVICE explicit operator bool() const
-  {
-    return error == cudaSuccess;
-  }
-
-  _CCCL_HOST_DEVICE T& operator*()
-  {
-    _CCCL_DIAG_PUSH
-    _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
-    return reinterpret_cast<T&>(storage);
-    _CCCL_DIAG_POP
-  }
-
-  _CCCL_HOST_DEVICE const T& operator*() const
-  {
-    _CCCL_DIAG_PUSH
-    _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
-    return reinterpret_cast<const T&>(storage);
-    _CCCL_DIAG_POP
-  }
-
-  _CCCL_HOST_DEVICE T* operator->()
-  {
-    return &**this;
-  }
-
-  _CCCL_HOST_DEVICE const T* operator->() const
-  {
-    return &**this;
-  }
-};
+using cuda_expected = ::cuda::std::expected<T, cudaError_t>;
 
 // TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this.
-_CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
+_CCCL_HOST_DEVICE inline cuda_expected<int> get_max_shared_memory()
 {
   // gevtushenko promised me that I can assume that the stream passed to the CUB API entry point (where the kernels
   // will later be launched on) belongs to the currently active device. So we can just query the active device here.
@@ -581,7 +495,7 @@ _CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
   return max_smem;
 }
 
-_CCCL_HOST_DEVICE inline PoorExpected<int> get_sm_count()
+_CCCL_HOST_DEVICE inline cuda_expected<int> get_sm_count()
 {
   int device = 0;
   auto error = CubDebug(cudaGetDevice(&device));
@@ -657,7 +571,7 @@ struct dispatch_t
 
-  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> PoorExpected<
+  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> cuda_expected<
     ::cuda::std::
       tuple<THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron, decltype(CUB_DETAIL_TRANSFORM_KERNEL_PTR), int>>
   {
@@ -667,11 +581,11 @@ struct dispatch_t
-    auto determine_element_counts = [&]() -> PoorExpected<elem_counts> {
+    auto determine_element_counts = [&]() -> cuda_expected<elem_counts> {
       const auto max_smem = get_max_shared_memory();
       if (!max_smem)
       {
-        return max_smem.error;
+        return ::cuda::std::unexpected(max_smem.error());
       }
 
       elem_counts last_counts{};
@@ -700,7 +614,7 @@ struct dispatch_t
         if (error != cudaSuccess)
         {
-          return error;
+          return ::cuda::std::unexpected(error);
         }
 
         const int bytes_in_flight_SM = max_occupancy * tile_size * loaded_bytes_per_iter;
@@ -713,7 +627,7 @@ struct dispatch_t
 
-    PoorExpected<elem_counts> config = [&]() {
+    cuda_expected<elem_counts> config = [&]() {
      NV_IF_TARGET(NV_IS_HOST,
                   (static auto cached_config = determine_element_counts(); return cached_config;),
                   (
@@ -722,7 +636,7 @@ struct dispatch_t
    if (!config)
    {
-      return config.error;
+      return ::cuda::std::unexpected(config.error());
    }
    _CCCL_ASSERT_HOST(config->elem_per_thread > 0, "");
    _CCCL_ASSERT_HOST(config->tile_size > 0, "");
@@ -743,15 +657,13 @@ struct dispatch_t
    auto ret = configure_ublkcp_kernel();
    if (!ret)
    {
-      return ret.error;
+      return ret.error();
    }
-    // TODO(bgruber): use a structured binding in C++17
-    // auto [launcher, kernel, elem_per_thread] = *ret;
-
-    return ::cuda::std::get<0>(*ret).doit(
-      ::cuda::std::get<1>(*ret),
+    auto [launcher, kernel, elem_per_thread] = *ret;
+    return launcher.doit(
+      kernel,
      num_items,
-      ::cuda::std::get<2>(*ret),
+      elem_per_thread,
      op,
      out,
      make_aligned_base_ptr_kernel_arg(
@@ -766,22 +678,22 @@ struct dispatch_t
-    auto determine_config = [&]() -> PoorExpected<prefetch_config> {
+    auto determine_config = [&]() -> cuda_expected<prefetch_config> {
      int max_occupancy = 0;
      const auto error  = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
      if (error != cudaSuccess)
      {
-        return error;
+        return ::cuda::std::unexpected(error);
      }
 
      const auto sm_count = get_sm_count();
      if (!sm_count)
      {
-        return sm_count.error;
+        return ::cuda::std::unexpected(sm_count.error());
      }
 
      return prefetch_config{max_occupancy, *sm_count};
    };
 
-    PoorExpected<prefetch_config> config = [&]() {
+    cuda_expected<prefetch_config> config = [&]() {
      NV_IF_TARGET(
        NV_IS_HOST,
        (
@@ -794,7 +706,7 @@ struct dispatch_t
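// [Illustrative aside, not part of the patch] The error-handling shape after the PoorExpected ->
// ::cuda::std::expected switch, in isolation: cudaError_t travels in the error channel, failures are
// returned as ::cuda::std::unexpected, callers test with operator bool and read .error(), and tuple
// results are unpacked with a structured binding. The functions below are invented for the sketch;
// only the expected/unexpected usage mirrors the patch.
#include <cuda/std/expected>
#include <cuda/std/tuple>
#include <cuda_runtime_api.h>

template <typename T>
using cuda_expected = ::cuda::std::expected<T, cudaError_t>;

cuda_expected<int> query_sm_count()
{
  int device = 0;
  if (const auto error = cudaGetDevice(&device); error != cudaSuccess)
  {
    return ::cuda::std::unexpected(error); // error channel instead of a sentinel member
  }
  int sm_count = 0;
  if (const auto error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
      error != cudaSuccess)
  {
    return ::cuda::std::unexpected(error);
  }
  return sm_count;
}

cudaError_t use_config()
{
  const auto sm_count = query_sm_count();
  if (!sm_count)
  {
    return sm_count.error(); // propagate, as the dispatcher does after configure_ublkcp_kernel()
  }
  const auto [grid, block] = ::cuda::std::tuple{*sm_count * 2, 256}; // structured binding on a tuple result
  return grid > 0 && block > 0 ? cudaSuccess : cudaErrorInvalidValue;
}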