diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh
index 4c74b73baf2..bf4984f7256 100644
--- a/cub/cub/agent/agent_merge_sort.cuh
+++ b/cub/cub/agent/agent_merge_sort.cuh
@@ -43,10 +43,11 @@
 #include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>
 
-#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/core/load_iterator.h>
 
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__cccl/cuda_capabilities.h>
 
 CUB_NAMESPACE_BEGIN
 
@@ -86,7 +87,7 @@ struct AgentBlockSort
   // Types and constants
   //---------------------------------------------------------------------
 
-  static constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+  static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;
 
   using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;
 
@@ -469,7 +470,7 @@ struct AgentMerge
   struct TempStorage : Uninitialized<_TempStorage>
   {};
 
-  static constexpr bool KEYS_ONLY       = std::is_same<ValueT, NullType>::value;
+  static constexpr bool KEYS_ONLY       = ::cuda::std::is_same_v<ValueT, NullType>;
   static constexpr int BLOCK_THREADS    = Policy::BLOCK_THREADS;
   static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
   static constexpr int ITEMS_PER_TILE   = Policy::ITEMS_PER_TILE;
diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index 9b51a539303..c618e2e83a4 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -47,6 +47,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/cuda/detail/core/util.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #if __cccl_lib_mdspan
 #  include <cuda/std/__mdspan/extents.h>
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index e8cc91e8420..056522e162d 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -38,6 +38,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_merge_sort.cuh>
+#include <cub/device/dispatch/kernels/merge_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_merge_sort.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>
@@ -53,277 +54,6 @@
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail::merge_sort
-{
-
-/**
- * @brief Helper class template that provides two agent template instantiations: one instantiated with the default
- * policy and one with the fallback policy. This helps to avoid having to enlist all the agent's template parameters
- * twice: once for the default agent and once for the fallback agent
- */
-template <typename DefaultPolicyT, typename FallbackPolicyT, template <typename...> class AgentT, typename... AgentParamsT>
-struct dual_policy_agent_helper_t
-{
-  using default_agent_t  = AgentT<DefaultPolicyT, AgentParamsT...>;
-  using fallback_agent_t = AgentT<FallbackPolicyT, AgentParamsT...>;
-
-  static constexpr auto default_size  = sizeof(typename default_agent_t::TempStorage);
-  static constexpr auto fallback_size = sizeof(typename fallback_agent_t::TempStorage);
-};
-
-/**
- * @brief Helper class template for merge sort-specific virtual shared memory handling. The merge sort algorithm in its
- * current implementation relies on the fact that both the sorting as well as the merging kernels use the same tile
- * size. This circumstance needs to be respected when determining whether the fallback policy for large user types is
- * applicable: we must either use the fallback for both or for none of the two agents.
- */
-template <typename DefaultPolicyT,
-          typename KeyInputIteratorT,
-          typename ValueInputIteratorT,
-          typename KeyIteratorT,
-          typename ValueIteratorT,
-          typename OffsetT,
-          typename CompareOpT,
-          typename KeyT,
-          typename ValueT>
-class merge_sort_vsmem_helper_t
-{
-private:
-  // Default fallback policy with a smaller tile size
-  using fallback_policy_t = cub::detail::policy_wrapper_t<DefaultPolicyT, 64, 1>;
-
-  // Helper for the `AgentBlockSort` template with one member type alias for the agent template instantiated with the
-  // default policy and one instantiated with the fallback policy
-  using block_sort_helper_t = dual_policy_agent_helper_t<
-    DefaultPolicyT,
-    fallback_policy_t,
-    merge_sort::AgentBlockSort,
-    KeyInputIteratorT,
-    ValueInputIteratorT,
-    KeyIteratorT,
-    ValueIteratorT,
-    OffsetT,
-    CompareOpT,
-    KeyT,
-    ValueT>;
-  using default_block_sort_agent_t  = typename block_sort_helper_t::default_agent_t;
-  using fallback_block_sort_agent_t = typename block_sort_helper_t::fallback_agent_t;
-
-  // Helper for the `AgentMerge` template with one member type alias for the agent template instantiated with the
-  // default policy and one instantiated with the fallback policy
-  using merge_helper_t = dual_policy_agent_helper_t<
-    DefaultPolicyT,
-    fallback_policy_t,
-    merge_sort::AgentMerge,
-    KeyIteratorT,
-    ValueIteratorT,
-    OffsetT,
-    CompareOpT,
-    KeyT,
-    ValueT>;
-  using default_merge_agent_t  = typename merge_helper_t::default_agent_t;
-  using fallback_merge_agent_t = typename merge_helper_t::fallback_agent_t;
-
-  // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory
-  // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the
-  // available shared memory
-  static constexpr auto max_default_size =
-    (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size);
-  static constexpr auto max_fallback_size =
-    (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size);
-  static constexpr bool uses_fallback_policy =
-    (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block);
-
-public:
-  using policy_t = ::cuda::std::_If<uses_fallback_policy, fallback_policy_t, DefaultPolicyT>;
-  using block_sort_agent_t =
-    ::cuda::std::_If<uses_fallback_policy, fallback_block_sort_agent_t, default_block_sort_agent_t>;
-  using merge_agent_t = ::cuda::std::_If<uses_fallback_policy, fallback_merge_agent_t, default_merge_agent_t>;
-};
-template <typename ChainedPolicyT,
-          typename KeyInputIteratorT,
-          typename ValueInputIteratorT,
-          typename KeyIteratorT,
-          typename ValueIteratorT,
-          typename OffsetT,
-          typename CompareOpT,
-          typename KeyT,
-          typename ValueT>
-__launch_bounds__(
-  merge_sort_vsmem_helper_t<typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
-                            KeyInputIteratorT,
-                            ValueInputIteratorT,
-                            KeyIteratorT,
-                            ValueIteratorT,
-                            OffsetT,
-                            CompareOpT,
-                            KeyT,
-                            ValueT>::policy_t::BLOCK_THREADS)
-  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(
-    bool ping,
-    KeyInputIteratorT keys_in,
-    ValueInputIteratorT items_in,
-    KeyIteratorT keys_out,
-    ValueIteratorT items_out,
-    OffsetT keys_count,
-    KeyT* tmp_keys_out,
-    ValueT* tmp_items_out,
-    CompareOpT compare_op,
-    vsmem_t vsmem)
-{
-  using MergeSortHelperT = merge_sort_vsmem_helper_t<
-    typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
-    KeyInputIteratorT,
-    ValueInputIteratorT,
-    KeyIteratorT,
-    ValueIteratorT,
-    OffsetT,
-    CompareOpT,
-    KeyT,
-    ValueT>;
-
-  using ActivePolicyT = typename MergeSortHelperT::policy_t;
-
-  using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t;
-
-  using VSmemHelperT = vsmem_helper_impl<AgentBlockSortT>;
-
-  // Static shared memory allocation
-  __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage;
-
-  // Get temporary storage
-  typename AgentBlockSortT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem);
-
-  AgentBlockSortT agent(
-    ping,
-    temp_storage,
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in),
-    keys_count,
-    keys_out,
-    items_out,
-    tmp_keys_out,
-    tmp_items_out,
-    compare_op);
-
-  agent.Process();
-
-  // If applicable, hints to discard modified cache lines for vsmem
-  VSmemHelperT::discard_temp_storage(temp_storage);
-}
-
-// TODO(bgruber): if we put a call to cudaTriggerProgrammaticLaunchCompletion inside this kernel, the tests fail with
-// cudaErrorIllegalAddress.
-template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
-  bool ping,
-  KeyIteratorT keys_ping,
-  KeyT* keys_pong,
-  OffsetT keys_count,
-  OffsetT num_partitions,
-  OffsetT* merge_partitions,
-  CompareOpT compare_op,
-  OffsetT target_merged_tiles_number,
-  int items_per_tile)
-{
-  OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (partition_idx < num_partitions)
-  {
-    AgentPartition<KeyIteratorT, OffsetT, CompareOpT, KeyT> agent(
-      ping,
-      keys_ping,
-      keys_pong,
-      keys_count,
-      partition_idx,
-      merge_partitions,
-      compare_op,
-      target_merged_tiles_number,
-      items_per_tile,
-      num_partitions);
-
-    agent.Process();
-  }
-}
-
-template <typename ChainedPolicyT,
-          typename KeyInputIteratorT,
-          typename ValueInputIteratorT,
-          typename KeyIteratorT,
-          typename ValueIteratorT,
-          typename OffsetT,
-          typename CompareOpT,
-          typename KeyT,
-          typename ValueT>
-__launch_bounds__(
-  merge_sort_vsmem_helper_t<typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
-                            KeyInputIteratorT,
-                            ValueInputIteratorT,
-                            KeyIteratorT,
-                            ValueIteratorT,
-                            OffsetT,
-                            CompareOpT,
-                            KeyT,
-                            ValueT>::policy_t::BLOCK_THREADS)
-  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(
-    bool ping,
-    KeyIteratorT keys_ping,
-    ValueIteratorT items_ping,
-    OffsetT keys_count,
-    KeyT* keys_pong,
-    ValueT* items_pong,
-    CompareOpT compare_op,
-    OffsetT* merge_partitions,
-    OffsetT target_merged_tiles_number,
-    vsmem_t vsmem)
-{
-  using MergeSortHelperT = merge_sort_vsmem_helper_t<
-    typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
-    KeyInputIteratorT,
-    ValueInputIteratorT,
-    KeyIteratorT,
-    ValueIteratorT,
-    OffsetT,
-    CompareOpT,
-    KeyT,
-    ValueT>;
-
-  using ActivePolicyT = typename MergeSortHelperT::policy_t;
-
-  using AgentMergeT = typename MergeSortHelperT::merge_agent_t;
-
-  using VSmemHelperT = vsmem_helper_impl<AgentMergeT>;
-
-  // Static shared memory allocation
-  __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage;
-
-  // Get temporary storage
-  typename AgentMergeT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem);
-
-  AgentMergeT agent(
-    ping,
-    temp_storage,
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
-    keys_count,
-    keys_pong,
-    items_pong,
-    keys_ping,
-    items_ping,
-    compare_op,
-    merge_partitions,
-    target_merged_tiles_number);
-
-  agent.Process();
-
-  // If applicable, hints to discard modified cache lines for vsmem
-  VSmemHelperT::discard_temp_storage(temp_storage);
-}
-
-} // namespace detail::merge_sort
-
 /*******************************************************************************
  * Policy
  ******************************************************************************/
diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index f35e89a133f..ce8b1aaff66 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -31,6 +31,7 @@ _CCCL_NV_DIAG_SUPPRESS(186)
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
+#include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #include <cuda/cmath>
 #include <cuda/ptx>
diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh
new file mode 100644
index 00000000000..1065313c20d
--- /dev/null
+++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh
@@ -0,0 +1,304 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/agent/agent_merge_sort.cuh>
+#include <cub/util_policy_wrapper_t.cuh>
+#include <cub/util_vsmem.cuh>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub::core
+{
+// We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code
+template <class PtxPlan, class It>
+typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it);
+} // namespace cuda_cub::core
+
+THRUST_NAMESPACE_END
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::merge_sort
+{
+
+/**
+ * @brief Helper class template that provides two agent template instantiations: one instantiated with the default
+ * policy and one with the fallback policy. This helps to avoid having to enlist all the agent's template parameters
+ * twice: once for the default agent and once for the fallback agent
+ */
+template <typename DefaultPolicyT, typename FallbackPolicyT, template <typename...> class AgentT, typename... AgentParamsT>
+struct dual_policy_agent_helper_t
+{
+  using default_agent_t  = AgentT<DefaultPolicyT, AgentParamsT...>;
+  using fallback_agent_t = AgentT<FallbackPolicyT, AgentParamsT...>;
+
+  static constexpr auto default_size  = sizeof(typename default_agent_t::TempStorage);
+  static constexpr auto fallback_size = sizeof(typename fallback_agent_t::TempStorage);
+};
+
+/**
+ * @brief Helper class template for merge sort-specific virtual shared memory handling. The merge sort algorithm in its
+ * current implementation relies on the fact that both the sorting as well as the merging kernels use the same tile
+ * size. This circumstance needs to be respected when determining whether the fallback policy for large user types is
+ * applicable: we must either use the fallback for both or for none of the two agents.
+ */
+template <typename DefaultPolicyT,
+          typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyIteratorT,
+          typename ValueIteratorT,
+          typename OffsetT,
+          typename CompareOpT,
+          typename KeyT,
+          typename ValueT>
+class merge_sort_vsmem_helper_t
+{
+private:
+  // Default fallback policy with a smaller tile size
+  using fallback_policy_t = cub::detail::policy_wrapper_t<DefaultPolicyT, 64, 1>;
+
+  // Helper for the `AgentBlockSort` template with one member type alias for the agent template instantiated with the
+  // default policy and one instantiated with the fallback policy
+  using block_sort_helper_t = dual_policy_agent_helper_t<
+    DefaultPolicyT,
+    fallback_policy_t,
+    merge_sort::AgentBlockSort,
+    KeyInputIteratorT,
+    ValueInputIteratorT,
+    KeyIteratorT,
+    ValueIteratorT,
+    OffsetT,
+    CompareOpT,
+    KeyT,
+    ValueT>;
+  using default_block_sort_agent_t  = typename block_sort_helper_t::default_agent_t;
+  using fallback_block_sort_agent_t = typename block_sort_helper_t::fallback_agent_t;
+
+  // Helper for the `AgentMerge` template with one member type alias for the agent template instantiated with the
+  // default policy and one instantiated with the fallback policy
+  using merge_helper_t = dual_policy_agent_helper_t<
+    DefaultPolicyT,
+    fallback_policy_t,
+    merge_sort::AgentMerge,
+    KeyIteratorT,
+    ValueIteratorT,
+    OffsetT,
+    CompareOpT,
+    KeyT,
+    ValueT>;
+  using default_merge_agent_t  = typename merge_helper_t::default_agent_t;
+  using fallback_merge_agent_t = typename merge_helper_t::fallback_agent_t;
+
+  // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory
+  // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the
+  // available shared memory
+  static constexpr auto max_default_size =
+    (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size);
+  static constexpr auto max_fallback_size =
+    (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size);
+  static constexpr bool uses_fallback_policy =
+    (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block);
+
+public:
+  using policy_t = ::cuda::std::_If<uses_fallback_policy, fallback_policy_t, DefaultPolicyT>;
+  using block_sort_agent_t =
+    ::cuda::std::_If<uses_fallback_policy, fallback_block_sort_agent_t, default_block_sort_agent_t>;
+  using merge_agent_t = ::cuda::std::_If<uses_fallback_policy, fallback_merge_agent_t, default_merge_agent_t>;
+};
+template <typename ChainedPolicyT,
+          typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyIteratorT,
+          typename ValueIteratorT,
+          typename OffsetT,
+          typename CompareOpT,
+          typename KeyT,
+          typename ValueT>
+__launch_bounds__(
+  merge_sort_vsmem_helper_t<typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
+                            KeyInputIteratorT,
+                            ValueInputIteratorT,
+                            KeyIteratorT,
+                            ValueIteratorT,
+                            OffsetT,
+                            CompareOpT,
+                            KeyT,
+                            ValueT>::policy_t::BLOCK_THREADS)
+  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(
+    bool ping,
+    KeyInputIteratorT keys_in,
+    ValueInputIteratorT items_in,
+    KeyIteratorT keys_out,
+    ValueIteratorT items_out,
+    OffsetT keys_count,
+    KeyT* tmp_keys_out,
+    ValueT* tmp_items_out,
+    CompareOpT compare_op,
+    vsmem_t vsmem)
+{
+  using MergeSortHelperT = merge_sort_vsmem_helper_t<
+    typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
+    KeyInputIteratorT,
+    ValueInputIteratorT,
+    KeyIteratorT,
+    ValueIteratorT,
+    OffsetT,
+    CompareOpT,
+    KeyT,
+    ValueT>;
+
+  using ActivePolicyT = typename MergeSortHelperT::policy_t;
+
+  using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t;
+
+  using VSmemHelperT = vsmem_helper_impl<AgentBlockSortT>;
+
+  // Static shared memory allocation
+  __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage;
+
+  // Get temporary storage
+  typename AgentBlockSortT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem);
+
+  AgentBlockSortT agent(
+    ping,
+    temp_storage,
+    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in),
+    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in),
+    keys_count,
+    keys_out,
+    items_out,
+    tmp_keys_out,
+    tmp_items_out,
+    compare_op);
+
+  agent.Process();
+
+  // If applicable, hints to discard modified cache lines for vsmem
+  VSmemHelperT::discard_temp_storage(temp_storage);
+}
+
+// TODO(bgruber): if we put a call to cudaTriggerProgrammaticLaunchCompletion inside this kernel, the tests fail with
+// cudaErrorIllegalAddress.
+template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
+  bool ping,
+  KeyIteratorT keys_ping,
+  KeyT* keys_pong,
+  OffsetT keys_count,
+  OffsetT num_partitions,
+  OffsetT* merge_partitions,
+  CompareOpT compare_op,
+  OffsetT target_merged_tiles_number,
+  int items_per_tile)
+{
+  OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (partition_idx < num_partitions)
+  {
+    AgentPartition<KeyIteratorT, OffsetT, CompareOpT, KeyT> agent(
+      ping,
+      keys_ping,
+      keys_pong,
+      keys_count,
+      partition_idx,
+      merge_partitions,
+      compare_op,
+      target_merged_tiles_number,
+      items_per_tile,
+      num_partitions);
+
+    agent.Process();
+  }
+}
+
+template <typename ChainedPolicyT,
+          typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyIteratorT,
+          typename ValueIteratorT,
+          typename OffsetT,
+          typename CompareOpT,
+          typename KeyT,
+          typename ValueT>
+__launch_bounds__(
+  merge_sort_vsmem_helper_t<typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
+                            KeyInputIteratorT,
+                            ValueInputIteratorT,
+                            KeyIteratorT,
+                            ValueIteratorT,
+                            OffsetT,
+                            CompareOpT,
+                            KeyT,
+                            ValueT>::policy_t::BLOCK_THREADS)
+  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(
+    bool ping,
+    KeyIteratorT keys_ping,
+    ValueIteratorT items_ping,
+    OffsetT keys_count,
+    KeyT* keys_pong,
+    ValueT* items_pong,
+    CompareOpT compare_op,
+    OffsetT* merge_partitions,
+    OffsetT target_merged_tiles_number,
+    vsmem_t vsmem)
+{
+  using MergeSortHelperT = merge_sort_vsmem_helper_t<
+    typename ChainedPolicyT::ActivePolicy::MergeSortPolicy,
+    KeyInputIteratorT,
+    ValueInputIteratorT,
+    KeyIteratorT,
+    ValueIteratorT,
+    OffsetT,
+    CompareOpT,
+    KeyT,
+    ValueT>;
+
+  using ActivePolicyT = typename MergeSortHelperT::policy_t;
+
+  using AgentMergeT = typename MergeSortHelperT::merge_agent_t;
+
+  using VSmemHelperT = vsmem_helper_impl<AgentMergeT>;
+
+  // Static shared memory allocation
+  __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage;
+
+  // Get temporary storage
+  typename AgentMergeT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem);
+
+  AgentMergeT agent(
+    ping,
+    temp_storage,
+    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
+    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
+    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
+    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
+    keys_count,
+    keys_pong,
+    items_pong,
+    keys_ping,
+    items_ping,
+    compare_op,
+    merge_partitions,
+    target_merged_tiles_number);
+
+  agent.Process();
+
+  // If applicable, hints to discard modified cache lines for vsmem
+  VSmemHelperT::discard_temp_storage(temp_storage);
+}
+
+} // namespace detail::merge_sort
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index add033fd030..498f17c1259 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -48,6 +48,7 @@
 #endif // no system header
 
 #include <cub/util_debug.cuh>
+#include <cub/util_policy_wrapper_t.cuh>
 #include <cub/util_type.cuh>
 // for backward compatibility
 #include <cub/util_temporary_storage.cuh>
@@ -68,19 +69,6 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail
 {
-/**
- * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD`
- * configurations of a given policy.
- */
-// TODO(bgruber): this should be called something like "override_policy"
-template <typename PolicyT, int BLOCK_THREADS_, int ITEMS_PER_THREAD_ = PolicyT::ITEMS_PER_THREAD>
-struct policy_wrapper_t : PolicyT
-{
-  static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_;
-  static constexpr int BLOCK_THREADS    = BLOCK_THREADS_;
-  static constexpr int ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD;
-};
-
 /**
  * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
  */
diff --git a/cub/cub/util_policy_wrapper_t.cuh b/cub/cub/util_policy_wrapper_t.cuh
new file mode 100644
index 00000000000..0c74f1f4d22
--- /dev/null
+++ b/cub/cub/util_policy_wrapper_t.cuh
@@ -0,0 +1,55 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+CUB_NAMESPACE_BEGIN
+
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
+
+namespace detail
+{
+/**
+ * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD`
+ * configurations of a given policy.
+ */
+// TODO(bgruber): this should be called something like "override_policy"
+template <typename PolicyT, int BLOCK_THREADS_, int ITEMS_PER_THREAD_ = PolicyT::ITEMS_PER_THREAD>
+struct policy_wrapper_t : PolicyT
+{
+  static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_;
+  static constexpr int BLOCK_THREADS    = BLOCK_THREADS_;
+  static constexpr int ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD;
+};
+} // namespace detail
+
+#endif // _CCCL_DOXYGEN_INVOKED
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh
index baba489c0ae..20228b44f69 100644
--- a/cub/cub/util_vsmem.cuh
+++ b/cub/cub/util_vsmem.cuh
@@ -43,15 +43,13 @@
 #endif // no system header
 
 #include <cub/util_arch.cuh>
-#include <cub/util_device.cuh>
+#include <cub/util_policy_wrapper_t.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
 #include <cuda/discard_memory>
 #include <cuda/std/type_traits>
 
-#include <cstdint>
-
 CUB_NAMESPACE_BEGIN
 
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -79,16 +77,16 @@ class vsmem_helper_impl
 {
 private:
   // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size`
-  static constexpr std::size_t line_size = 128;
+  static constexpr ::cuda::std::size_t line_size = 128;
 
   // The amount of shared memory or virtual shared memory required by the algorithm's agent
-  static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage);
+  static constexpr ::cuda::std::size_t required_smem = sizeof(typename AgentT::TempStorage);
 
   // Whether we need to allocate global memory-backed virtual shared memory
   static constexpr bool needs_vsmem = required_smem > max_smem_per_block;
 
   // Padding bytes to an integer multiple of `line_size`. Only applies to virtual shared memory
-  static constexpr std::size_t padding_bytes =
+  static constexpr ::cuda::std::size_t padding_bytes =
     (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size));
 
 public:
@@ -96,7 +94,7 @@ public:
   using static_temp_storage_t = ::cuda::std::_If<needs_vsmem, cub::NullType, typename AgentT::TempStorage>;
 
   // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes
-  static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0;
+  static constexpr ::cuda::std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0;
 
   /**
    * @brief Used from within the device algorithm's kernel to get the temporary storage that can be
@@ -115,7 +113,7 @@ public:
    * storage and taking a linear block id.
    */
   static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage&
-  get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t)
+  get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, ::cuda::std::size_t)
   {
     return static_temp_storage;
   }
@@ -138,7 +136,7 @@ public:
    * virtual shared memory as temporary storage and taking a linear block id.
    */
   static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage&
-  get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id)
+  get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, ::cuda::std::size_t linear_block_id)
   {
     return *reinterpret_cast<typename AgentT::TempStorage*>(
       static_cast<char*>(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id));
@@ -170,8 +168,8 @@ public:
     // Ensure all threads finished using temporary storage
     __syncthreads();
 
-    const std::size_t linear_tid   = threadIdx.x;
-    const std::size_t block_stride = line_size * blockDim.x;
+    const ::cuda::std::size_t linear_tid   = threadIdx.x;
+    const ::cuda::std::size_t block_stride = line_size * blockDim.x;
 
     char* ptr    = reinterpret_cast<char*>(&temp_storage);
     auto ptr_end = ptr + vsmem_per_block;
@@ -186,7 +184,7 @@ public:
 };
 
 template <class DefaultAgentT, class FallbackAgentT>
-constexpr bool use_fallback_agent()
+_CCCL_HOST_DEVICE constexpr bool use_fallback_agent()
 {
   return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block)
       && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block);
diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu
index 91ddc155311..e16b6195c9f 100644
--- a/cub/test/catch2_test_nvrtc.cu
+++ b/cub/test/catch2_test_nvrtc.cu
@@ -58,6 +58,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
     #include <cub/device/dispatch/kernels/reduce.cuh>
     #include <cub/device/dispatch/kernels/for_each.cuh>
     #include <cub/device/dispatch/kernels/scan.cuh>
+    #include <cub/device/dispatch/kernels/merge_sort.cuh>
 
     extern "C" __global__ void kernel(int *ptr, int *errors)
     {
diff --git a/thrust/testing/is_contiguous_iterator.cu b/thrust/testing/is_contiguous_iterator.cu
index 4d6f0d0ea62..d0fc6a8cfc7 100644
--- a/thrust/testing/is_contiguous_iterator.cu
+++ b/thrust/testing/is_contiguous_iterator.cu
@@ -15,6 +15,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #include <deque>
 #include <list>
diff --git a/thrust/thrust/detail/type_traits/has_nested_type.h b/thrust/thrust/detail/type_traits/has_nested_type.h
index b9af8747377..3146c794ef6 100644
--- a/thrust/thrust/detail/type_traits/has_nested_type.h
+++ b/thrust/thrust/detail/type_traits/has_nested_type.h
@@ -35,9 +35,9 @@
     using yes_type = char;                                                    \
     using no_type  = int;                                                     \
     template <typename S>                                                     \
-    static yes_type test(typename S::nested_type_name*);                      \
+    _CCCL_HOST_DEVICE static yes_type test(typename S::nested_type_name*);    \
     template <typename S>                                                     \
-    static no_type test(...);                                                 \
+    _CCCL_HOST_DEVICE static no_type test(...);                               \
     static bool const value = sizeof(test<T>(0)) == sizeof(yes_type);         \
     using type              = thrust::detail::integral_constant<bool, value>; \
   };
diff --git a/thrust/thrust/detail/type_traits/is_thrust_pointer.h b/thrust/thrust/detail/type_traits/is_thrust_pointer.h
new file mode 100644
index 00000000000..77dcb571259
--- /dev/null
+++ b/thrust/thrust/detail/type_traits/is_thrust_pointer.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/type_traits/is_metafunction_defined.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace detail
+{
+
+namespace pointer_traits_detail
+{
+
+template <typename Ptr, typename Enable = void>
+struct pointer_raw_pointer_impl
+{};
+
+template <typename T>
+struct pointer_raw_pointer_impl<T*>
+{
+  using type = T*;
+};
+
+template <typename Ptr>
+struct pointer_raw_pointer_impl<Ptr, ::cuda::std::void_t<typename Ptr::raw_pointer>>
+{
+  using type = typename Ptr::raw_pointer;
+};
+
+} // namespace pointer_traits_detail
+
+template <typename T>
+struct pointer_raw_pointer : pointer_traits_detail::pointer_raw_pointer_impl<T>
+{};
+
+// this could be a lot better, but for our purposes, it's probably
+// sufficient just to check if pointer_raw_pointer<T> has meaning
+template <typename T>
+struct is_thrust_pointer : is_metafunction_defined<pointer_raw_pointer<T>>
+{};
+
+} // namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/thrust/detail/type_traits/pointer_traits.h b/thrust/thrust/detail/type_traits/pointer_traits.h
index 7570aba2242..baf4c0ca1dd 100644
--- a/thrust/thrust/detail/type_traits/pointer_traits.h
+++ b/thrust/thrust/detail/type_traits/pointer_traits.h
@@ -28,6 +28,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/has_nested_type.h>
 #include <thrust/detail/type_traits/is_metafunction_defined.h>
+#include <thrust/detail/type_traits/is_thrust_pointer.h>
 #include <thrust/iterator/iterator_traits.h>
 
 #include <cstddef>
@@ -140,31 +141,6 @@ struct rebind_pointer<Ptr<OldT, Tag, typename std::add_lvalue_reference<OldT>::t
 namespace pointer_traits_detail
 {
 
-template <typename Ptr, typename Enable = void>
-struct pointer_raw_pointer_impl
-{};
-
-template <typename T>
-struct pointer_raw_pointer_impl<T*>
-{
-  using type = T*;
-};
-
-template <typename Ptr>
-struct pointer_raw_pointer_impl<Ptr, ::cuda::std::void_t<typename Ptr::raw_pointer>>
-{
-  using type = typename Ptr::raw_pointer;
-};
-
-} // namespace pointer_traits_detail
-
-template <typename T>
-struct pointer_raw_pointer : pointer_traits_detail::pointer_raw_pointer_impl<T>
-{};
-
-namespace pointer_traits_detail
-{
-
 template <typename Void>
 struct capture_address
 {
@@ -327,12 +303,6 @@ struct is_void_pointer_system_convertible
                         is_pointer_system_convertible<FromPtr, ToPtr>>
 {};
 
-// this could be a lot better, but for our purposes, it's probably
-// sufficient just to check if pointer_raw_pointer<T> has meaning
-template <typename T>
-struct is_thrust_pointer : is_metafunction_defined<pointer_raw_pointer<T>>
-{};
-
 // avoid inspecting traits of the arguments if they aren't known to be pointers
 template <typename FromPtr, typename ToPtr>
 struct lazy_is_pointer_convertible
diff --git a/thrust/thrust/system/cuda/detail/adjacent_difference.h b/thrust/thrust/system/cuda/detail/adjacent_difference.h
index 268c67eb03c..3372b9eaf25 100644
--- a/thrust/thrust/system/cuda/detail/adjacent_difference.h
+++ b/thrust/thrust/system/cuda/detail/adjacent_difference.h
@@ -51,6 +51,7 @@
 #  include <thrust/system/cuda/detail/par_to_seq.h>
 #  include <thrust/system/cuda/detail/util.h>
 #  include <thrust/type_traits/is_contiguous_iterator.h>
+#  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #  include <cuda/std/type_traits>
 
diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h
new file mode 100644
index 00000000000..07c5eba0eaa
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h
@@ -0,0 +1,57 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub::core
+{
+
+// LoadIterator
+// ------------
+// if trivial iterator is passed, wrap loads into LDG
+//
+template <class PtxPlan, class It>
+struct LoadIterator
+{
+  using value_type = typename ::cuda::std::iterator_traits<It>::value_type;
+  using size_type  = typename ::cuda::std::iterator_traits<It>::difference_type;
+
+  using type =
+    ::cuda::std::conditional_t<is_contiguous_iterator_v<It>,
+                               cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, value_type, size_type>,
+                               It>;
+}; // struct Iterator
+} // namespace cuda_cub::core
+
+THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
new file mode 100644
index 00000000000..28c65c813ea
--- /dev/null
+++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
@@ -0,0 +1,60 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/core/load_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub::core
+{
+template <class PtxPlan, class It>
+typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
+make_load_iterator_impl(It it, thrust::detail::true_type /* is_trivial */)
+{
+  return raw_pointer_cast(&*it);
+}
+
+template <class PtxPlan, class It>
+typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
+make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */)
+{
+  return it;
+}
+
+template <class PtxPlan, class It>
+typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it)
+{
+  return make_load_iterator_impl<PtxPlan>(it, typename is_contiguous_iterator<It>::type());
+}
+
+} // namespace cuda_cub::core
+
+THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h
index 186990f4a0b..56ee30c9947 100644
--- a/thrust/thrust/system/cuda/detail/core/util.h
+++ b/thrust/thrust/system/cuda/detail/core/util.h
@@ -43,6 +43,8 @@
 #include <cub/util_temporary_storage.cuh>
 
 #include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/core/load_iterator.h>
+#include <thrust/system/cuda/detail/core/make_load_iterator.h>
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/system_error.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
@@ -503,42 +505,6 @@ THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t
   }
 }
 
-// LoadIterator
-// ------------
-// if trivial iterator is passed, wrap loads into LDG
-//
-template <class PtxPlan, class It>
-struct LoadIterator
-{
-  using value_type = typename iterator_traits<It>::value_type;
-  using size_type  = typename iterator_traits<It>::difference_type;
-
-  using type =
-    ::cuda::std::conditional_t<is_contiguous_iterator<It>::value,
-                               cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, value_type, size_type>,
-                               It>;
-}; // struct Iterator
-
-template <class PtxPlan, class It>
-typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
-make_load_iterator_impl(It it, thrust::detail::true_type /* is_trivial */)
-{
-  return raw_pointer_cast(&*it);
-}
-
-template <class PtxPlan, class It>
-typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
-make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */)
-{
-  return it;
-}
-
-template <class PtxPlan, class It>
-typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it)
-{
-  return make_load_iterator_impl<PtxPlan>(it, typename is_contiguous_iterator<It>::type());
-}
-
 template <class>
 struct get_arch;
 
diff --git a/thrust/thrust/system/cuda/detail/scan_by_key.h b/thrust/thrust/system/cuda/detail/scan_by_key.h
index 1240783736c..313f69d7758 100644
--- a/thrust/thrust/system/cuda/detail/scan_by_key.h
+++ b/thrust/thrust/system/cuda/detail/scan_by_key.h
@@ -53,6 +53,7 @@
 #  include <thrust/system/cuda/detail/par_to_seq.h>
 #  include <thrust/system/cuda/detail/util.h>
 #  include <thrust/type_traits/is_contiguous_iterator.h>
+#  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #  include <cstdint>
 
diff --git a/thrust/thrust/type_traits/is_contiguous_iterator.h b/thrust/thrust/type_traits/is_contiguous_iterator.h
index 95f6c503b15..e254e572a3f 100644
--- a/thrust/thrust/type_traits/is_contiguous_iterator.h
+++ b/thrust/thrust/type_traits/is_contiguous_iterator.h
@@ -31,13 +31,9 @@
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
 #  pragma system_header
 #endif // no system header
-#include <thrust/detail/raw_pointer_cast.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/type_traits/is_thrust_pointer.h>
 
-#include <iterator>
-#include <type_traits>
-#include <utility>
+#include <cuda/std/iterator> // Needed for __gnu_cxx::__normal_iterator
 
 #if _CCCL_COMPILER(MSVC, <, 19, 16) // MSVC 2017 version 15.9
 #  include <array>
@@ -212,71 +208,8 @@ struct is_contiguous_iterator_impl
           || is_msvc_contiguous_iterator<Iterator>::value || proclaim_contiguous_iterator<Iterator>::value>
 {};
 
-// Type traits for contiguous iterators:
-template <typename Iterator>
-struct contiguous_iterator_traits
-{
-  static_assert(thrust::is_contiguous_iterator<Iterator>::value,
-                "contiguous_iterator_traits requires a contiguous iterator.");
-
-  using raw_pointer = typename thrust::detail::pointer_traits<decltype(&*std::declval<Iterator>())>::raw_pointer;
-};
 } // namespace detail
 
-//! Converts a contiguous iterator type to its underlying raw pointer type.
-template <typename ContiguousIterator>
-using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits<ContiguousIterator>::raw_pointer;
-
-//! Converts a contiguous iterator to its underlying raw pointer.
-template <typename ContiguousIterator>
-_CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it)
-  -> unwrap_contiguous_iterator_t<ContiguousIterator>
-{
-  static_assert(thrust::is_contiguous_iterator<ContiguousIterator>::value,
-                "unwrap_contiguous_iterator called with non-contiguous iterator.");
-  return thrust::raw_pointer_cast(&*it);
-}
-
-namespace detail
-{
-// Implementation for non-contiguous iterators -- passthrough.
-template <typename Iterator, bool IsContiguous = thrust::is_contiguous_iterator<Iterator>::value>
-struct try_unwrap_contiguous_iterator_impl
-{
-  using type = Iterator;
-
-  static _CCCL_HOST_DEVICE type get(Iterator it)
-  {
-    return it;
-  }
-};
-
-// Implementation for contiguous iterators -- unwraps to raw pointer.
-template <typename Iterator>
-struct try_unwrap_contiguous_iterator_impl<Iterator, true /*is_contiguous*/>
-{
-  using type = unwrap_contiguous_iterator_t<Iterator>;
-
-  static _CCCL_HOST_DEVICE type get(Iterator it)
-  {
-    return unwrap_contiguous_iterator(it);
-  }
-};
-} // namespace detail
-
-//! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the
-//! iterator type unmodified.
-template <typename Iterator>
-using try_unwrap_contiguous_iterator_t = typename detail::try_unwrap_contiguous_iterator_impl<Iterator>::type;
-
-//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
-//! iterator unmodified.
-template <typename Iterator>
-_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t<Iterator>
-{
-  return detail::try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
-}
-
 /*! \endcond
  */
 
diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h
index 8566a510578..a53b297c438 100644
--- a/thrust/thrust/type_traits/is_trivially_relocatable.h
+++ b/thrust/thrust/type_traits/is_trivially_relocatable.h
@@ -34,6 +34,7 @@
 #endif // no system header
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cuda/std/__fwd/pair.h>
diff --git a/thrust/thrust/type_traits/unwrap_contiguous_iterator.h b/thrust/thrust/type_traits/unwrap_contiguous_iterator.h
new file mode 100644
index 00000000000..880dc970759
--- /dev/null
+++ b/thrust/thrust/type_traits/unwrap_contiguous_iterator.h
@@ -0,0 +1,93 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+// Type traits for contiguous iterators:
+template <typename Iterator>
+struct contiguous_iterator_traits
+{
+  static_assert(thrust::is_contiguous_iterator_v<Iterator>,
+                "contiguous_iterator_traits requires a contiguous iterator.");
+
+  using raw_pointer = typename thrust::detail::pointer_traits<decltype(&*std::declval<Iterator>())>::raw_pointer;
+};
+} // namespace detail
+
+//! Converts a contiguous iterator type to its underlying raw pointer type.
+template <typename ContiguousIterator>
+using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits<ContiguousIterator>::raw_pointer;
+
+//! Converts a contiguous iterator to its underlying raw pointer.
+template <typename ContiguousIterator>
+_CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it)
+  -> unwrap_contiguous_iterator_t<ContiguousIterator>
+{
+  static_assert(thrust::is_contiguous_iterator_v<ContiguousIterator>,
+                "unwrap_contiguous_iterator called with non-contiguous iterator.");
+  return thrust::raw_pointer_cast(&*it);
+}
+
+namespace detail
+{
+// Implementation for non-contiguous iterators -- passthrough.
+template <typename Iterator, bool IsContiguous = thrust::is_contiguous_iterator_v<Iterator>>
+struct try_unwrap_contiguous_iterator_impl
+{
+  using type = Iterator;
+
+  static _CCCL_HOST_DEVICE type get(Iterator it)
+  {
+    return it;
+  }
+};
+
+// Implementation for contiguous iterators -- unwraps to raw pointer.
+template <typename Iterator>
+struct try_unwrap_contiguous_iterator_impl<Iterator, true /*is_contiguous*/>
+{
+  using type = unwrap_contiguous_iterator_t<Iterator>;
+
+  static _CCCL_HOST_DEVICE type get(Iterator it)
+  {
+    return unwrap_contiguous_iterator(it);
+  }
+};
+} // namespace detail
+
+//! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the
+//! iterator type unmodified.
+template <typename Iterator>
+using try_unwrap_contiguous_iterator_t = typename detail::try_unwrap_contiguous_iterator_impl<Iterator>::type;
+
+//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
+//! iterator unmodified.
+template <typename Iterator>
+_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t<Iterator>
+{
+  return detail::try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
+}
+
+THRUST_NAMESPACE_END