diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index 4bdcec3992a..0ea95f270a4 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -27,6 +27,7 @@ #pragma once +// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT #include #include #include @@ -50,6 +51,49 @@ #include "c2h/vector.cuh" #include "catch2_test_helper.h" +// Index types used for OffsetsT testing +using offset_types = c2h::type_list; +using all_offset_types = + c2h::type_list; + +// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory +template +struct segment_iterator +{ + OffsetT last = 0; + + segment_iterator(OffsetT last1) + : last{last1} + {} + + __host__ __device__ OffsetT operator()(OffsetT x) const + { + switch (x) + { + case Step * 100: + return Step * 100 + Step / 2; + case Step * 200: + return Step * 200 + Step / 2; + case Step * 300: + return Step * 300 + Step / 2; + case Step * 400: + return Step * 400 + Step / 2; + case Step * 500: + return Step * 500 + Step / 2; + case Step * 600: + return Step * 600 + Step / 2; + case Step * 700: + return Step * 700 + Step / 2; + case Step * 800: + return Step * 800 + Step / 2; + case Step * 900: + return Step * 900 + Step / 2; + default: + return (x >= last) ? last : x * Step; + } + } +}; + // The launchers defined in catch2_test_launch_helper.h do not support // passing objects by reference since the device-launch tests cannot // pass references to a __global__ function. The DoubleBuffer object @@ -446,7 +490,7 @@ std::pair, c2h::host_vector> segmented_radix_sort } template -struct offset_scan_op_t +struct radix_offset_scan_op_t { OffsetT num_items; @@ -470,5 +514,5 @@ void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector& off offsets.end(), offsets.begin(), OffsetT{0}, - offset_scan_op_t{static_cast(num_items)}); + radix_offset_scan_op_t{static_cast(num_items)}); } diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 48929143055..401f02d71ea 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -24,9 +24,9 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - #pragma once +// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT #include #include diff --git a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu index 70c5a63f2f8..01c92c8456f 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu @@ -24,11 +24,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first #include +#include // DispatchSegmentedRadixSort #include #include @@ -46,12 +46,64 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" +// TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is +// addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset +// types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, + bool is_overwrite = true, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_values; + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + auto status = cub::DispatchSegmentedRadixSort< + IS_DESCENDING, + KeyT, + cub::NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, // + NumItemsT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1:2 +// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64 DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending); - -// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64 +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort); // TODO: // - int128 @@ -62,6 +114,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_k using key_types = c2h::type_list; using bit_window_key_types = c2h::type_list; # define NO_FP_KEY_TYPES +# define SINGLE_TEST_CASE_INSTANTIATION #elif TEST_KEY_BITS == 16 // clang-format off using key_types = c2h::type_list< @@ -90,9 +143,6 @@ using fp_key_types = c2h::type_list; // Used for tests that just need a single type for testing: using single_key_type = c2h::type_list>; -// Index types used for OffsetsT testing -using offset_types = c2h::type_list; - CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing", "[keys][segmented][radix][sort][device]", key_types, @@ -153,6 +203,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing", REQUIRE((ref_keys == out_keys) == true); } +#if defined(SINGLE_TEST_CASE_INSTANTIATION) + CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][radix][sort][device]", single_key_type) { using key_t = c2h::get<0, TestType>; @@ -207,6 +259,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][ra REQUIRE((ref_keys == out_keys) == true); } +#endif // defined(SINGLE_TEST_CASE_INSTANTIATION) + CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows", "[keys][segmented][radix][sort][device]", bit_window_key_types) @@ -276,6 +330,8 @@ 
CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows", REQUIRE((ref_keys == out_keys) == true); } +#if defined(SINGLE_TEST_CASE_INSTANTIATION) + CUB_TEST("DeviceSegmentedRadixSort::SortKeys: large segments", "[keys][segmented][radix][sort][device]", single_key_type) { using key_t = c2h::get<0, TestType>; @@ -457,3 +513,158 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: unspecified ranges", REQUIRE((ref_keys == out_keys) == true); } + +# if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large num. items and num. segments", + "[keys][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = ::cuda::ceil_div(num_items, step); + CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector out_keys(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + auto offsets = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + auto offsets_plus_1 = offsets + 1; + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1); + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + if (is_descending) + { + dispatch_segmented_radix_sort_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + else + { + dispatch_segmented_radix_sort( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE(ref_keys == out_keys); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large segments", + "[keys][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
+  const std::size_t num_segments = 2;
+  CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);
+
+  c2h::device_vector<key_t> in_keys(num_items);
+  c2h::device_vector<key_t> out_keys(num_items);
+  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
+  c2h::device_vector<offset_t> offsets(num_segments + 1);
+  offsets[0] = 0;
+  offsets[1] = static_cast<offset_t>(num_items);
+  offsets[2] = static_cast<offset_t>(num_items);
+  // Allocate host/device-accessible memory to communicate the selected output buffer
+  bool* selector_ptr = nullptr;
+  if (is_overwrite)
+  {
+    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
+  }
+  auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets);
+  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
+  if (is_descending)
+  {
+    dispatch_segmented_radix_sort_descending(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      begin_bit<key_t>(),
+      end_bit<key_t>(),
+      is_overwrite);
+  }
+  else
+  {
+    dispatch_segmented_radix_sort(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      begin_bit<key_t>(),
+      end_bit<key_t>(),
+      is_overwrite);
+  }
+  if (is_overwrite)
+  {
+    if (*selector_ptr)
+    {
+      std::swap(out_keys, in_keys);
+    }
+    REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
+  }
+  REQUIRE(ref_keys == out_keys);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Skipping segmented radix sort test, insufficient GPU memory. " << e.what() << "\n";
+}
+
+# endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)
+#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)
diff --git a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu
index 10237a6460b..b4198ade6e2 100644
--- a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu
@@ -24,11 +24,11 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first #include +#include // DispatchSegmentedRadixSort #include #include @@ -43,17 +43,76 @@ #include "catch2_test_launch_helper.h" #include "thrust/detail/raw_pointer_cast.h" +// TODO replace with DeviceSegmentedRadixSort::SortPairs interface once https://github.com/NVIDIA/cccl/issues/50 is +// addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset +// types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_pairs_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, + bool is_overwrite = true, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + auto status = cub::DispatchSegmentedRadixSort< + IS_DESCENDING, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, // + NumItemsT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1:2 DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairs, sort_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairsDescending, sort_pairs_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, + dispatch_segmented_radix_sort_pairs_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, dispatch_segmented_radix_sort_pairs); using custom_value_t = c2h::custom_type_t; using value_types = c2h::type_list; // Index types used for OffsetsT testing -using offset_types = c2h::type_list; - CUB_TEST("DeviceSegmentedRadixSort::SortPairs: Basic testing", "[pairs][segmented][radix][sort][device]", value_types, @@ -278,3 +337,192 @@ CUB_TEST("DeviceSegmentedRadixSort::SortPairs: unspecified ranges", REQUIRE((ref_keys == out_keys) == true); REQUIRE((ref_values == out_values) == true); } + +#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +CUB_TEST("DeviceSegmentedRadixSort::SortPairs: very large num. items and num. segments", + "[pairs][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t Step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); + CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector in_values(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + c2h::device_vector out_keys(num_items); + c2h::device_vector out_values(num_items); + auto offsets = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + auto offsets_plus_1 = offsets + 1; + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); + auto& ref_keys = refs.first; + auto& ref_values = refs.second; + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); + if (is_descending) + { + dispatch_segmented_radix_sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + else + { + dispatch_segmented_radix_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + // Mix pointers/iterators for segment info to test using different iterable types: + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +CUB_TEST("DeviceSegmentedRadixSort::SortPairs: very large segments", + "[pairs][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max : ::cuda::std::numeric_limits::max(); + constexpr std::size_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector in_values(num_items); + c2h::device_vector out_keys(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + c2h::device_vector out_values(num_items); + c2h::device_vector offsets(num_segments + 1); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto refs = segmented_radix_sort_reference( + in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1); + auto& ref_keys = refs.first; + auto& ref_values = refs.second; + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); + if (is_descending) + { + dispatch_segmented_radix_sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + else + { + dispatch_segmented_radix_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + if (out_keys_ptr != thrust::raw_pointer_cast(out_keys.data())) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_device_segmented_sort_keys.cu b/cub/test/catch2_test_device_segmented_sort_keys.cu index 4be32f66c89..28404aade8f 100644 --- a/cub/test/catch2_test_device_segmented_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_sort_keys.cu @@ -24,18 +24,63 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first - +#include "catch2_radix_sort_helper.cuh" #include #include // FIXME: Graph launch disabled, algorithm syncs internally. WAR exists for device-launch, figure out how to enable for // graph launch. 
+// TODO replace with DeviceSegmentedSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is addressed +// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + bool is_overwrite = false, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values; + auto status = + cub::DispatchSegmentedSort:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1 +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort); + using key_types = c2h::type_list; test_unspecified_segments_random(CUB_SEED(4)); } + +#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +// we can reuse the same structure of DeviceSegmentedRadixSortKeys for simplicity +CUB_TEST("DeviceSegmentedSortKeys: very large num. items and num. segments", + "[keys][segmented][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t Step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
+  const std::size_t num_segments = ::cuda::ceil_div(num_items, Step);
+  CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);
+
+  c2h::device_vector<key_t> in_keys(num_items);
+  c2h::device_vector<key_t> out_keys(num_items);
+  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
+  auto offsets =
+    thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});
+  auto offsets_plus_1 = offsets + 1;
+  // Allocate host/device-accessible memory to communicate the selected output buffer
+  bool* selector_ptr = nullptr;
+  if (is_overwrite)
+  {
+    REQUIRE(cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)) == cudaSuccess);
+  }
+
+  auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1);
+  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
+  if (is_descending)
+  {
+    dispatch_segmented_sort_descending(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      offsets,
+      offsets_plus_1,
+      selector_ptr,
+      is_overwrite);
+  }
+  else
+  {
+    dispatch_segmented_sort(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      offsets,
+      offsets_plus_1,
+      selector_ptr,
+      is_overwrite);
+  }
+  if (is_overwrite)
+  {
+    if (*selector_ptr)
+    {
+      std::swap(out_keys, in_keys);
+    }
+    REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
+  }
+  REQUIRE((ref_keys == out_keys) == true);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Skipping segmented sort test, insufficient GPU memory. " << e.what() << "\n";
+}
+
+CUB_TEST("DeviceSegmentedSort::SortKeys: very large segments", "[keys][segmented][sort][device]", all_offset_types)
+try
+{
+  using key_t    = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
+  using offset_t = c2h::get<0, TestType>;
+  constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<cuda::std::uint32_t>::max();
+  constexpr int num_key_seeds = 1;
+  const bool is_descending = GENERATE(false, true);
+  const bool is_overwrite  = GENERATE(false, true);
+  constexpr std::size_t num_items =
+    (sizeof(offset_t) == 8) ?
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector out_keys(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::device_vector offsets(num_segments + 1); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets); + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + if (is_descending) + { + dispatch_segmented_sort_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + is_overwrite); + } + else + { + dispatch_segmented_sort( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE((ref_keys == out_keys) == true); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index 144ea13e762..182a4cfd960 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -24,8 +24,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - -#include "insert_nested_NVTX_range_guard.h" +#include "catch2_radix_sort_helper.cuh" // above header needs to be included first #include @@ -34,8 +33,61 @@ // FIXME: Graph launch disabled, algorithm syncs internally. WAR exists for device-launch, figure out how to enable for // graph launch. 
+// TODO replace with DeviceSegmentedSort::SortPairs interface once https://github.com/NVIDIA/cccl/issues/50 is addressed +// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_pairs_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + bool is_overwrite = false, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + auto status = cub:: + DispatchSegmentedSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1 +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs); + using pair_types = c2h::type_list, c2h::type_list, @@ -198,3 +250,178 @@ CUB_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values", test_unspecified_segments_random(CUB_SEED(4)); } + +#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +// we can reuse the same structure of DeviceSegmentedRadixSortPairs for simplicity +CUB_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments", + "[pairs][segmented][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t Step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); + CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector in_values(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + + // Initialize the output vectors by copying the inputs since not all items may belong to a segment. 
+ c2h::device_vector out_keys(num_items); + c2h::device_vector out_values(num_items); + auto offsets = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + auto offsets_plus_1 = offsets + 1; + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); + auto& ref_keys = refs.first; + auto& ref_values = refs.second; + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); + if (is_descending) + { + dispatch_segmented_sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + is_overwrite); + } + else + { + dispatch_segmented_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); + } + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +CUB_TEST("DeviceSegmentedSort::SortPairs: very large segments", "[pairs][segmented][sort][device]", all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
+  constexpr std::size_t num_segments = 2;
+  CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);
+
+  c2h::device_vector<key_t> in_keys(num_items);
+  c2h::device_vector<value_t> in_values(num_items);
+  c2h::device_vector<key_t> out_keys(num_items);
+  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
+  c2h::gen(CUB_SEED(num_value_seeds), in_values);
+  c2h::device_vector<value_t> out_values(num_items);
+  c2h::device_vector<offset_t> offsets(num_segments + 1);
+  offsets[0] = 0;
+  offsets[1] = static_cast<offset_t>(num_items);
+  offsets[2] = static_cast<offset_t>(num_items);
+  bool* selector_ptr = nullptr;
+  if (is_overwrite)
+  {
+    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
+  }
+
+  auto refs = segmented_radix_sort_reference(
+    in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1);
+  auto& ref_keys      = refs.first;
+  auto& ref_values    = refs.second;
+  auto out_keys_ptr   = thrust::raw_pointer_cast(out_keys.data());
+  auto out_values_ptr = thrust::raw_pointer_cast(out_values.data());
+  if (is_descending)
+  {
+    dispatch_segmented_sort_pairs_descending(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      thrust::raw_pointer_cast(in_values.data()),
+      out_values_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      is_overwrite);
+  }
+  else
+  {
+    dispatch_segmented_sort_pairs(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      thrust::raw_pointer_cast(in_values.data()),
+      out_values_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      is_overwrite);
+  }
+  if (is_overwrite)
+  {
+    if (*selector_ptr)
+    {
+      std::swap(out_keys, in_keys);
+      std::swap(out_values, in_values);
+    }
+    REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
+  }
+  REQUIRE(ref_keys == out_keys);
+  REQUIRE(ref_values == out_values);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Skipping segmented sort test, insufficient GPU memory. " << e.what() << "\n";
+}
+
+#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)