Skip to content

Commit

Permalink
Add CUB tests for segmented sort/radix sort with 64-bit num. items and segments (#2254)
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Sep 26, 2024
1 parent 99fb4b4 commit 5d45850
Show file tree
Hide file tree
Showing 6 changed files with 937 additions and 16 deletions.
48 changes: 46 additions & 2 deletions cub/test/catch2_radix_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#pragma once

// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/util_macro.cuh>
Expand All @@ -50,6 +51,49 @@
#include "c2h/vector.cuh"
#include "catch2_test_helper.h"

// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;
using all_offset_types =
c2h::type_list<cuda::std::int32_t, cuda::std::uint32_t, cuda::std::int64_t, cuda::std::uint64_t>;

// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory:
// segment offsets are computed on the fly from a counting iterator instead of being materialized.
//
// For most indices x the produced offset is x * Step, clamped to `last`. For the nine indices
// x == Step * k * 100 (k in [1, 9]) the offset is shifted by half a step so that segment sizes are
// not perfectly uniform. Note the irregular indices take precedence over the clamp (as in the
// original switch, whose cases were checked before the default branch).
template <typename OffsetT, OffsetT Step>
struct segment_iterator
{
  OffsetT last = 0; // clamp value for the regular offsets (total number of items)

  segment_iterator(OffsetT last1)
      : last{last1}
  {}

  __host__ __device__ OffsetT operator()(OffsetT x) const
  {
    // Equivalent to the former 9-way switch (case Step*100 ... case Step*900, each returning the
    // case value + Step / 2): those indices are exactly the multiples of Step * 100 in
    // [Step * 100, Step * 900].
    constexpr OffsetT irregular_step = Step * 100;
    if (x >= irregular_step && x <= Step * 900 && x % irregular_step == 0)
    {
      return x + Step / 2;
    }
    return (x >= last) ? last : x * Step;
  }
};

// The launchers defined in catch2_test_launch_helper.h do not support
// passing objects by reference since the device-launch tests cannot
// pass references to a __global__ function. The DoubleBuffer object
Expand Down Expand Up @@ -446,7 +490,7 @@ std::pair<c2h::host_vector<KeyT>, c2h::host_vector<ValueT>> segmented_radix_sort
}

template <typename OffsetT>
struct offset_scan_op_t
struct radix_offset_scan_op_t
{
OffsetT num_items;

Expand All @@ -470,5 +514,5 @@ void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector<OffsetT>& off
offsets.end(),
offsets.begin(),
OffsetT{0},
offset_scan_op_t<OffsetT>{static_cast<OffsetT>(num_items)});
radix_offset_scan_op_t<OffsetT>{static_cast<OffsetT>(num_items)});
}
2 changes: 1 addition & 1 deletion cub/test/catch2_segmented_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT
#include <cub/device/device_segmented_sort.cuh>

#include <thrust/device_ptr.h>
Expand Down
223 changes: 217 additions & 6 deletions cub/test/catch2_test_device_segmented_radix_sort_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "insert_nested_NVTX_range_guard.h"
// above header needs to be included first

#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/device/dispatch/dispatch_radix_sort.cuh> // DispatchSegmentedRadixSort
#include <cub/util_type.cuh>

#include <thrust/functional.h>
Expand All @@ -46,12 +46,64 @@
#include "catch2_test_helper.h"
#include "catch2_test_launch_helper.h"

// TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is
// addressed. Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset
// types.
//
// When `is_overwrite` is true, the DoubleBuffer entry point is exercised and `*selector` records whether the
// sorted keys ended up in the buffer other than `d_keys_out` (so the caller knows which buffer to read).
template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  NumItemsT num_items,
  NumItemsT num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  bool* selector,
  int begin_bit       = 0,
  int end_bit         = sizeof(KeyT) * 8,
  bool is_overwrite   = true,
  cudaStream_t stream = 0)
{
  // Keys-only sort: values are NullType. The keys buffer aliases the caller's in/out pointers.
  cub::DoubleBuffer<KeyT> keys_buffer(const_cast<KeyT*>(d_keys_in), d_keys_out);
  cub::DoubleBuffer<cub::NullType> values_buffer;
  const cudaError_t error =
    cub::DispatchSegmentedRadixSort<IS_DESCENDING,
                                    KeyT,
                                    cub::NullType,
                                    BeginOffsetIteratorT,
                                    EndOffsetIteratorT,
                                    NumItemsT>::Dispatch(d_temp_storage,
                                                         temp_storage_bytes,
                                                         keys_buffer,
                                                         values_buffer,
                                                         num_items,
                                                         num_segments,
                                                         d_begin_offsets,
                                                         d_end_offsets,
                                                         begin_bit,
                                                         end_bit,
                                                         is_overwrite,
                                                         stream);
  if (error == cudaSuccess && is_overwrite)
  {
    // Only write to selector in the DoubleBuffer invocation
    *selector = keys_buffer.Current() != d_keys_out;
  }
  return error;
}

// %PARAM% TEST_LAUNCH lid 0:1:2
// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64

DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys);
DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending);

// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64
DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper<true>, dispatch_segmented_radix_sort_descending);
DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper<false>, dispatch_segmented_radix_sort);

// TODO:
// - int128
Expand All @@ -62,6 +114,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_k
using key_types = c2h::type_list<cuda::std::uint8_t, cuda::std::int8_t, bool, char>;
using bit_window_key_types = c2h::type_list<cuda::std::uint8_t, cuda::std::int8_t, char>;
# define NO_FP_KEY_TYPES
# define SINGLE_TEST_CASE_INSTANTIATION
#elif TEST_KEY_BITS == 16
// clang-format off
using key_types = c2h::type_list<
Expand Down Expand Up @@ -90,9 +143,6 @@ using fp_key_types = c2h::type_list<double>;
// Used for tests that just need a single type for testing:
using single_key_type = c2h::type_list<c2h::get<0, key_types>>;

// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing",
"[keys][segmented][radix][sort][device]",
key_types,
Expand Down Expand Up @@ -153,6 +203,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing",
REQUIRE((ref_keys == out_keys) == true);
}

#if defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][radix][sort][device]", single_key_type)
{
using key_t = c2h::get<0, TestType>;
Expand Down Expand Up @@ -207,6 +259,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][ra
REQUIRE((ref_keys == out_keys) == true);
}

#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows",
"[keys][segmented][radix][sort][device]",
bit_window_key_types)
Expand Down Expand Up @@ -276,6 +330,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows",
REQUIRE((ref_keys == out_keys) == true);
}

#if defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: large segments", "[keys][segmented][radix][sort][device]", single_key_type)
{
using key_t = c2h::get<0, TestType>;
Expand Down Expand Up @@ -457,3 +513,158 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: unspecified ranges",

REQUIRE((ref_keys == out_keys) == true);
}

# if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)

// Stress test: num_items exceeds 2^32 for 64-bit offset types (saturates the offset type otherwise),
// with ~num_items/step segments generated lazily via transform iterator to keep the memory footprint low.
// Function-try-block: a std::bad_alloc during setup skips the test instead of failing it.
CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large num. items and num. segments",
         "[keys][segmented][radix][sort][device]",
         all_offset_types)
try
{
  using key_t                = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
  using offset_t             = c2h::get<0, TestType>;
  constexpr std::size_t step = 500;
  using segment_iterator_t   = segment_iterator<offset_t, step>;
  constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
  constexpr int num_key_seeds      = 1;
  const bool is_descending         = GENERATE(false, true);
  const bool is_overwrite          = GENERATE(false, true);
  // > 2^32 items for 64-bit offsets to exercise the wide-offset code path; offset_t max otherwise
  constexpr std::size_t num_items =
    (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
  const std::size_t num_segments = ::cuda::ceil_div(num_items, step);
  CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);

  c2h::device_vector<key_t> in_keys(num_items);
  c2h::device_vector<key_t> out_keys(num_items);
  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
  // Segment offsets are computed on the fly instead of materializing num_segments + 1 offsets
  auto offsets =
    thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});
  auto offsets_plus_1 = offsets + 1;
  // Allocate host/device-accessible memory to communicate the selected output buffer
  bool* selector_ptr = nullptr;
  if (is_overwrite)
  {
    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
  }

  auto ref_keys     = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1);
  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
  if (is_descending)
  {
    dispatch_segmented_radix_sort_descending(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      offsets,
      offsets_plus_1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  else
  {
    dispatch_segmented_radix_sort(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      offsets,
      offsets_plus_1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  if (is_overwrite)
  {
    // In the DoubleBuffer (overwrite) invocation the result may live in either buffer
    if (*selector_ptr)
    {
      std::swap(out_keys, in_keys);
    }
    REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
  }
  REQUIRE(ref_keys == out_keys);
}
catch (std::bad_alloc& e)
{
  // Fixed typo: "unsufficient" -> "insufficient"
  std::cerr << "Skipping segmented radix sort test, insufficient GPU memory. " << e.what() << "\n";
}

// Stress test: a single segment containing (nearly) all items, to exercise per-segment sizes
// above 2^32 for 64-bit offset types. Function-try-block: std::bad_alloc skips rather than fails.
CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large segments",
         "[keys][segmented][radix][sort][device]",
         all_offset_types)
try
{
  using key_t                      = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
  using offset_t                   = c2h::get<0, TestType>;
  constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
  constexpr int num_key_seeds      = 1;
  const bool is_descending         = GENERATE(false, true);
  const bool is_overwrite          = GENERATE(false, true);
  // > 2^32 items for 64-bit offsets to exercise the wide-offset code path; offset_t max otherwise
  constexpr std::size_t num_items =
    (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
  const std::size_t num_segments = 2;
  CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);

  c2h::device_vector<key_t> in_keys(num_items);
  c2h::device_vector<key_t> out_keys(num_items);
  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
  // First segment holds all items, second segment is intentionally empty
  c2h::device_vector<offset_t> offsets(num_segments + 1);
  offsets[0] = 0;
  offsets[1] = static_cast<offset_t>(num_items);
  offsets[2] = static_cast<offset_t>(num_items);
  // Allocate host/device-accessible memory to communicate the selected output buffer
  bool* selector_ptr = nullptr;
  if (is_overwrite)
  {
    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
  }
  auto ref_keys     = segmented_radix_sort_reference(in_keys, is_descending, offsets);
  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
  // Begin offsets are passed as a raw pointer and end offsets as an iterator on purpose,
  // to cover mixed begin/end offset iterator types
  if (is_descending)
  {
    dispatch_segmented_radix_sort_descending(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      thrust::raw_pointer_cast(offsets.data()),
      offsets.cbegin() + 1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  else
  {
    dispatch_segmented_radix_sort(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      thrust::raw_pointer_cast(offsets.data()),
      offsets.cbegin() + 1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  if (is_overwrite)
  {
    // In the DoubleBuffer (overwrite) invocation the result may live in either buffer
    if (*selector_ptr)
    {
      std::swap(out_keys, in_keys);
    }
    REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
  }
  REQUIRE(ref_keys == out_keys);
}
catch (std::bad_alloc& e)
{
  // Fixed typo: "unsufficient" -> "insufficient"
  std::cerr << "Skipping segmented radix sort test, insufficient GPU memory. " << e.what() << "\n";
}

# endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)
#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)
Loading

0 comments on commit 5d45850

Please sign in to comment.