diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index 4bdcec3992a..0ea95f270a4 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -27,6 +27,7 @@ #pragma once +// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT #include #include #include @@ -50,6 +51,49 @@ #include "c2h/vector.cuh" #include "catch2_test_helper.h" +// Index types used for OffsetsT testing +using offset_types = c2h::type_list; +using all_offset_types = + c2h::type_list; + +// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory +template +struct segment_iterator +{ + OffsetT last = 0; + + segment_iterator(OffsetT last1) + : last{last1} + {} + + __host__ __device__ OffsetT operator()(OffsetT x) const + { + switch (x) + { + case Step * 100: + return Step * 100 + Step / 2; + case Step * 200: + return Step * 200 + Step / 2; + case Step * 300: + return Step * 300 + Step / 2; + case Step * 400: + return Step * 400 + Step / 2; + case Step * 500: + return Step * 500 + Step / 2; + case Step * 600: + return Step * 600 + Step / 2; + case Step * 700: + return Step * 700 + Step / 2; + case Step * 800: + return Step * 800 + Step / 2; + case Step * 900: + return Step * 900 + Step / 2; + default: + return (x >= last) ? last : x * Step; + } + } +}; + // The launchers defined in catch2_test_launch_helper.h do not support // passing objects by reference since the device-launch tests cannot // pass references to a __global__ function. The DoubleBuffer object @@ -446,7 +490,7 @@ std::pair, c2h::host_vector> segmented_radix_sort } template -struct offset_scan_op_t +struct radix_offset_scan_op_t { OffsetT num_items; @@ -470,5 +514,5 @@ void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector& off offsets.end(), offsets.begin(), OffsetT{0}, - offset_scan_op_t{static_cast(num_items)}); + radix_offset_scan_op_t{static_cast(num_items)}); } diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 48929143055..401f02d71ea 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -24,9 +24,9 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - #pragma once +// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT #include #include diff --git a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu index 70c5a63f2f8..01c92c8456f 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu @@ -24,11 +24,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first #include +#include // DispatchSegmentedRadixSort #include #include @@ -46,12 +46,64 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" +// TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is +// addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset +// types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, + bool is_overwrite = true, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_values; + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + auto status = cub::DispatchSegmentedRadixSort< + IS_DESCENDING, + KeyT, + cub::NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, // + NumItemsT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1:2 +// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64 DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending); - -// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64 +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort); // TODO: // - int128 @@ -62,6 +114,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_k using key_types = c2h::type_list; using bit_window_key_types = c2h::type_list; # define NO_FP_KEY_TYPES +# define SINGLE_TEST_CASE_INSTANTIATION #elif TEST_KEY_BITS == 16 // clang-format off using key_types = c2h::type_list< @@ -90,9 +143,6 @@ using fp_key_types = c2h::type_list; // Used for tests that just need a single type for testing: using single_key_type = c2h::type_list>; -// Index types used for OffsetsT testing -using offset_types = c2h::type_list; - CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing", "[keys][segmented][radix][sort][device]", key_types, @@ -153,6 +203,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing", REQUIRE((ref_keys == out_keys) == true); } +#if defined(SINGLE_TEST_CASE_INSTANTIATION) + CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][radix][sort][device]", single_key_type) { using key_t = c2h::get<0, TestType>; @@ -207,6 +259,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][ra REQUIRE((ref_keys == out_keys) == true); } +#endif // defined(SINGLE_TEST_CASE_INSTANTIATION) + CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows", "[keys][segmented][radix][sort][device]", bit_window_key_types) @@ -276,6 +330,8 @@ 
CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows", REQUIRE((ref_keys == out_keys) == true); } +#if defined(SINGLE_TEST_CASE_INSTANTIATION) + CUB_TEST("DeviceSegmentedRadixSort::SortKeys: large segments", "[keys][segmented][radix][sort][device]", single_key_type) { using key_t = c2h::get<0, TestType>; @@ -457,3 +513,158 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: unspecified ranges", REQUIRE((ref_keys == out_keys) == true); } + +# if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large num. items and num. segments", + "[keys][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = ::cuda::ceil_div(num_items, step); + CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector out_keys(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + auto offsets = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + auto offsets_plus_1 = offsets + 1; + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1); + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + if (is_descending) + { + dispatch_segmented_radix_sort_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + else + { + dispatch_segmented_radix_sort( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE(ref_keys == out_keys); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large segments", + "[keys][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
+  const std::size_t num_segments = 2;
+  CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);
+
+  c2h::device_vector<key_t> in_keys(num_items);
+  c2h::device_vector<key_t> out_keys(num_items);
+  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
+  c2h::device_vector<offset_t> offsets(num_segments + 1);
+  offsets[0] = 0;
+  offsets[1] = static_cast<offset_t>(num_items);
+  offsets[2] = static_cast<offset_t>(num_items);
+  // Allocate host/device-accessible memory to communicate the selected output buffer
+  bool* selector_ptr = nullptr;
+  if (is_overwrite)
+  {
+    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
+  }
+  auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets);
+  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
+  if (is_descending)
+  {
+    dispatch_segmented_radix_sort_descending(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      begin_bit<key_t>(),
+      end_bit<key_t>(),
+      is_overwrite);
+  }
+  else
+  {
+    dispatch_segmented_radix_sort(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      begin_bit<key_t>(),
+      end_bit<key_t>(),
+      is_overwrite);
+  }
+  if (is_overwrite)
+  {
+    if (*selector_ptr)
+    {
+      std::swap(out_keys, in_keys);
+    }
+    REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
+  }
+  REQUIRE(ref_keys == out_keys);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Skipping segmented radix sort test, insufficient GPU memory. " << e.what() << "\n";
+}
+
+# endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)
+#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)
diff --git a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu
index 10237a6460b..b4198ade6e2 100644
--- a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu
@@ -24,11 +24,11 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first #include +#include // DispatchSegmentedRadixSort #include #include @@ -43,17 +43,76 @@ #include "catch2_test_launch_helper.h" #include "thrust/detail/raw_pointer_cast.h" +// TODO replace with DeviceSegmentedRadixSort::SortPairs interface once https://github.com/NVIDIA/cccl/issues/50 is +// addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset +// types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_pairs_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, + bool is_overwrite = true, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + auto status = cub::DispatchSegmentedRadixSort< + IS_DESCENDING, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, // + NumItemsT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1:2 DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairs, sort_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairsDescending, sort_pairs_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, + dispatch_segmented_radix_sort_pairs_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, dispatch_segmented_radix_sort_pairs); using custom_value_t = c2h::custom_type_t; using value_types = c2h::type_list; // Index types used for OffsetsT testing -using offset_types = c2h::type_list; - CUB_TEST("DeviceSegmentedRadixSort::SortPairs: Basic testing", "[pairs][segmented][radix][sort][device]", value_types, @@ -278,3 +337,192 @@ CUB_TEST("DeviceSegmentedRadixSort::SortPairs: unspecified ranges", REQUIRE((ref_keys == out_keys) == true); REQUIRE((ref_values == out_values) == true); } + +#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +CUB_TEST("DeviceSegmentedRadixSort::SortPairs: very large num. items and num. segments", + "[pairs][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t Step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); + CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector in_values(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + c2h::device_vector out_keys(num_items); + c2h::device_vector out_values(num_items); + auto offsets = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + auto offsets_plus_1 = offsets + 1; + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); + auto& ref_keys = refs.first; + auto& ref_values = refs.second; + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); + if (is_descending) + { + dispatch_segmented_radix_sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + else + { + dispatch_segmented_radix_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + // Mix pointers/iterators for segment info to test using different iterable types: + offsets, + offsets_plus_1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +CUB_TEST("DeviceSegmentedRadixSort::SortPairs: very large segments", + "[pairs][segmented][radix][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max : ::cuda::std::numeric_limits::max(); + constexpr std::size_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector in_values(num_items); + c2h::device_vector out_keys(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + c2h::device_vector out_values(num_items); + c2h::device_vector offsets(num_segments + 1); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto refs = segmented_radix_sort_reference( + in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1); + auto& ref_keys = refs.first; + auto& ref_values = refs.second; + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); + if (is_descending) + { + dispatch_segmented_radix_sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + else + { + dispatch_segmented_radix_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + begin_bit(), + end_bit(), + is_overwrite); + } + if (out_keys_ptr != thrust::raw_pointer_cast(out_keys.data())) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_device_segmented_sort_keys.cu b/cub/test/catch2_test_device_segmented_sort_keys.cu index 4be32f66c89..28404aade8f 100644 --- a/cub/test/catch2_test_device_segmented_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_sort_keys.cu @@ -24,18 +24,63 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first - +#include "catch2_radix_sort_helper.cuh" #include #include // FIXME: Graph launch disabled, algorithm syncs internally. WAR exists for device-launch, figure out how to enable for // graph launch. 
+// TODO replace with DeviceSegmentedSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is addressed +// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + bool is_overwrite = false, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values; + auto status = + cub::DispatchSegmentedSort:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1 +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort); + using key_types = c2h::type_list; test_unspecified_segments_random(CUB_SEED(4)); } + +#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +// we can reuse the same structure of DeviceSegmentedRadixSortKeys for simplicity +CUB_TEST("DeviceSegmentedSortKeys: very large num. items and num. segments", + "[keys][segmented][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t Step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
+  const std::size_t num_segments = ::cuda::ceil_div(num_items, Step);
+  CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);
+
+  c2h::device_vector<key_t> in_keys(num_items);
+  c2h::device_vector<key_t> out_keys(num_items);
+  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
+  auto offsets =
+    thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});
+  auto offsets_plus_1 = offsets + 1;
+  // Allocate host/device-accessible memory to communicate the selected output buffer
+  bool* selector_ptr = nullptr;
+  if (is_overwrite)
+  {
+    REQUIRE(cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)) == cudaSuccess);
+  }
+
+  auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1);
+  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
+  if (is_descending)
+  {
+    dispatch_segmented_sort_descending(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      offsets,
+      offsets_plus_1,
+      selector_ptr,
+      is_overwrite);
+  }
+  else
+  {
+    dispatch_segmented_sort(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      offsets,
+      offsets_plus_1,
+      selector_ptr,
+      is_overwrite);
+  }
+  if (is_overwrite)
+  {
+    if (*selector_ptr)
+    {
+      std::swap(out_keys, in_keys);
+    }
+    REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
+  }
+  REQUIRE((ref_keys == out_keys) == true);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Skipping segmented sort test, insufficient GPU memory. " << e.what() << "\n";
+}
+
+CUB_TEST("DeviceSegmentedSort::SortKeys: very large segments", "[keys][segmented][sort][device]", all_offset_types)
+try
+{
+  using key_t    = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
+  using offset_t = c2h::get<0, TestType>;
+  constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<cuda::std::uint32_t>::max();
+  constexpr int num_key_seeds = 1;
+  const bool is_descending = GENERATE(false, true);
+  const bool is_overwrite  = GENERATE(false, true);
+  constexpr std::size_t num_items =
+    (sizeof(offset_t) == 8) ?
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector out_keys(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::device_vector offsets(num_segments + 1); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + + // Allocate host/device-accessible memory to communicate the selected output buffer + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets); + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + if (is_descending) + { + dispatch_segmented_sort_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + is_overwrite); + } + else + { + dispatch_segmented_sort( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1, + selector_ptr, + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + } + REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); + } + REQUIRE((ref_keys == out_keys) == true); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index 144ea13e762..182a4cfd960 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -24,8 +24,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - -#include "insert_nested_NVTX_range_guard.h" +#include "catch2_radix_sort_helper.cuh" // above header needs to be included first #include @@ -34,8 +33,61 @@ // FIXME: Graph launch disabled, algorithm syncs internally. WAR exists for device-launch, figure out how to enable for // graph launch. 
+// TODO replace with DeviceSegmentedSort::SortPairs interface once https://github.com/NVIDIA/cccl/issues/50 is addressed +// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_pairs_wrapper( + void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + NumItemsT num_items, + NumItemsT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + bool* selector, + bool is_overwrite = false, + cudaStream_t stream = 0) +{ + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + auto status = cub:: + DispatchSegmentedSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + is_overwrite, + stream); + if (status != cudaSuccess) + { + return status; + } + if (is_overwrite) + { + // Only write to selector in the DoubleBuffer invocation + *selector = d_keys.Current() != d_keys_out; + } + return cudaSuccess; +} + // %PARAM% TEST_LAUNCH lid 0:1 +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs); + using pair_types = c2h::type_list, c2h::type_list, @@ -198,3 +250,178 @@ CUB_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values", test_unspecified_segments_random(CUB_SEED(4)); } + +#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) + +// we can reuse the same structure of DeviceSegmentedRadixSortPairs for simplicity +CUB_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments", + "[pairs][segmented][sort][device]", + all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t Step = 500; + using segment_iterator_t = segment_iterator; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); + const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); + CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + + c2h::device_vector in_keys(num_items); + c2h::device_vector in_values(num_items); + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + + // Initialize the output vectors by copying the inputs since not all items may belong to a segment. 
+ c2h::device_vector out_keys(num_items); + c2h::device_vector out_values(num_items); + auto offsets = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + auto offsets_plus_1 = offsets + 1; + bool* selector_ptr = nullptr; + if (is_overwrite) + { + REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); + } + + auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); + auto& ref_keys = refs.first; + auto& ref_values = refs.second; + auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); + auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); + if (is_descending) + { + dispatch_segmented_sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + is_overwrite); + } + else + { + dispatch_segmented_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + out_keys_ptr, + thrust::raw_pointer_cast(in_values.data()), + out_values_ptr, + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1, + selector_ptr, + is_overwrite); + } + if (is_overwrite) + { + if (*selector_ptr) + { + std::swap(out_keys, in_keys); + std::swap(out_values, in_values); + } + REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); + } + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} +catch (std::bad_alloc& e) +{ + std::cerr << "Skipping segmented sort test, unsufficient GPU memory. " << e.what() << "\n"; +} + +CUB_TEST("DeviceSegmentedSort::SortPairs: very large segments", "[pairs][segmented][sort][device]", all_offset_types) +try +{ + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using offset_t = c2h::get<0, TestType>; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); + constexpr int num_key_seeds = 1; + constexpr int num_value_seeds = 1; + const bool is_descending = GENERATE(false, true); + const bool is_overwrite = GENERATE(false, true); + constexpr std::size_t num_items = + (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
+  constexpr std::size_t num_segments = 2;
+  CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);
+
+  c2h::device_vector<key_t> in_keys(num_items);
+  c2h::device_vector<value_t> in_values(num_items);
+  c2h::device_vector<key_t> out_keys(num_items);
+  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
+  c2h::gen(CUB_SEED(num_value_seeds), in_values);
+  c2h::device_vector<value_t> out_values(num_items);
+  c2h::device_vector<offset_t> offsets(num_segments + 1);
+  offsets[0] = 0;
+  offsets[1] = static_cast<offset_t>(num_items);
+  offsets[2] = static_cast<offset_t>(num_items);
+  bool* selector_ptr = nullptr;
+  if (is_overwrite)
+  {
+    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
+  }
+
+  auto refs = segmented_radix_sort_reference(
+    in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1);
+  auto& ref_keys      = refs.first;
+  auto& ref_values    = refs.second;
+  auto out_keys_ptr   = thrust::raw_pointer_cast(out_keys.data());
+  auto out_values_ptr = thrust::raw_pointer_cast(out_values.data());
+  if (is_descending)
+  {
+    dispatch_segmented_sort_pairs_descending(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      thrust::raw_pointer_cast(in_values.data()),
+      out_values_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      is_overwrite);
+  }
+  else
+  {
+    dispatch_segmented_sort_pairs(
+      thrust::raw_pointer_cast(in_keys.data()),
+      out_keys_ptr,
+      thrust::raw_pointer_cast(in_values.data()),
+      out_values_ptr,
+      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_segments),
+      thrust::raw_pointer_cast(offsets.data()),
+      offsets.cbegin() + 1,
+      selector_ptr,
+      is_overwrite);
+  }
+  if (is_overwrite)
+  {
+    if (*selector_ptr)
+    {
+      std::swap(out_keys, in_keys);
+      std::swap(out_values, in_values);
+    }
+    REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess);
+  }
+  REQUIRE(ref_keys == out_keys);
+  REQUIRE(ref_values == out_values);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Skipping segmented sort test, insufficient GPU memory. " << e.what() << "\n";
+}
+
+#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)