Skip to content

Commit

Permalink
Add CUB tests for segmented sort/radix sort with 64-bit num. items and segments (#2254)
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Sep 26, 2024
1 parent 99fb4b4 commit 5d45850
Show file tree
Hide file tree
Showing 6 changed files with 937 additions and 16 deletions.
48 changes: 46 additions & 2 deletions cub/test/catch2_radix_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#pragma once

// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/util_macro.cuh>
Expand All @@ -50,6 +51,49 @@
#include "c2h/vector.cuh"
#include "catch2_test_helper.h"

// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;
using all_offset_types =
c2h::type_list<cuda::std::int32_t, cuda::std::uint32_t, cuda::std::int64_t, cuda::std::uint64_t>;

// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory:
// segment offsets are computed on the fly from a counting iterator instead of being materialized.
//
// For most indices x the produced offset is x * Step, clamped to `last`. For the nine indices
// x == Step * k * 100 (k in [1, 9]) the offset is shifted by half a step so that segment sizes are
// not perfectly uniform. Note the irregular indices take precedence over the clamp (as in the
// original switch, whose cases were checked before the default branch).
template <typename OffsetT, OffsetT Step>
struct segment_iterator
{
  OffsetT last = 0; // clamp value for the regular offsets (total number of items)

  segment_iterator(OffsetT last1)
      : last{last1}
  {}

  __host__ __device__ OffsetT operator()(OffsetT x) const
  {
    // Equivalent to the former 9-way switch (case Step*100 ... case Step*900, each returning the
    // case value + Step / 2): those indices are exactly the multiples of Step * 100 in
    // [Step * 100, Step * 900].
    constexpr OffsetT irregular_step = Step * 100;
    if (x >= irregular_step && x <= Step * 900 && x % irregular_step == 0)
    {
      return x + Step / 2;
    }
    return (x >= last) ? last : x * Step;
  }
};

// The launchers defined in catch2_test_launch_helper.h do not support
// passing objects by reference since the device-launch tests cannot
// pass references to a __global__ function. The DoubleBuffer object
Expand Down Expand Up @@ -446,7 +490,7 @@ std::pair<c2h::host_vector<KeyT>, c2h::host_vector<ValueT>> segmented_radix_sort
}

template <typename OffsetT>
struct offset_scan_op_t
struct radix_offset_scan_op_t
{
OffsetT num_items;

Expand All @@ -470,5 +514,5 @@ void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector<OffsetT>& off
offsets.end(),
offsets.begin(),
OffsetT{0},
offset_scan_op_t<OffsetT>{static_cast<OffsetT>(num_items)});
radix_offset_scan_op_t<OffsetT>{static_cast<OffsetT>(num_items)});
}
2 changes: 1 addition & 1 deletion cub/test/catch2_segmented_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT
#include <cub/device/device_segmented_sort.cuh>

#include <thrust/device_ptr.h>
Expand Down
223 changes: 217 additions & 6 deletions cub/test/catch2_test_device_segmented_radix_sort_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "insert_nested_NVTX_range_guard.h"
// above header needs to be included first

#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/device/dispatch/dispatch_radix_sort.cuh> // DispatchSegmentedRadixSort
#include <cub/util_type.cuh>

#include <thrust/functional.h>
Expand All @@ -46,12 +46,64 @@
#include "catch2_test_helper.h"
#include "catch2_test_launch_helper.h"

// TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is
// addressed. Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset
// types.
//
// When `is_overwrite` is true, the DoubleBuffer entry point is exercised and `*selector` records whether the
// sorted keys ended up in the buffer other than `d_keys_out` (so the caller knows which buffer to read).
template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  NumItemsT num_items,
  NumItemsT num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  bool* selector,
  int begin_bit       = 0,
  int end_bit         = sizeof(KeyT) * 8,
  bool is_overwrite   = true,
  cudaStream_t stream = 0)
{
  // Keys-only sort: values are NullType. The keys buffer aliases the caller's in/out pointers.
  cub::DoubleBuffer<KeyT> keys_buffer(const_cast<KeyT*>(d_keys_in), d_keys_out);
  cub::DoubleBuffer<cub::NullType> values_buffer;
  const cudaError_t error =
    cub::DispatchSegmentedRadixSort<IS_DESCENDING,
                                    KeyT,
                                    cub::NullType,
                                    BeginOffsetIteratorT,
                                    EndOffsetIteratorT,
                                    NumItemsT>::Dispatch(d_temp_storage,
                                                         temp_storage_bytes,
                                                         keys_buffer,
                                                         values_buffer,
                                                         num_items,
                                                         num_segments,
                                                         d_begin_offsets,
                                                         d_end_offsets,
                                                         begin_bit,
                                                         end_bit,
                                                         is_overwrite,
                                                         stream);
  if (error == cudaSuccess && is_overwrite)
  {
    // Only write to selector in the DoubleBuffer invocation
    *selector = keys_buffer.Current() != d_keys_out;
  }
  return error;
}

// %PARAM% TEST_LAUNCH lid 0:1:2
// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64

DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys);
DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending);

// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64
DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper<true>, dispatch_segmented_radix_sort_descending);
DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper<false>, dispatch_segmented_radix_sort);

// TODO:
// - int128
Expand All @@ -62,6 +114,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_k
using key_types = c2h::type_list<cuda::std::uint8_t, cuda::std::int8_t, bool, char>;
using bit_window_key_types = c2h::type_list<cuda::std::uint8_t, cuda::std::int8_t, char>;
# define NO_FP_KEY_TYPES
# define SINGLE_TEST_CASE_INSTANTIATION
#elif TEST_KEY_BITS == 16
// clang-format off
using key_types = c2h::type_list<
Expand Down Expand Up @@ -90,9 +143,6 @@ using fp_key_types = c2h::type_list<double>;
// Used for tests that just need a single type for testing:
using single_key_type = c2h::type_list<c2h::get<0, key_types>>;

// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing",
"[keys][segmented][radix][sort][device]",
key_types,
Expand Down Expand Up @@ -153,6 +203,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing",
REQUIRE((ref_keys == out_keys) == true);
}

#if defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][radix][sort][device]", single_key_type)
{
using key_t = c2h::get<0, TestType>;
Expand Down Expand Up @@ -207,6 +259,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][ra
REQUIRE((ref_keys == out_keys) == true);
}

#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows",
"[keys][segmented][radix][sort][device]",
bit_window_key_types)
Expand Down Expand Up @@ -276,6 +330,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows",
REQUIRE((ref_keys == out_keys) == true);
}

#if defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: large segments", "[keys][segmented][radix][sort][device]", single_key_type)
{
using key_t = c2h::get<0, TestType>;
Expand Down Expand Up @@ -457,3 +513,158 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: unspecified ranges",

REQUIRE((ref_keys == out_keys) == true);
}

# if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)

// Stress test: num_items exceeds 2^32 for 64-bit offset types (saturates the offset type otherwise),
// with ~num_items/step segments generated lazily via transform iterator to keep the memory footprint low.
// Function-try-block: a std::bad_alloc during setup skips the test instead of failing it.
CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large num. items and num. segments",
         "[keys][segmented][radix][sort][device]",
         all_offset_types)
try
{
  using key_t                = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
  using offset_t             = c2h::get<0, TestType>;
  constexpr std::size_t step = 500;
  using segment_iterator_t   = segment_iterator<offset_t, step>;
  constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
  constexpr int num_key_seeds      = 1;
  const bool is_descending         = GENERATE(false, true);
  const bool is_overwrite          = GENERATE(false, true);
  // > 2^32 items for 64-bit offsets to exercise the wide-offset code path; offset_t max otherwise
  constexpr std::size_t num_items =
    (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
  const std::size_t num_segments = ::cuda::ceil_div(num_items, step);
  CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);

  c2h::device_vector<key_t> in_keys(num_items);
  c2h::device_vector<key_t> out_keys(num_items);
  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
  // Segment offsets are computed on the fly instead of materializing num_segments + 1 offsets
  auto offsets =
    thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});
  auto offsets_plus_1 = offsets + 1;
  // Allocate host/device-accessible memory to communicate the selected output buffer
  bool* selector_ptr = nullptr;
  if (is_overwrite)
  {
    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
  }

  auto ref_keys     = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1);
  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
  if (is_descending)
  {
    dispatch_segmented_radix_sort_descending(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      offsets,
      offsets_plus_1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  else
  {
    dispatch_segmented_radix_sort(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      offsets,
      offsets_plus_1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  if (is_overwrite)
  {
    // In the DoubleBuffer (overwrite) invocation the result may live in either buffer
    if (*selector_ptr)
    {
      std::swap(out_keys, in_keys);
    }
    REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
  }
  REQUIRE(ref_keys == out_keys);
}
catch (std::bad_alloc& e)
{
  // Fixed typo: "unsufficient" -> "insufficient"
  std::cerr << "Skipping segmented radix sort test, insufficient GPU memory. " << e.what() << "\n";
}

// Stress test: a single segment containing (nearly) all items, to exercise per-segment sizes
// above 2^32 for 64-bit offset types. Function-try-block: std::bad_alloc skips rather than fails.
CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large segments",
         "[keys][segmented][radix][sort][device]",
         all_offset_types)
try
{
  using key_t                      = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
  using offset_t                   = c2h::get<0, TestType>;
  constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
  constexpr int num_key_seeds      = 1;
  const bool is_descending         = GENERATE(false, true);
  const bool is_overwrite          = GENERATE(false, true);
  // > 2^32 items for 64-bit offsets to exercise the wide-offset code path; offset_t max otherwise
  constexpr std::size_t num_items =
    (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
  const std::size_t num_segments = 2;
  CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);

  c2h::device_vector<key_t> in_keys(num_items);
  c2h::device_vector<key_t> out_keys(num_items);
  c2h::gen(CUB_SEED(num_key_seeds), in_keys);
  // First segment holds all items, second segment is intentionally empty
  c2h::device_vector<offset_t> offsets(num_segments + 1);
  offsets[0] = 0;
  offsets[1] = static_cast<offset_t>(num_items);
  offsets[2] = static_cast<offset_t>(num_items);
  // Allocate host/device-accessible memory to communicate the selected output buffer
  bool* selector_ptr = nullptr;
  if (is_overwrite)
  {
    REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
  }
  auto ref_keys     = segmented_radix_sort_reference(in_keys, is_descending, offsets);
  auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
  // Begin offsets are passed as a raw pointer and end offsets as an iterator on purpose,
  // to cover mixed begin/end offset iterator types
  if (is_descending)
  {
    dispatch_segmented_radix_sort_descending(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      thrust::raw_pointer_cast(offsets.data()),
      offsets.cbegin() + 1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  else
  {
    dispatch_segmented_radix_sort(
      thrust::raw_pointer_cast(in_keys.data()),
      out_keys_ptr,
      static_cast<offset_t>(num_items),
      static_cast<offset_t>(num_segments),
      thrust::raw_pointer_cast(offsets.data()),
      offsets.cbegin() + 1,
      selector_ptr,
      begin_bit<key_t>(),
      end_bit<key_t>(),
      is_overwrite);
  }
  if (is_overwrite)
  {
    // In the DoubleBuffer (overwrite) invocation the result may live in either buffer
    if (*selector_ptr)
    {
      std::swap(out_keys, in_keys);
    }
    REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
  }
  REQUIRE(ref_keys == out_keys);
}
catch (std::bad_alloc& e)
{
  // Fixed typo: "unsufficient" -> "insufficient"
  std::cerr << "Skipping segmented radix sort test, insufficient GPU memory. " << e.what() << "\n";
}

# endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)
#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)
Loading

0 comments on commit 5d45850

Please sign in to comment.