Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUB tests for segmented sort/radix sort with 64-bit num. items and segments #2254

Merged
merged 51 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
5bff5ed
add segmented [radix] sort tests for 64-bit indices
fbusato Aug 15, 2024
62ac106
Add test for device segmented sort pairs with 64-bit indices
fbusato Aug 16, 2024
994da36
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 16, 2024
dd92bd9
Add a macro guard to protect tests that are not supported yet
fbusato Aug 27, 2024
06ef7c0
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 28, 2024
cf14d60
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 29, 2024
ea067ad
Extend segmented radix sort/sort tests to all offset types, added ove…
fbusato Aug 30, 2024
7fed29d
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 30, 2024
c22e7c1
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Aug 30, 2024
df5230e
Add very large segment case
fbusato Aug 30, 2024
c845c90
Merge branch 'cub/test/64bit_segmented_sort' of github.com:fbusato/cc…
fbusato Aug 30, 2024
dd5b776
Fix formatting issue
fbusato Aug 30, 2024
8d080c1
remove enable macro
fbusato Sep 3, 2024
7983212
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 3, 2024
51921f1
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 4, 2024
58a0fce
Update cub/test/catch2_test_device_segmented_radix_sort_keys.cu
fbusato Sep 6, 2024
17b3fef
Update cub/test/catch2_test_device_segmented_radix_sort_keys.cu
fbusato Sep 6, 2024
24677e5
Fix potential bug for is_override
fbusato Sep 6, 2024
5dd9464
fix formatting issues
fbusato Sep 6, 2024
4db3a55
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 6, 2024
8944ff2
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 10, 2024
281dbc6
move reference computation before kernel call
fbusato Sep 10, 2024
27bc2d0
fix pointer references for correctness checking
fbusato Sep 10, 2024
88efaea
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 10, 2024
9103bbd
remove redundant test cases
fbusato Sep 10, 2024
4eaa1dd
Merge branch 'cub/test/64bit_segmented_sort' of github.com:fbusato/cc…
fbusato Sep 10, 2024
cce6253
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 11, 2024
2166bb7
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
a42cbc6
Update cub/test/catch2_test_device_segmented_sort_pairs.cu
fbusato Sep 20, 2024
c45de31
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 20, 2024
19caa47
Update cub/test/catch2_test_device_segmented_sort_pairs.cu
fbusato Sep 20, 2024
5c056f0
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
4c9be63
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
fd34b05
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
fe82e30
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
7bef872
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
6490db8
Update cub/test/catch2_test_device_segmented_sort_pairs.cu
fbusato Sep 20, 2024
c983761
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
284b998
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
4e9e852
Update cub/test/catch2_test_device_segmented_sort_keys.cu
fbusato Sep 20, 2024
783beb0
Update cub/test/catch2_test_device_segmented_sort_pairs.cu
fbusato Sep 20, 2024
f35fcc0
Update cub/test/catch2_test_device_segmented_sort_pairs.cu
fbusato Sep 20, 2024
e11b57d
propagate cuda managed memory pointer selector
fbusato Sep 21, 2024
0866e80
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 25, 2024
10f656b
fix run-time bugs
fbusato Sep 25, 2024
3d559ae
Merge branch 'cub/test/64bit_segmented_sort' of github.com:fbusato/cc…
fbusato Sep 25, 2024
17d3840
fix formatting
fbusato Sep 25, 2024
b21b78e
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 25, 2024
34684df
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 25, 2024
459a77a
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 26, 2024
f619a3d
Merge branch 'main' into cub/test/64bit_segmented_sort
fbusato Sep 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 46 additions & 2 deletions cub/test/catch2_radix_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#pragma once

// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/util_macro.cuh>
Expand All @@ -50,6 +51,49 @@
#include "c2h/vector.cuh"
#include "catch2_test_helper.h"

// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;
using all_offset_types =
c2h::type_list<cuda::std::int32_t, cuda::std::uint32_t, cuda::std::int64_t, cuda::std::uint64_t>;

// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory
template <typename OffsetT, OffsetT Step>
struct segment_iterator
{
OffsetT last = 0;

segment_iterator(OffsetT last1)
: last{last1}
{}

__host__ __device__ OffsetT operator()(OffsetT x) const
{
switch (x)
{
case Step * 100:
return Step * 100 + Step / 2;
case Step * 200:
return Step * 200 + Step / 2;
case Step * 300:
return Step * 300 + Step / 2;
case Step * 400:
return Step * 400 + Step / 2;
case Step * 500:
return Step * 500 + Step / 2;
case Step * 600:
return Step * 600 + Step / 2;
case Step * 700:
return Step * 700 + Step / 2;
case Step * 800:
return Step * 800 + Step / 2;
case Step * 900:
return Step * 900 + Step / 2;
default:
return (x >= last) ? last : x * Step;
}
}
};

// The launchers defined in catch2_test_launch_helper.h do not support
// passing objects by reference since the device-launch tests cannot
// pass references to a __global__ function. The DoubleBuffer object
Expand Down Expand Up @@ -446,7 +490,7 @@ std::pair<c2h::host_vector<KeyT>, c2h::host_vector<ValueT>> segmented_radix_sort
}

template <typename OffsetT>
struct offset_scan_op_t
struct radix_offset_scan_op_t
{
OffsetT num_items;

Expand All @@ -470,5 +514,5 @@ void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector<OffsetT>& off
offsets.end(),
offsets.begin(),
OffsetT{0},
offset_scan_op_t<OffsetT>{static_cast<OffsetT>(num_items)});
radix_offset_scan_op_t<OffsetT>{static_cast<OffsetT>(num_items)});
}
2 changes: 1 addition & 1 deletion cub/test/catch2_segmented_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT
#include <cub/device/device_segmented_sort.cuh>

#include <thrust/device_ptr.h>
Expand Down
223 changes: 217 additions & 6 deletions cub/test/catch2_test_device_segmented_radix_sort_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "insert_nested_NVTX_range_guard.h"
// above header needs to be included first

#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/device/dispatch/dispatch_radix_sort.cuh> // DispatchSegmentedRadixSort
#include <cub/util_type.cuh>

#include <thrust/functional.h>
Expand All @@ -46,12 +46,64 @@
#include "catch2_test_helper.h"
#include "catch2_test_launch_helper.h"

// TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is
// addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset
// types
template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper(
void* d_temp_storage,
size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
NumItemsT num_items,
NumItemsT num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
bool* selector,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
bool is_overwrite = true,
cudaStream_t stream = 0)
{
cub::DoubleBuffer<cub::NullType> d_values;
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
auto status = cub::DispatchSegmentedRadixSort<
IS_DESCENDING,
KeyT,
cub::NullType,
BeginOffsetIteratorT,
EndOffsetIteratorT, //
NumItemsT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
is_overwrite,
elstehle marked this conversation as resolved.
Show resolved Hide resolved
stream);
if (status != cudaSuccess)
{
return status;
}
if (is_overwrite)
{
// Only write to selector in the DoubleBuffer invocation
*selector = d_keys.Current() != d_keys_out;
}
return cudaSuccess;
}

// %PARAM% TEST_LAUNCH lid 0:1:2
// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64
fbusato marked this conversation as resolved.
Show resolved Hide resolved

DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys);
DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending);

// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64
DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper<true>, dispatch_segmented_radix_sort_descending);
DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper<false>, dispatch_segmented_radix_sort);

// TODO:
// - int128
Expand All @@ -62,6 +114,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_k
using key_types = c2h::type_list<cuda::std::uint8_t, cuda::std::int8_t, bool, char>;
using bit_window_key_types = c2h::type_list<cuda::std::uint8_t, cuda::std::int8_t, char>;
# define NO_FP_KEY_TYPES
# define SINGLE_TEST_CASE_INSTANTIATION
#elif TEST_KEY_BITS == 16
// clang-format off
using key_types = c2h::type_list<
Expand Down Expand Up @@ -90,9 +143,6 @@ using fp_key_types = c2h::type_list<double>;
// Used for tests that just need a single type for testing:
using single_key_type = c2h::type_list<c2h::get<0, key_types>>;

// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing",
"[keys][segmented][radix][sort][device]",
key_types,
Expand Down Expand Up @@ -153,6 +203,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: basic testing",
REQUIRE((ref_keys == out_keys) == true);
}

#if defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][radix][sort][device]", single_key_type)
{
using key_t = c2h::get<0, TestType>;
Expand Down Expand Up @@ -207,6 +259,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: empty data", "[keys][segmented][ra
REQUIRE((ref_keys == out_keys) == true);
}

#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows",
"[keys][segmented][radix][sort][device]",
bit_window_key_types)
Expand Down Expand Up @@ -276,6 +330,8 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: bit windows",
REQUIRE((ref_keys == out_keys) == true);
}

#if defined(SINGLE_TEST_CASE_INSTANTIATION)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: large segments", "[keys][segmented][radix][sort][device]", single_key_type)
{
using key_t = c2h::get<0, TestType>;
Expand Down Expand Up @@ -457,3 +513,158 @@ CUB_TEST("DeviceSegmentedRadixSort::SortKeys: unspecified ranges",

REQUIRE((ref_keys == out_keys) == true);
}

# if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large num. items and num. segments",
"[keys][segmented][radix][sort][device]",
all_offset_types)
try
{
fbusato marked this conversation as resolved.
Show resolved Hide resolved
using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
using offset_t = c2h::get<0, TestType>;
constexpr std::size_t step = 500;
using segment_iterator_t = segment_iterator<offset_t, step>;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr int num_key_seeds = 1;
const bool is_descending = GENERATE(false, true);
const bool is_overwrite = GENERATE(false, true);
constexpr std::size_t num_items =
(sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
const std::size_t num_segments = ::cuda::ceil_div(num_items, step);
CAPTURE(c2h::type_name<offset_t>(), num_items, num_segments, is_descending, is_overwrite);

c2h::device_vector<key_t> in_keys(num_items);
c2h::device_vector<key_t> out_keys(num_items);
c2h::gen(CUB_SEED(num_key_seeds), in_keys);
auto offsets =
thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items});
auto offsets_plus_1 = offsets + 1;
// Allocate host/device-accessible memory to communicate the selected output buffer
bool* selector_ptr = nullptr;
if (is_overwrite)
{
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
}

auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1);
auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
if (is_descending)
{
dispatch_segmented_radix_sort_descending(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
selector_ptr,
begin_bit<key_t>(),
end_bit<key_t>(),
is_overwrite);
}
else
{
dispatch_segmented_radix_sort(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
offsets,
offsets_plus_1,
selector_ptr,
begin_bit<key_t>(),
end_bit<key_t>(),
is_overwrite);
}
if (is_overwrite)
{
if (*selector_ptr)
{
std::swap(out_keys, in_keys);
}
REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
}
REQUIRE(ref_keys == out_keys);
}
catch (std::bad_alloc& e)
{
std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n";
}

CUB_TEST("DeviceSegmentedRadixSort::SortKeys: very large segments",
"[keys][segmented][radix][sort][device]",
all_offset_types)
try
{
using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs
using offset_t = c2h::get<0, TestType>;
constexpr std::size_t uint32_max = ::cuda::std::numeric_limits<std::uint32_t>::max();
constexpr int num_key_seeds = 1;
const bool is_descending = GENERATE(false, true);
const bool is_overwrite = GENERATE(false, true);
constexpr std::size_t num_items =
(sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits<offset_t>::max();
const std::size_t num_segments = 2;
CAPTURE(c2h::type_name<offset_t>(), num_items, is_descending, is_overwrite);

c2h::device_vector<key_t> in_keys(num_items);
c2h::device_vector<key_t> out_keys(num_items);
c2h::gen(CUB_SEED(num_key_seeds), in_keys);
c2h::device_vector<offset_t> offsets(num_segments + 1);
offsets[0] = 0;
offsets[1] = static_cast<offset_t>(num_items);
offsets[2] = static_cast<offset_t>(num_items);
// Allocate host/device-accessible memory to communicate the selected output buffer
bool* selector_ptr = nullptr;
if (is_overwrite)
{
REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)));
}
auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets);
auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data());
if (is_descending)
{
dispatch_segmented_radix_sort_descending(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
begin_bit<key_t>(),
end_bit<key_t>(),
is_overwrite);
}
else
{
dispatch_segmented_radix_sort(
thrust::raw_pointer_cast(in_keys.data()),
out_keys_ptr,
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_segments),
thrust::raw_pointer_cast(offsets.data()),
offsets.cbegin() + 1,
selector_ptr,
begin_bit<key_t>(),
end_bit<key_t>(),
is_overwrite);
}
if (is_overwrite)
{
if (*selector_ptr)
{
std::swap(out_keys, in_keys);
}
REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr));
}
REQUIRE(ref_keys == out_keys);
}
catch (std::bad_alloc& e)
{
std::cerr << "Skipping segmented radix sort test, unsufficient GPU memory. " << e.what() << "\n";
}

# endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT)
#endif // defined(SINGLE_TEST_CASE_INSTANTIATION)
Loading
Loading