From 5b6bf508e3c74e470579175dd0fa69a9ac984295 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sun, 26 Jan 2025 23:05:32 -0800 Subject: [PATCH 1/3] adds support for large num items --- cub/cub/device/device_merge.cuh | 18 ++- cub/test/catch2_test_device_merge.cu | 226 +++++++++------------------ 2 files changed, 82 insertions(+), 162 deletions(-) diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh index 7135546a0e6..814bad75248 100644 --- a/cub/cub/device/device_merge.cuh +++ b/cub/cub/device/device_merge.cuh @@ -76,16 +76,19 @@ struct DeviceMerge void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, - int num_keys1, + ::cuda::std::int64_t num_keys1, KeyIteratorIn2 keys_in2, - int num_keys2, + ::cuda::std::int64_t num_keys2, KeyIteratorOut keys_out, CompareOp compare_op = {}, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + + using offset_t = ::cuda::std::int64_t; + return detail::merge:: - dispatch_t:: + dispatch_t:: dispatch( d_temp_storage, temp_storage_bytes, @@ -161,16 +164,19 @@ struct DeviceMerge std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, ValueIteratorIn1 values_in1, - int num_pairs1, + ::cuda::std::int64_t num_pairs1, KeyIteratorIn2 keys_in2, ValueIteratorIn2 values_in2, - int num_pairs2, + ::cuda::std::int64_t num_pairs2, KeyIteratorOut keys_out, ValueIteratorOut values_out, CompareOp compare_op = {}, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + + using offset_t = ::cuda::std::int64_t; + return detail::merge::dispatch_t< KeyIteratorIn1, ValueIteratorIn1, @@ -178,7 +184,7 @@ struct DeviceMerge ValueIteratorIn2, KeyIteratorOut, ValueIteratorOut, - int, + offset_t, CompareOp>::dispatch(d_temp_storage, temp_storage_bytes, keys_in1, diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu index ae0d3f84baa..dfb905bea80 100644 --- a/cub/test/catch2_test_device_merge.cu +++ b/cub/test/catch2_test_device_merge.cu @@ -20,103 +20,8 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys); -// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different -// offset types -namespace detail -{ -template > -CUB_RUNTIME_FUNCTION static cudaError_t merge_keys_custom_offset_type( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - Offset num_keys1, - KeyIteratorIn2 keys_in2, - Offset num_keys2, - KeyIteratorOut keys_out, - CompareOp compare_op = {}, - cudaStream_t stream = 0) -{ - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); - return cub::detail::merge::dispatch_t< - KeyIteratorIn1, - cub::NullType*, - KeyIteratorIn2, - cub::NullType*, - KeyIteratorOut, - cub::NullType*, - Offset, - CompareOp>::dispatch(d_temp_storage, - temp_storage_bytes, - keys_in1, - nullptr, - num_keys1, - keys_in2, - nullptr, - num_keys2, - keys_out, - nullptr, - compare_op, - stream); -} - -template > -CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - ValueIteratorIn1 values_in1, - Offset num_pairs1, - KeyIteratorIn2 keys_in2, - ValueIteratorIn2 values_in2, - Offset num_pairs2, - KeyIteratorOut keys_out, - ValueIteratorOut values_out, - CompareOp compare_op = {}, - cudaStream_t stream = 0) -{ - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); - return cub::detail::merge::dispatch_t< - KeyIteratorIn1, - ValueIteratorIn1, - KeyIteratorIn2, - ValueIteratorIn2, - KeyIteratorOut, - ValueIteratorOut, - Offset, - CompareOp>::dispatch(d_temp_storage, - temp_storage_bytes, - keys_in1, - values_in1, - num_pairs1, - keys_in2, - values_in2, - num_pairs2, - keys_out, - values_out, - compare_op, - stream); -} -} // namespace detail - -DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type); -DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type); - using types = c2h::type_list; -// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's safe some compile time -using offset_types = c2h::type_list; - template , @@ -172,62 +77,79 @@ struct fallback_test_policy_hub }; // TODO(bgruber): This test alone increases compile time from 1m16s to 8m43s. What's going on? -C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list) +// C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list) +// { +// using key_t = c2h::get<0, TestType>; +// using offset_t = int; + +// constexpr auto agent_sm = sizeof(key_t) * 128 * 7; +// constexpr auto fallback_sm = +// sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD; +// static_assert(agent_sm > cub::detail::max_smem_per_block, +// "key_t is not big enough to exceed SM and trigger fallback policy"); +// static_assert( +// ::cuda::std::is_same::value == (fallback_sm <= cub::detail::max_smem_per_block), +// "SM consumption by fallback policy should fit into max_smem_per_block"); + +// test_keys( +// 3623, +// 6346, +// ::cuda::std::less{}, +// [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less co) { +// using dispatch_t = cub::detail::merge::dispatch_t< +// const key_t*, +// const cub::NullType*, +// const key_t*, +// const cub::NullType*, +// key_t*, +// cub::NullType*, +// offset_t, +// ::cuda::std::less, +// fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic + +// std::size_t temp_storage_bytes = 0; +// dispatch_t::dispatch( +// nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0}); + +// c2h::device_vector temp_storage(temp_storage_bytes); +// dispatch_t::dispatch( +// thrust::raw_pointer_cast(temp_storage.data()), +// temp_storage_bytes, +// k1, +// nullptr, +// s1, +// k2, +// nullptr, +// s2, +// r, +// nullptr, +// co, +// cudaStream_t{0}); +// }); +// } + +C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]") + +try { - using key_t = c2h::get<0, TestType>; - using offset_t = int; + using key_t = char; + using offset_t = int64_t; - constexpr auto agent_sm = sizeof(key_t) * 128 * 7; - constexpr auto fallback_sm = - sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD; - static_assert(agent_sm > cub::detail::max_smem_per_block, - "key_t is not big enough to exceed SM and trigger fallback policy"); - static_assert( - ::cuda::std::is_same::value == (fallback_sm <= cub::detail::max_smem_per_block), - "SM consumption by fallback policy should fit into max_smem_per_block"); - - test_keys( - 3623, - 6346, - ::cuda::std::less{}, - [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less co) { - using dispatch_t = cub::detail::merge::dispatch_t< - const key_t*, - const cub::NullType*, - const key_t*, - const cub::NullType*, - key_t*, - cub::NullType*, - offset_t, - ::cuda::std::less, - fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic - - std::size_t temp_storage_bytes = 0; - dispatch_t::dispatch( - nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0}); - - c2h::device_vector temp_storage(temp_storage_bytes); - dispatch_t::dispatch( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - k1, - nullptr, - s1, - k2, - nullptr, - s2, - r, - nullptr, - co, - cudaStream_t{0}); - }); -} + // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items + const auto num_items_int_max = static_cast(::cuda::std::numeric_limits::max()); -C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types) + // Generate the input sizes to test for + const offset_t num_items_lhs = + GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max - 1, offset_t{3}})); + const offset_t num_items_rhs = + GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max, offset_t{3}})); + + test_keys(num_items_lhs, num_items_rhs, ::cuda::std::less<>{}); +} +catch (const std::bad_alloc&) { - using key_t = int; - using offset_t = c2h::get<0, TestType>; - test_keys(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type); + // allocation failure is not a test failure, so we can run tests on smaller GPUs } C2H_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]") @@ -385,14 +307,6 @@ C2H_TEST("DeviceMerge::MergePairs value types", "[merge][device]", types) test_pairs(); } -C2H_TEST("DeviceMerge::MergePairs offset types", "[merge][device]", offset_types) -{ - using key_t = int; - using value_t = int; - using offset_t = c2h::get<0, TestType>; - test_pairs(3623, 6346, ::cuda::std::less<>{}, merge_pairs_custom_offset_type); -} - C2H_TEST("DeviceMerge::MergePairs input sizes", "[merge][device]") { using key_t = int; @@ -410,7 +324,7 @@ try using key_t = char; using value_t = char; const auto size = std::int64_t{1} << GENERATE(30, 31, 32, 33); - test_pairs(size, size, ::cuda::std::less<>{}, merge_pairs_custom_offset_type); + test_pairs(size, size, ::cuda::std::less<>{}); } catch (const std::bad_alloc&) { From 8455c0166b54f3318d91097bcff25e70ce1854c8 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 27 Jan 2025 07:41:18 -0800 Subject: [PATCH 2/3] re-enable vsmem tests --- cub/test/catch2_test_device_merge.cu | 99 ++++++++++++++-------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu index dfb905bea80..f069042fd2f 100644 --- a/cub/test/catch2_test_device_merge.cu +++ b/cub/test/catch2_test_device_merge.cu @@ -77,57 +77,56 @@ struct fallback_test_policy_hub }; // TODO(bgruber): This test alone increases compile time from 1m16s to 8m43s. What's going on? -// C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list) -// { -// using key_t = c2h::get<0, TestType>; -// using offset_t = int; +C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list) +{ + using key_t = c2h::get<0, TestType>; + using offset_t = int; -// constexpr auto agent_sm = sizeof(key_t) * 128 * 7; -// constexpr auto fallback_sm = -// sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD; -// static_assert(agent_sm > cub::detail::max_smem_per_block, -// "key_t is not big enough to exceed SM and trigger fallback policy"); -// static_assert( -// ::cuda::std::is_same::value == (fallback_sm <= cub::detail::max_smem_per_block), -// "SM consumption by fallback policy should fit into max_smem_per_block"); - -// test_keys( -// 3623, -// 6346, -// ::cuda::std::less{}, -// [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less co) { -// using dispatch_t = cub::detail::merge::dispatch_t< -// const key_t*, -// const cub::NullType*, -// const key_t*, -// const cub::NullType*, -// key_t*, -// cub::NullType*, -// offset_t, -// ::cuda::std::less, -// fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic - -// std::size_t temp_storage_bytes = 0; -// dispatch_t::dispatch( -// nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0}); - -// c2h::device_vector temp_storage(temp_storage_bytes); -// dispatch_t::dispatch( -// thrust::raw_pointer_cast(temp_storage.data()), -// temp_storage_bytes, -// k1, -// nullptr, -// s1, -// k2, -// nullptr, -// s2, -// r, -// nullptr, -// co, -// cudaStream_t{0}); -// }); -// } + constexpr auto agent_sm = sizeof(key_t) * 128 * 7; + constexpr auto fallback_sm = + sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD; + static_assert(agent_sm > cub::detail::max_smem_per_block, + "key_t is not big enough to exceed SM and trigger fallback policy"); + static_assert( + ::cuda::std::is_same::value == (fallback_sm <= cub::detail::max_smem_per_block), + "SM consumption by fallback policy should fit into max_smem_per_block"); + + test_keys( + 3623, + 6346, + ::cuda::std::less{}, + [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less co) { + using dispatch_t = cub::detail::merge::dispatch_t< + const key_t*, + const cub::NullType*, + const key_t*, + const cub::NullType*, + key_t*, + cub::NullType*, + offset_t, + ::cuda::std::less, + fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic + + std::size_t temp_storage_bytes = 0; + dispatch_t::dispatch( + nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0}); + + c2h::device_vector temp_storage(temp_storage_bytes); + dispatch_t::dispatch( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + k1, + nullptr, + s1, + k2, + nullptr, + s2, + r, + nullptr, + co, + cudaStream_t{0}); + }); +} C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]") From b4f8a2e5f1d9b0e2dc20e2040112450e28b73aff Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 27 Jan 2025 08:39:09 -0800 Subject: [PATCH 3/3] rephrases test description --- cub/test/catch2_test_device_merge.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu index f069042fd2f..4835f597710 100644 --- a/cub/test/catch2_test_device_merge.cu +++ b/cub/test/catch2_test_device_merge.cu @@ -128,7 +128,7 @@ C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_ }); } -C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]") +C2H_TEST("DeviceMerge::MergeKeys works for large number of items", "[merge][device]") try {