From 5b6bf508e3c74e470579175dd0fa69a9ac984295 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Sun, 26 Jan 2025 23:05:32 -0800
Subject: [PATCH 1/3] adds support for large num items

---
 cub/cub/device/device_merge.cuh      |  18 ++-
 cub/test/catch2_test_device_merge.cu | 226 +++++++++------------------
 2 files changed, 82 insertions(+), 162 deletions(-)
diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh
index 7135546a0e6..814bad75248 100644
--- a/cub/cub/device/device_merge.cuh
+++ b/cub/cub/device/device_merge.cuh
@@ -76,16 +76,19 @@ struct DeviceMerge
     void* d_temp_storage,
     std::size_t& temp_storage_bytes,
     KeyIteratorIn1 keys_in1,
-    int num_keys1,
+    ::cuda::std::int64_t num_keys1,
     KeyIteratorIn2 keys_in2,
-    int num_keys2,
+    ::cuda::std::int64_t num_keys2,
     KeyIteratorOut keys_out,
     CompareOp compare_op = {},
     cudaStream_t stream  = nullptr)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys");
+
+    using offset_t = ::cuda::std::int64_t;
+
     return detail::merge::
-      dispatch_t<KeyIteratorIn1, NullType*, KeyIteratorIn2, NullType*, KeyIteratorOut, NullType*, int, CompareOp>::
+      dispatch_t<KeyIteratorIn1, NullType*, KeyIteratorIn2, NullType*, KeyIteratorOut, NullType*, offset_t, CompareOp>::
         dispatch(
           d_temp_storage,
           temp_storage_bytes,
@@ -161,16 +164,19 @@ struct DeviceMerge
     std::size_t& temp_storage_bytes,
     KeyIteratorIn1 keys_in1,
     ValueIteratorIn1 values_in1,
-    int num_pairs1,
+    ::cuda::std::int64_t num_pairs1,
     KeyIteratorIn2 keys_in2,
     ValueIteratorIn2 values_in2,
-    int num_pairs2,
+    ::cuda::std::int64_t num_pairs2,
     KeyIteratorOut keys_out,
     ValueIteratorOut values_out,
     CompareOp compare_op = {},
     cudaStream_t stream  = nullptr)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs");
+
+    using offset_t = ::cuda::std::int64_t;
+
     return detail::merge::dispatch_t<
       KeyIteratorIn1,
       ValueIteratorIn1,
@@ -178,7 +184,7 @@ struct DeviceMerge
       ValueIteratorIn2,
       KeyIteratorOut,
       ValueIteratorOut,
-      int,
+      offset_t,
       CompareOp>::dispatch(d_temp_storage,
                            temp_storage_bytes,
                            keys_in1,
diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu
index ae0d3f84baa..dfb905bea80 100644
--- a/cub/test/catch2_test_device_merge.cu
+++ b/cub/test/catch2_test_device_merge.cu
@@ -20,103 +20,8 @@
 DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs);
 DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys);
 
-// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different
-// offset types
-namespace detail
-{
-template <typename KeyIteratorIn1,
-          typename KeyIteratorIn2,
-          typename KeyIteratorOut,
-          typename Offset,
-          typename CompareOp = ::cuda::std::less<>>
-CUB_RUNTIME_FUNCTION static cudaError_t merge_keys_custom_offset_type(
-  void* d_temp_storage,
-  std::size_t& temp_storage_bytes,
-  KeyIteratorIn1 keys_in1,
-  Offset num_keys1,
-  KeyIteratorIn2 keys_in2,
-  Offset num_keys2,
-  KeyIteratorOut keys_out,
-  CompareOp compare_op = {},
-  cudaStream_t stream  = 0)
-{
-  CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys");
-  return cub::detail::merge::dispatch_t<
-    KeyIteratorIn1,
-    cub::NullType*,
-    KeyIteratorIn2,
-    cub::NullType*,
-    KeyIteratorOut,
-    cub::NullType*,
-    Offset,
-    CompareOp>::dispatch(d_temp_storage,
-                         temp_storage_bytes,
-                         keys_in1,
-                         nullptr,
-                         num_keys1,
-                         keys_in2,
-                         nullptr,
-                         num_keys2,
-                         keys_out,
-                         nullptr,
-                         compare_op,
-                         stream);
-}
-
-template <typename KeyIteratorIn1,
-          typename ValueIteratorIn1,
-          typename KeyIteratorIn2,
-          typename ValueIteratorIn2,
-          typename KeyIteratorOut,
-          typename ValueIteratorOut,
-          typename Offset,
-          typename CompareOp = ::cuda::std::less<>>
-CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type(
-  void* d_temp_storage,
-  std::size_t& temp_storage_bytes,
-  KeyIteratorIn1 keys_in1,
-  ValueIteratorIn1 values_in1,
-  Offset num_pairs1,
-  KeyIteratorIn2 keys_in2,
-  ValueIteratorIn2 values_in2,
-  Offset num_pairs2,
-  KeyIteratorOut keys_out,
-  ValueIteratorOut values_out,
-  CompareOp compare_op = {},
-  cudaStream_t stream  = 0)
-{
-  CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs");
-  return cub::detail::merge::dispatch_t<
-    KeyIteratorIn1,
-    ValueIteratorIn1,
-    KeyIteratorIn2,
-    ValueIteratorIn2,
-    KeyIteratorOut,
-    ValueIteratorOut,
-    Offset,
-    CompareOp>::dispatch(d_temp_storage,
-                         temp_storage_bytes,
-                         keys_in1,
-                         values_in1,
-                         num_pairs1,
-                         keys_in2,
-                         values_in2,
-                         num_pairs2,
-                         keys_out,
-                         values_out,
-                         compare_op,
-                         stream);
-}
-} // namespace detail
-
-DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type);
-DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type);
-
 using types = c2h::type_list<std::uint8_t, std::int16_t, std::uint32_t, double>;
 
-// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's safe some compile time
-using offset_types = c2h::type_list<std::int32_t, std::int64_t>;
-
 template <typename Key,
           typename Offset,
           typename CompareOp = ::cuda::std::less<Key>,
@@ -172,62 +77,79 @@ struct fallback_test_policy_hub
 };
 
 // TODO(bgruber): This test alone increases compile time from 1m16s to 8m43s. What's going on?
-C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list<large_type_vsmem, large_type_fallb>)
+// C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list<large_type_vsmem,
+// large_type_fallb>)
+// {
+//   using key_t    = c2h::get<0, TestType>;
+//   using offset_t = int;
+
+//   constexpr auto agent_sm = sizeof(key_t) * 128 * 7;
+//   constexpr auto fallback_sm =
+//     sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD;
+//   static_assert(agent_sm > cub::detail::max_smem_per_block,
+//                 "key_t is not big enough to exceed SM and trigger fallback policy");
+//   static_assert(
+//     ::cuda::std::is_same<key_t, large_type_fallb>::value == (fallback_sm <= cub::detail::max_smem_per_block),
+//     "SM consumption by fallback policy should fit into max_smem_per_block");
+
+//   test_keys<key_t, offset_t>(
+//     3623,
+//     6346,
+//     ::cuda::std::less<key_t>{},
+//     [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less<key_t> co) {
+//       using dispatch_t = cub::detail::merge::dispatch_t<
+//         const key_t*,
+//         const cub::NullType*,
+//         const key_t*,
+//         const cub::NullType*,
+//         key_t*,
+//         cub::NullType*,
+//         offset_t,
+//         ::cuda::std::less<key_t>,
+//         fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic
+
+//       std::size_t temp_storage_bytes = 0;
+//       dispatch_t::dispatch(
+//         nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0});
+
+//       c2h::device_vector<char> temp_storage(temp_storage_bytes);
+//       dispatch_t::dispatch(
+//         thrust::raw_pointer_cast(temp_storage.data()),
+//         temp_storage_bytes,
+//         k1,
+//         nullptr,
+//         s1,
+//         k2,
+//         nullptr,
+//         s2,
+//         r,
+//         nullptr,
+//         co,
+//         cudaStream_t{0});
+//     });
+// }
+
+C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]")
+
+try
 {
-  using key_t    = c2h::get<0, TestType>;
-  using offset_t = int;
+  using key_t    = char;
+  using offset_t = int64_t;
 
-  constexpr auto agent_sm = sizeof(key_t) * 128 * 7;
-  constexpr auto fallback_sm =
-    sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD;
-  static_assert(agent_sm > cub::detail::max_smem_per_block,
-                "key_t is not big enough to exceed SM and trigger fallback policy");
-  static_assert(
-    ::cuda::std::is_same<key_t, large_type_fallb>::value == (fallback_sm <= cub::detail::max_smem_per_block),
-    "SM consumption by fallback policy should fit into max_smem_per_block");
-
-  test_keys<key_t, offset_t>(
-    3623,
-    6346,
-    ::cuda::std::less<key_t>{},
-    [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less<key_t> co) {
-      using dispatch_t = cub::detail::merge::dispatch_t<
-        const key_t*,
-        const cub::NullType*,
-        const key_t*,
-        const cub::NullType*,
-        key_t*,
-        cub::NullType*,
-        offset_t,
-        ::cuda::std::less<key_t>,
-        fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic
-
-      std::size_t temp_storage_bytes = 0;
-      dispatch_t::dispatch(
-        nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0});
-
-      c2h::device_vector<char> temp_storage(temp_storage_bytes);
-      dispatch_t::dispatch(
-        thrust::raw_pointer_cast(temp_storage.data()),
-        temp_storage_bytes,
-        k1,
-        nullptr,
-        s1,
-        k2,
-        nullptr,
-        s2,
-        r,
-        nullptr,
-        co,
-        cudaStream_t{0});
-    });
-}
+  // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items
+  const auto num_items_int_max = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
 
-C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types)
+  // Generate the input sizes to test for
+  const offset_t num_items_lhs =
+    GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max - 1, offset_t{3}}));
+  const offset_t num_items_rhs =
+    GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max, offset_t{3}}));
+
+  test_keys<key_t, offset_t>(num_items_lhs, num_items_rhs, ::cuda::std::less<>{});
+}
+catch (const std::bad_alloc&)
 {
-  using key_t    = int;
-  using offset_t = c2h::get<0, TestType>;
-  test_keys<key_t, offset_t>(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type);
+  // allocation failure is not a test failure, so we can run tests on smaller GPUs
 }
 
 C2H_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]")
@@ -385,14 +307,6 @@ C2H_TEST("DeviceMerge::MergePairs value types", "[merge][device]", types)
   test_pairs<key_t, value_t, offset_t>();
 }
 
-C2H_TEST("DeviceMerge::MergePairs offset types", "[merge][device]", offset_types)
-{
-  using key_t    = int;
-  using value_t  = int;
-  using offset_t = c2h::get<0, TestType>;
-  test_pairs<key_t, value_t, offset_t>(3623, 6346, ::cuda::std::less<>{}, merge_pairs_custom_offset_type);
-}
-
 C2H_TEST("DeviceMerge::MergePairs input sizes", "[merge][device]")
 {
   using key_t      = int;
@@ -410,7 +324,7 @@ try
   using key_t     = char;
   using value_t   = char;
   const auto size = std::int64_t{1} << GENERATE(30, 31, 32, 33);
-  test_pairs<key_t, value_t>(size, size, ::cuda::std::less<>{}, merge_pairs_custom_offset_type);
+  test_pairs<key_t, value_t>(size, size, ::cuda::std::less<>{});
 }
 catch (const std::bad_alloc&)
 {

From 8455c0166b54f3318d91097bcff25e70ce1854c8 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 27 Jan 2025 07:41:18 -0800
Subject: [PATCH 2/3] re-enable vsmem tests

---
 cub/test/catch2_test_device_merge.cu | 99 ++++++++++++++--------------
 1 file changed, 49 insertions(+), 50 deletions(-)

diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu
index dfb905bea80..f069042fd2f 100644
--- a/cub/test/catch2_test_device_merge.cu
+++ b/cub/test/catch2_test_device_merge.cu
@@ -77,57 +77,56 @@ struct fallback_test_policy_hub
 };
 
 // TODO(bgruber): This test alone increases compile time from 1m16s to 8m43s. What's going on?
-// C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list<large_type_vsmem,
-// large_type_fallb>)
-// {
-//   using key_t    = c2h::get<0, TestType>;
-//   using offset_t = int;
+C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list<large_type_vsmem, large_type_fallb>)
+{
+  using key_t    = c2h::get<0, TestType>;
+  using offset_t = int;
 
-//   constexpr auto agent_sm = sizeof(key_t) * 128 * 7;
-//   constexpr auto fallback_sm =
-//     sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD;
-//   static_assert(agent_sm > cub::detail::max_smem_per_block,
-//                 "key_t is not big enough to exceed SM and trigger fallback policy");
-//   static_assert(
-//     ::cuda::std::is_same<key_t, large_type_fallb>::value == (fallback_sm <= cub::detail::max_smem_per_block),
-//     "SM consumption by fallback policy should fit into max_smem_per_block");
-
-//   test_keys<key_t, offset_t>(
-//     3623,
-//     6346,
-//     ::cuda::std::less<key_t>{},
-//     [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less<key_t> co) {
-//       using dispatch_t = cub::detail::merge::dispatch_t<
-//         const key_t*,
-//         const cub::NullType*,
-//         const key_t*,
-//         const cub::NullType*,
-//         key_t*,
-//         cub::NullType*,
-//         offset_t,
-//         ::cuda::std::less<key_t>,
-//         fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic
-
-//       std::size_t temp_storage_bytes = 0;
-//       dispatch_t::dispatch(
-//         nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0});
-
-//       c2h::device_vector<char> temp_storage(temp_storage_bytes);
-//       dispatch_t::dispatch(
-//         thrust::raw_pointer_cast(temp_storage.data()),
-//         temp_storage_bytes,
-//         k1,
-//         nullptr,
-//         s1,
-//         k2,
-//         nullptr,
-//         s2,
-//         r,
-//         nullptr,
-//         co,
-//         cudaStream_t{0});
-//     });
-// }
+  constexpr auto agent_sm = sizeof(key_t) * 128 * 7;
+  constexpr auto fallback_sm =
+    sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD;
+  static_assert(agent_sm > cub::detail::max_smem_per_block,
+                "key_t is not big enough to exceed SM and trigger fallback policy");
+  static_assert(
+    ::cuda::std::is_same<key_t, large_type_fallb>::value == (fallback_sm <= cub::detail::max_smem_per_block),
+    "SM consumption by fallback policy should fit into max_smem_per_block");
+
+  test_keys<key_t, offset_t>(
+    3623,
+    6346,
+    ::cuda::std::less<key_t>{},
+    [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less<key_t> co) {
+      using dispatch_t = cub::detail::merge::dispatch_t<
+        const key_t*,
+        const cub::NullType*,
+        const key_t*,
+        const cub::NullType*,
+        key_t*,
+        cub::NullType*,
+        offset_t,
+        ::cuda::std::less<key_t>,
+        fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic
+
+      std::size_t temp_storage_bytes = 0;
+      dispatch_t::dispatch(
+        nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0});
+
+      c2h::device_vector<char> temp_storage(temp_storage_bytes);
+      dispatch_t::dispatch(
+        thrust::raw_pointer_cast(temp_storage.data()),
+        temp_storage_bytes,
+        k1,
+        nullptr,
+        s1,
+        k2,
+        nullptr,
+        s2,
+        r,
+        nullptr,
+        co,
+        cudaStream_t{0});
+    });
+}
 
 C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]")
 

From b4f8a2e5f1d9b0e2dc20e2040112450e28b73aff Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 27 Jan 2025 08:39:09 -0800
Subject: [PATCH 3/3] rephrases test description

---
 cub/test/catch2_test_device_merge.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu
index f069042fd2f..4835f597710 100644
--- a/cub/test/catch2_test_device_merge.cu
+++ b/cub/test/catch2_test_device_merge.cu
@@ -128,7 +128,7 @@ C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_
     });
 }
 
-C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]")
+C2H_TEST("DeviceMerge::MergeKeys works for large number of items", "[merge][device]")
 
 try
 {