Commit
Fix identifier naming convention
bernhardmgruber committed Jun 10, 2024
1 parent 734a3f6 commit 253b8fb
Showing 3 changed files with 178 additions and 197 deletions.
191 changes: 90 additions & 101 deletions cub/cub/agent/agent_merge.cuh
@@ -27,57 +27,47 @@
CUB_NAMESPACE_BEGIN
namespace detail
{
-template <int _BLOCK_THREADS,
-int _ITEMS_PER_THREAD,
-BlockLoadAlgorithm _LOAD_ALGORITHM,
-CacheLoadModifier _LOAD_MODIFIER,
-BlockStoreAlgorithm _STORE_ALGORITHM>
-struct AgentMergePolicy
-{
-static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
-static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
-static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
-static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
-static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-};
-
-/**
- * \brief This agent is responsible for partitioning a merge path into equal segments
- *
- * There are two sorted arrays to be merged into one array. If the first array
- * is partitioned between parallel workers by slicing it into ranges of equal
- * size, there could be a significant workload imbalance. The imbalance is
- * caused by the fact that the distribution of elements from the second array
- * is unknown beforehand. Instead, the MergePath is partitioned between workers.
- * This approach guarantees that an equal amount of work is assigned to each worker.
- *
- * This approach is outlined in the paper:
- * Odeh et al., "Merge Path - Parallel Merging Made Simple"
- * doi:10.1109/IPDPSW.2012.202
- */
-// TODO(bgruber): can we unify this one with AgentPartition in agent_merge_sort.cuh?
-template <typename KeysIt1, typename KeysIt2, typename OffsetT, typename CompareOp>
+// This agent computes the merge path intersections at equally wide intervals. The approach is outlined in the paper:
+// Odeh et al., "Merge Path - Parallel Merging Made Simple", doi:10.1109/IPDPSW.2012.202
+// The algorithm is the same as AgentPartition for merge sort, but that agent handles a lot more.
+template <typename KeysIt1, typename KeysIt2, typename Offset, typename CompareOp, int items_per_tile>
struct AgentPartitionMergePath
{
KeysIt1 keys1;
-OffsetT keys1_count;
+Offset keys1_count;
KeysIt2 keys2;
-OffsetT keys2_count;
-OffsetT* merge_partitions;
+Offset keys2_count;
+Offset* merge_partitions;
CompareOp compare_op;
-int items_per_tile;
-OffsetT partition_idx;
+Offset partition_idx;

-_CCCL_DEVICE _CCCL_FORCEINLINE void Process()
+// TODO(bgruber): I honestly would just inline this agent into the kernel, because it is short and non-public
+_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
{
-const OffsetT partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count);
-const OffsetT partition_diag = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op);
-merge_partitions[partition_idx] = partition_diag;
+const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count);
+merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op);
}
};
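
For reference, and not part of this commit, the following is a minimal sequential sketch of the diagonal binary search that cub::MergePath performs; the standalone name merge_path and the int bounds are illustrative assumptions. Given a diagonal diag, it returns how many elements the merge takes from the first array, so that merging a[0..i) with b[0..diag-i) yields the first diag elements of the output; equally spaced diagonals therefore give every worker the same amount of output to produce.

// Sketch only: sequential equivalent of the merge path diagonal search.
template <typename T, typename CompareOp>
int merge_path(const T* a, int a_count, const T* b, int b_count, int diag, CompareOp compare_op)
{
  int lo = diag > b_count ? diag - b_count : 0; // at least diag - b_count elements must come from a
  int hi = diag < a_count ? diag : a_count;     // at most min(diag, a_count) elements can come from a
  while (lo < hi)
  {
    const int mid = lo + (hi - lo) / 2;
    if (compare_op(b[diag - 1 - mid], a[mid]))
      hi = mid;     // b[diag - 1 - mid] < a[mid]: taking mid elements from a already overshoots
    else
      lo = mid + 1; // otherwise the split point lies further into a
  }
  return lo;
}
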

+template <int ThreadsPerBlock,
+int ItemsPerThread,
+BlockLoadAlgorithm LoadAlgorithm,
+CacheLoadModifier LoadCacheModifier,
+BlockStoreAlgorithm StoreAlgorithm>
+struct agent_merge_no_sort_policy
+{
+static constexpr int threads_per_block = ThreadsPerBlock;
+static constexpr int items_per_thread = ItemsPerThread;
+static constexpr int items_per_tile = threads_per_block * items_per_thread;
+
+// for BlockLoad
+static constexpr int BLOCK_THREADS = threads_per_block;
+static constexpr int ITEMS_PER_THREAD = items_per_thread;
+static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
+static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
+static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
+};
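
For illustration only, an instantiation of this policy might look as follows; the tuning values are invented here, not taken from this commit:

using example_merge_policy =
  agent_merge_no_sort_policy<256, // ThreadsPerBlock
                             11, // ItemsPerThread
                             BLOCK_LOAD_WARP_TRANSPOSE,
                             LOAD_DEFAULT,
                             BLOCK_STORE_WARP_TRANSPOSE>;
// example_merge_policy::items_per_tile == 256 * 11 == 2816.
// The duplicated UPPER_CASE members above exist so that the BlockLoadType and
// BlockStoreType helpers, which still read the legacy names, accept this policy.
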

// TODO(bgruber): can we unify this one with AgentMerge in agent_merge_sort.cuh?
template <typename Policy,
typename KeysIt1,
@@ -88,67 +78,65 @@ template <typename Policy,
typename ItemsOutputIt,
typename Offset,
typename CompareOp>
-struct AgentMergeNoSort
+struct agent_merge_no_sort
{
using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;

-static constexpr bool HaveItems = !std::is_same<item_type, NullType>::value;

-using KeysLoadIt1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
-using KeysLoadIt2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
-using ItemsLoadIt1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
-using ItemsLoadIt2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
+using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
+using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
+using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
+using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;

-using BlockLoadKeys1 = typename BlockLoadType<Policy, KeysLoadIt1>::type;
-using BlockLoadKeys2 = typename BlockLoadType<Policy, KeysLoadIt2>::type;
-using BlockLoadItems1 = typename BlockLoadType<Policy, ItemsLoadIt1>::type;
-using BlockLoadItems2 = typename BlockLoadType<Policy, ItemsLoadIt2>::type;
+using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
+using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
+using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
+using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;

-using BlockStoreKeys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
-using BlockStoreItems = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
+using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
+using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;

-union _TempStorage
+union temp_storages
{
-typename BlockLoadKeys1::TempStorage load_keys1;
-typename BlockLoadKeys2::TempStorage load_keys2;
-typename BlockLoadItems1::TempStorage load_items1;
-typename BlockLoadItems2::TempStorage load_items2;
-typename BlockStoreKeys::TempStorage store_keys;
-typename BlockStoreItems::TempStorage store_items;
-
-item_type items_shared[Policy::ITEMS_PER_TILE + 1];
-key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
+typename block_load_keys1::TempStorage load_keys1;
+typename block_load_keys2::TempStorage load_keys2;
+typename block_load_items1::TempStorage load_items1;
+typename block_load_items2::TempStorage load_items2;
+typename block_store_keys::TempStorage store_keys;
+typename block_store_items::TempStorage store_items;
+
+item_type items_shared[Policy::items_per_tile + 1];
+key_type keys_shared[Policy::items_per_tile + 1];
};

-struct TempStorage : Uninitialized<_TempStorage>
+struct TempStorage : Uninitialized<temp_storages>
{};

-static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
-static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
-static constexpr Offset ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
+static constexpr int items_per_thread = Policy::items_per_thread;
+static constexpr int threads_per_block = Policy::threads_per_block;
+static constexpr Offset items_per_tile = Policy::items_per_tile;

// Per thread data
-_TempStorage& storage;
-KeysLoadIt1 keys1_in;
-ItemsLoadIt1 items1_in;
+temp_storages& storage;
+keys_load_it1 keys1_in;
+items_load_it1 items1_in;
Offset keys1_count;
-KeysLoadIt2 keys2_in;
-ItemsLoadIt2 items2_in;
+keys_load_it2 keys2_in;
+items_load_it2 items2_in;
Offset keys2_count;
KeysOutputIt keys_out;
ItemsOutputIt items_out;
CompareOp compare_op;
Offset* merge_partitions;

-template <bool IS_FULL_TILE>
+template <bool IsFullTile>
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
{
const Offset partition_beg = merge_partitions[tile_idx + 0];
const Offset partition_end = merge_partitions[tile_idx + 1];

-const Offset diag0 = ITEMS_PER_TILE * tile_idx;
-const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE);
+const Offset diag0 = items_per_tile * tile_idx;
+const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile);

// compute bounding box for keys1 & keys2
const Offset keys1_beg = partition_beg;
@@ -160,14 +148,15 @@ struct AgentMergeNoSort
const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);

-key_type keys_loc[ITEMS_PER_THREAD];
-gmem_to_reg<BLOCK_THREADS, IS_FULL_TILE>(keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
-reg_to_shared<BLOCK_THREADS>(&storage.keys_shared[0], keys_loc);
+key_type keys_loc[items_per_thread];
+gmem_to_reg<threads_per_block, IsFullTile>(
+keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
+reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
CTA_SYNC();

// use binary search in shared memory to find the merge path for each thread.
// we can use int type here, because the number of items in shared memory is limited
-const int diag0_loc = min<int>(num_keys1 + num_keys2, ITEMS_PER_THREAD * threadIdx.x);
+const int diag0_loc = min<int>(num_keys1 + num_keys2, items_per_thread * threadIdx.x);

const int keys1_beg_loc =
MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
@@ -179,7 +168,7 @@
const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;

// perform serial merge
-int indices[ITEMS_PER_THREAD];
+int indices[items_per_thread];
cub::SerialMerge(
&storage.keys_shared[0],
keys1_beg_loc,
@@ -192,62 +181,62 @@
CTA_SYNC();

// write keys
-if (IS_FULL_TILE)
+if (IsFullTile)
{
-BlockStoreKeys(storage.store_keys).Store(keys_out + tile_base, keys_loc);
+block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
}
else
{
-BlockStoreKeys(storage.store_keys).Store(keys_out + tile_base, keys_loc, num_remaining);
+block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc, num_remaining);
}

// if items are provided, merge them
-_CCCL_IF_CONSTEXPR (HaveItems)
+static constexpr bool have_items = !std::is_same<item_type, NullType>::value;
+_CCCL_IF_CONSTEXPR (have_items)
{
-item_type items_loc[ITEMS_PER_THREAD];
-gmem_to_reg<BLOCK_THREADS, IS_FULL_TILE>(
+item_type items_loc[items_per_thread];
+gmem_to_reg<threads_per_block, IsFullTile>(
items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
CTA_SYNC(); // TODO(bgruber): there is no sync when loading keys. bug or unnecessary?
-reg_to_shared<BLOCK_THREADS>(&storage.items_shared[0], items_loc);
+reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
CTA_SYNC();

// gather items from shared mem
#pragma unroll
-for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+for (int i = 0; i < items_per_thread; ++i)
{
-items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+items_loc[i] = storage.items_shared[indices[i]];
}
CTA_SYNC();

// write from reg to gmem
-if (IS_FULL_TILE)
+if (IsFullTile)
{
-BlockStoreItems(storage.store_items).Store(items_out + tile_base, items_loc);
+block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
}
else
{
-BlockStoreItems(storage.store_items).Store(items_out + tile_base, items_loc, num_remaining);
+block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc, num_remaining);
}
}
}

-_CCCL_DEVICE _CCCL_FORCEINLINE void Process()
+_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
{
// XXX with 8.5 changing type to Offset (or long long) results in error!
// TODO(bgruber): is the above still true?
const int tile_idx = static_cast<int>(blockIdx.x);
-const Offset tile_base = tile_idx * ITEMS_PER_TILE;
+const Offset tile_base = tile_idx * items_per_tile;
// TODO(bgruber): random mixing of int and Offset
const int items_in_tile =
-static_cast<int>(cub::min(static_cast<Offset>(ITEMS_PER_TILE), keys1_count + keys2_count - tile_base));
-if (items_in_tile == ITEMS_PER_TILE)
+static_cast<int>(cub::min(static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
+if (items_in_tile == items_per_tile)
{
-// full tile
-consume_tile<true>(tile_idx, tile_base, ITEMS_PER_TILE);
+consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile
}
else
{
-// partial tile
-consume_tile<false>(tile_idx, tile_base, items_in_tile);
+consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile
}
}
};
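
To show how the two agents cooperate, here is a hypothetical launch sketch; the kernel name and launch logic are invented for illustration and the real dispatch layer is not part of this diff. With num_tiles output tiles, num_tiles + 1 split points are computed first (one per equally spaced diagonal, including both ends), and the merge agent then runs as one thread block per tile, reading merge_partitions[tile_idx] and merge_partitions[tile_idx + 1].

// Hypothetical sketch, assuming the surrounding cub::detail namespace:
// one thread per merge path split point.
template <typename KeysIt1, typename KeysIt2, typename Offset, typename CompareOp, int ItemsPerTile>
__global__ void partition_sketch_kernel(
  KeysIt1 keys1, Offset keys1_count, KeysIt2 keys2, Offset keys2_count,
  Offset* merge_partitions, Offset num_partitions, CompareOp compare_op)
{
  const Offset idx = static_cast<Offset>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < num_partitions)
  {
    // aggregate-initialize the agent in member order, then invoke it
    AgentPartitionMergePath<KeysIt1, KeysIt2, Offset, CompareOp, ItemsPerTile>{
      keys1, keys1_count, keys2, keys2_count, merge_partitions, compare_op, idx}();
  }
}
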
