Skip to content

Commit

Permalink
Merge branch 'main' into review-cub-util-ptx
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Jan 13, 2025
2 parents 0af70fd + c339a52 commit 8d44adb
Show file tree
Hide file tree
Showing 80 changed files with 694 additions and 1,808 deletions.
6 changes: 2 additions & 4 deletions cub/cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -162,15 +160,15 @@ struct AgentScan
// Wrap the native input pointer with CacheModifiedInputIterator
// or directly use the supplied input iterator type
using WrappedInputIteratorT =
::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
::cuda::std::_If<::cuda::std::is_pointer<InputIteratorT>::value,
CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
InputIteratorT>;

// Constants
enum
{
// Inclusive scan if no init_value type is provided
HAS_INIT = !std::is_same<InitValueT, NullType>::value,
HAS_INIT = !::cuda::std::is_same<InitValueT, NullType>::value,
IS_INCLUSIVE = ForceInclusive || !HAS_INIT, // We are relying on either initial value not being `NullType`
// or the ForceInclusive tag to be true for inclusive scan
// to get picked up.
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@

#include <cuda/std/type_traits>

#include <iterator>

#include <nv/target>

CUB_NAMESPACE_BEGIN
Expand Down
9 changes: 3 additions & 6 deletions cub/cub/detail/detect_cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,9 @@
# endif // CUB_RUNTIME_FUNCTION predefined

# ifdef CUB_RDC_ENABLED
// Detect available version of CDP:
# if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
# define CUB_DETAIL_CDPv1
# else
# define CUB_DETAIL_CDPv2
# endif
# ifdef CUDA_FORCE_CDP1_IF_SUPPORTED
# error "CUDA Dynamic Parallelism 1 is no longer supported. Please undefine CUDA_FORCE_CDP1_IF_SUPPORTED."
# endif // CUDA_FORCE_CDP1_IF_SUPPORTED
# endif

#endif // Do not document
22 changes: 1 addition & 21 deletions cub/cub/detail/device_synchronize.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,27 +45,7 @@ _CCCL_EXEC_CHECK_DISABLE
// Synchronizes with the device: on the host path this calls
// cudaDeviceSynchronize(); device-side behavior depends on the CDP
// configuration handled below. Returns the resulting cudaError_t
// (cudaErrorNotSupported when no sync path is available).
CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
{
// Default: reported when no branch below overwrites it.
cudaError_t result = cudaErrorNotSupported;

// Device-side sync is only available under CDPv1:
#if defined(CUB_DETAIL_CDPv1)

# if ((__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
// CUDA >= 11.6
// Use the deprecation-avoidance alias for device-side synchronization.
# define CUB_TMP_DEVICE_SYNC_IMPL result = __cudaDeviceSynchronizeDeprecationAvoidance();
# else // CUDA < 11.6:
# define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
# endif

#else // CDPv2 or no CDP:

# define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */

#endif // CDP version

NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), (CUB_TMP_DEVICE_SYNC_IMPL));

#undef CUB_TMP_DEVICE_SYNC_IMPL

// NOTE(review): this span renders both sides of a diff — the legacy
// CDP-gated implementation above was removed by the commit and only the
// following host-only NV_IF_TARGET was added. Confirm the merged source
// keeps exactly one sync path; as rendered, the host would sync twice.
NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ());
return result;
}

Expand Down
2 changes: 0 additions & 2 deletions cub/cub/detail/strong_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <iterator>

CUB_NAMESPACE_BEGIN

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down
25 changes: 11 additions & 14 deletions cub/cub/device/device_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ private:
CompareOpT compare_op,
cudaStream_t stream = 0)
{
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream);
Expand Down Expand Up @@ -374,10 +374,10 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage,
Expand All @@ -402,10 +402,10 @@ private:
CompareOpT compare_op,
cudaStream_t stream = 0)
{
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -528,10 +528,10 @@ private:
CompareOpT compare_op,
cudaStream_t stream = 0)
{
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

using DispatchMergeSortT =
DispatchMergeSort<KeyInputIteratorT, NullType*, KeyIteratorT, NullType*, PromotedOffsetT, CompareOpT>;
DispatchMergeSort<KeyInputIteratorT, NullType*, KeyIteratorT, NullType*, ChooseOffsetT, CompareOpT>;

return DispatchMergeSortT::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -760,9 +760,8 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;

return SortPairsNoNVTX<KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>(
return SortPairsNoNVTX<KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT>(
d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream);
}

Expand Down Expand Up @@ -860,9 +859,8 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;

return SortKeysNoNVTX<KeyIteratorT, PromotedOffsetT, CompareOpT>(
return SortKeysNoNVTX<KeyIteratorT, OffsetT, CompareOpT>(
d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream);
}

Expand Down Expand Up @@ -974,8 +972,7 @@ public:
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
return SortKeysCopyNoNVTX<KeyInputIteratorT, KeyIteratorT, PromotedOffsetT, CompareOpT>(
return SortKeysCopyNoNVTX<KeyInputIteratorT, KeyIteratorT, OffsetT, CompareOpT>(
d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream);
}
};
Expand Down
144 changes: 3 additions & 141 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

#include <cub/config.cuh>

#include <cub/util_namespace.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand All @@ -46,6 +48,7 @@
#endif // no system header

#include <cub/agent/agent_scan.cuh>
#include <cub/device/dispatch/kernels/scan.cuh>
#include <cub/device/dispatch/tuning/tuning_scan.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
Expand All @@ -57,149 +60,8 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
* Kernel entry points
*****************************************************************************/

/**
* @brief Initialization kernel for tile status initialization (multi-block)
*
* @tparam ScanTileStateT
* Tile status interface type
*
* @param[in] tile_state
* Tile status interface
*
* @param[in] num_tiles
* Number of tiles
*/
// Grid launch that prepares the tile-status descriptors consumed by the
// decoupled look-back scan; all work is delegated to
// ScanTileStateT::InitializeStatus.
template <typename ScanTileStateT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
{
// Initialize tile status for `num_tiles` tiles
tile_state.InitializeStatus(num_tiles);
}

/**
* Initialization kernel for tile status initialization (multi-block)
*
* @tparam ScanTileStateT
* Tile status interface type
*
* @tparam NumSelectedIteratorT
* Output iterator type for recording the number of items selected
*
* @param[in] tile_state
* Tile status interface
*
* @param[in] num_tiles
* Number of tiles
*
* @param[out] d_num_selected_out
* Pointer to the total number of items selected
* (i.e., length of `d_selected_out`)
*/
// Initialization kernel (multi-block) that prepares the tile-status
// descriptors and additionally zeroes the selected-item counter.
//
// @param[in]  tile_state          Tile status interface
// @param[in]  num_tiles           Number of tiles to initialize
// @param[out] d_num_selected_out  Pointer to the total number of selected items
template <typename ScanTileStateT, typename NumSelectedIteratorT>
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
{
  // Exactly one thread of the grid clears the output counter
  if (threadIdx.x == 0 && blockIdx.x == 0)
  {
    *d_num_selected_out = 0;
  }

  // All threads cooperate on initializing the tile status descriptors
  tile_state.InitializeStatus(num_tiles);
}

/**
* @brief Scan kernel entry point (multi-block)
*
*
* @tparam ChainedPolicyT
* Chained tuning policy
*
* @tparam InputIteratorT
* Random-access input iterator type for reading scan inputs @iterator
*
* @tparam OutputIteratorT
* Random-access output iterator type for writing scan outputs @iterator
*
* @tparam ScanTileStateT
* Tile status interface type
*
* @tparam ScanOpT
* Binary scan functor type having member
* `auto operator()(const T &a, const U &b)`
*
* @tparam InitValueT
* Initial value to seed the exclusive scan
* (cub::NullType for inclusive scans)
*
* @tparam OffsetT
* Unsigned integer type for global offsets
*
* @param[in] d_in
*   Input data
*
* @param[out] d_out
*   Output data
*
* @param[in] tile_state
*   Tile status interface
*
* @param[in] start_tile
*   The starting tile for the current grid
*
* @param[in] scan_op
*   Binary scan functor
*
* @param[in] init_value
*   Initial value to seed the exclusive scan
*
* @param[in] num_items
*   Total number of scan items for the entire problem
*/
// Scan kernel entry point (multi-block): each thread block consumes one or
// more input tiles via AgentScan, beginning at `start_tile`.
//
// NOTE(review): InitValueT is expected to expose a nested `value_type`
// (a detail init-value wrapper), since the raw seed is unwrapped below.
template <typename ChainedPolicyT,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename ScanTileStateT,
          typename ScanOpT,
          typename InitValueT,
          typename OffsetT,
          typename AccumT,
          bool ForceInclusive>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
  InputIteratorT d_in,
  OutputIteratorT d_out,
  ScanTileStateT tile_state,
  int start_tile,
  ScanOpT scan_op,
  InitValueT init_value,
  OffsetT num_items)
{
  // Active tuning policy selected by the chained policy for this launch
  using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT;

  // The init-value wrapper carries the actual seed type
  using RealInitValueT = typename InitValueT::value_type;

  // Thread-block agent that performs the tile-wise scan
  using AgentScanT =
    AgentScan<ScanPolicyT, InputIteratorT, OutputIteratorT, ScanOpT, RealInitValueT, OffsetT, AccumT, ForceInclusive>;

  // Shared memory for AgentScan
  __shared__ typename AgentScanT::TempStorage smem_storage;

  // Unwrap the initial value before handing it to the agent
  RealInitValueT unwrapped_init = init_value;

  // Process the assigned range of tiles
  AgentScanT agent(smem_storage, d_in, d_out, scan_op, unwrapped_init);
  agent.ConsumeRange(num_items, tile_state, start_tile);
}

/******************************************************************************
* Dispatch
******************************************************************************/
Expand Down
Loading

0 comments on commit 8d44adb

Please sign in to comment.