Merge branch 'main' into round-up-down

NVIDIA · Jan 14, 2025 · 893a413 · 893a413
2 parents 8fdb01d + 08420d4
commit 893a413
Show file tree

Hide file tree

Showing 130 changed files with 1,924 additions and 2,576 deletions.
diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh
@@ -52,8 +52,6 @@
 
 #include <cuda/std/type_traits>
 
-#include <iterator>
-
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -162,15 +160,15 @@ struct AgentScan
   // Wrap the native input pointer with CacheModifiedInputIterator
   // or directly use the supplied input iterator type
   using WrappedInputIteratorT =
-    ::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
+    ::cuda::std::_If<::cuda::std::is_pointer<InputIteratorT>::value,
                      CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
                      InputIteratorT>;
 
   // Constants
   enum
   {
     // Inclusive scan if no init_value type is provided
-    HAS_INIT     = !std::is_same<InitValueT, NullType>::value,
+    HAS_INIT     = !::cuda::std::is_same<InitValueT, NullType>::value,
     IS_INCLUSIVE = ForceInclusive || !HAS_INIT, // We are relying on either initial value not being `NullType`
                                                 // or the ForceInclusive tag to be true for inclusive scan
                                                 // to get picked up.

diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh
@@ -102,7 +102,7 @@ template <int _BLOCK_THREADS,
           CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER,
           bool _DIRECT_LOAD_NONZEROS,
           BlockScanAlgorithm _SCAN_ALGORITHM>
-struct AgentSpmvPolicy
+struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmvPolicy
 {
   enum
   {
@@ -148,7 +148,12 @@ struct AgentSpmvPolicy
  *   Signed integer type for sequence offsets
  */
 template <typename ValueT, typename OffsetT>
-struct SpmvParams
+struct
+// with NVHPC, we get a deprecation warning in the implementation of cudaLaunchKernelEx, which we cannot suppress :/
+#if !_CCCL_COMPILER(NVHPC)
+  CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead")
+#endif
+    SpmvParams
 {
   /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix
   /// <b>A</b>.
@@ -211,7 +216,7 @@ template <typename AgentSpmvPolicyT,
           bool HAS_ALPHA,
           bool HAS_BETA,
           int LEGACY_PTX_ARCH = 0>
-struct AgentSpmv
+struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
 {
   //---------------------------------------------------------------------
   // Types and constants
@@ -308,7 +313,9 @@ struct AgentSpmv
   /// Reference to temp_storage
   _TempStorage& temp_storage;
 
+  _CCCL_SUPPRESS_DEPRECATED_PUSH
   SpmvParams<ValueT, OffsetT>& spmv_params;
+  _CCCL_SUPPRESS_DEPRECATED_POP
 
   /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements
   /// of matrix <b>A</b>.
@@ -341,6 +348,7 @@ struct AgentSpmv
    * @param spmv_params
    *   SpMV input parameter bundle
    */
+  _CCCL_SUPPRESS_DEPRECATED_PUSH
   _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams<ValueT, OffsetT>& spmv_params)
       : temp_storage(temp_storage.Alias())
       , spmv_params(spmv_params)
@@ -350,6 +358,7 @@ struct AgentSpmv
       , wd_vector_x(spmv_params.d_vector_x)
       , wd_vector_y(spmv_params.d_vector_y)
   {}
+  _CCCL_SUPPRESS_DEPRECATED_POP
 
   /**
    * @brief Consume a merge tile, specialized for direct-load of nonzeros

diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -53,8 +53,6 @@
 
 #include <cuda/std/type_traits>
 
-#include <iterator>
-
 #include <nv/target>
 
 CUB_NAMESPACE_BEGIN

diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh
@@ -85,12 +85,9 @@
 #  endif // CUB_RUNTIME_FUNCTION predefined
 
 #  ifdef CUB_RDC_ENABLED
-// Detect available version of CDP:
-#    if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
-#      define CUB_DETAIL_CDPv1
-#    else
-#      define CUB_DETAIL_CDPv2
-#    endif
+#    ifdef CUDA_FORCE_CDP1_IF_SUPPORTED
+#      error "CUDA Dynamic Parallelism 1 is no longer supported. Please undefine CUDA_FORCE_CDP1_IF_SUPPORTED."
+#    endif // CUDA_FORCE_CDP1_IF_SUPPORTED
 #  endif
 
 #endif // Do not document
diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh
@@ -45,27 +45,7 @@ _CCCL_EXEC_CHECK_DISABLE
 CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
 {
   cudaError_t result = cudaErrorNotSupported;
-
-  // Device-side sync is only available under CDPv1:
-#if defined(CUB_DETAIL_CDPv1)
-
-#  if ((__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
-  // CUDA >= 11.6
-#    define CUB_TMP_DEVICE_SYNC_IMPL result = __cudaDeviceSynchronizeDeprecationAvoidance();
-#  else // CUDA < 11.6:
-#    define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
-#  endif
-
-#else // CDPv2 or no CDP:
-
-#  define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */
-
-#endif // CDP version
-
-  NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), (CUB_TMP_DEVICE_SYNC_IMPL));
-
-#undef CUB_TMP_DEVICE_SYNC_IMPL
-
+  NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ());
   return result;
 }
 

diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh
@@ -45,8 +45,6 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
-#include <iterator>
-
 CUB_NAMESPACE_BEGIN
 
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document

diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh
@@ -129,10 +129,10 @@ private:
     CompareOpT compare_op,
     cudaStream_t stream = 0)
   {
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
+    using ChooseOffsetT = detail::choose_offset_t<OffsetT>;
 
     using DispatchMergeSortT =
-      DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>;
+      DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;
 
     return DispatchMergeSortT::Dispatch(
       d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream);
@@ -374,10 +374,10 @@ public:
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
+    using ChooseOffsetT = detail::choose_offset_t<OffsetT>;
 
     using DispatchMergeSortT =
-      DispatchMergeSort<KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>;
+      DispatchMergeSort<KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;
 
     return DispatchMergeSortT::Dispatch(
       d_temp_storage,
@@ -402,10 +402,10 @@ private:
     CompareOpT compare_op,
     cudaStream_t stream = 0)
   {
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
+    using ChooseOffsetT = detail::choose_offset_t<OffsetT>;
 
     using DispatchMergeSortT =
-      DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, PromotedOffsetT, CompareOpT>;
+      DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, ChooseOffsetT, CompareOpT>;
 
     return DispatchMergeSortT::Dispatch(
       d_temp_storage,
@@ -528,10 +528,10 @@ private:
     CompareOpT compare_op,
     cudaStream_t stream = 0)
   {
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
+    using ChooseOffsetT = detail::choose_offset_t<OffsetT>;
 
     using DispatchMergeSortT =
-      DispatchMergeSort<KeyInputIteratorT, NullType*, KeyIteratorT, NullType*, PromotedOffsetT, CompareOpT>;
+      DispatchMergeSort<KeyInputIteratorT, NullType*, KeyIteratorT, NullType*, ChooseOffsetT, CompareOpT>;
 
     return DispatchMergeSortT::Dispatch(
       d_temp_storage,
@@ -760,9 +760,8 @@ public:
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
 
-    return SortPairsNoNVTX<KeyIteratorT, ValueIteratorT, PromotedOffsetT, CompareOpT>(
+    return SortPairsNoNVTX<KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT>(
       d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream);
   }
 
@@ -860,9 +859,8 @@ public:
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
 
-    return SortKeysNoNVTX<KeyIteratorT, PromotedOffsetT, CompareOpT>(
+    return SortKeysNoNVTX<KeyIteratorT, OffsetT, CompareOpT>(
       d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream);
   }
 
@@ -974,8 +972,7 @@ public:
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
-    using PromotedOffsetT = detail::promote_small_offset_t<OffsetT>;
-    return SortKeysCopyNoNVTX<KeyInputIteratorT, KeyIteratorT, PromotedOffsetT, CompareOpT>(
+    return SortKeysCopyNoNVTX<KeyInputIteratorT, KeyIteratorT, OffsetT, CompareOpT>(
       d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream);
   }
 };