Merge branch 'main' into merge-sort-dynamic-cub-dispatch

NVIDIA · Jan 29, 2025 · a3602cc · a3602cc
2 parents 4cbee12 + d21e0c9
commit a3602cc
Show file tree

Hide file tree

Showing 268 changed files with 59,109 additions and 3,628 deletions.
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
@@ -19,8 +19,9 @@ workflows:
     - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
-    - {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']}
-    - {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
+    - {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']}
+    # Disabled until we figure out the issue with the TBB dll
+    #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']}
     # Split up cub tests:
     - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']}
     - {jobs: ['test_lid1',  'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
@@ -256,13 +257,13 @@ projects:
 
 # testing -> Runner with GPU is in a nv-gh-runners testing pool
 gpus:
-  v100:     { sm: 70 }                # 32 GB,  40 runners
-  t4:       { sm: 75, testing: true } # 16 GB,   8 runners
-  rtx2080:  { sm: 75, testing: true } #  8 GB,   8 runners
-  rtxa6000: { sm: 86, testing: true } # 48 GB,  12 runners
-  l4:       { sm: 89, testing: true } # 24 GB,  48 runners
-  rtx4090:  { sm: 89, testing: true } # 24 GB,  10 runners
-  h100:     { sm: 90, testing: true } # 80 GB,  16 runners
+  v100:     { sm: 70 } # 32 GB,  40 runners
+  t4:       { sm: 75 } # 16 GB,  10 runners
+  rtx2080:  { sm: 75 } #  8 GB,  12 runners
+  rtxa6000: { sm: 86 } # 48 GB,  12 runners
+  l4:       { sm: 89 } # 24 GB,  48 runners
+  rtx4090:  { sm: 89 } # 24 GB,  10 runners
+  h100:     { sm: 90 } # 80 GB,  16 runners
 
 # Tags are used to define a `matrix job` in the workflow section.
 #

diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu
@@ -35,7 +35,7 @@
 #if !TUNE_BASE
 struct policy_hub_t
 {
-  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
   {
     using AdjacentDifferencePolicy =
       cub::AgentAdjacentDifferencePolicy<TUNE_THREADS_PER_BLOCK,
@@ -45,7 +45,7 @@ struct policy_hub_t
                                          cub::BLOCK_STORE_WARP_TRANSPOSE>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 #endif // !TUNE_BASE
 

diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu
@@ -118,7 +118,7 @@ using block_delay_constructor_t =
 
 struct policy_hub_t
 {
-  struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
+  struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t>
   {
     using AgentSmallBufferPolicyT = cub::detail::AgentBatchMemcpyPolicy<
       TUNE_THREADS,

diff --git a/cub/benchmarks/bench/histogram/histogram_common.cuh b/cub/benchmarks/bench/histogram/histogram_common.cuh
@@ -62,7 +62,7 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND;
 template <typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS>
 struct policy_hub_t
 {
-  struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
+  struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t>
   {
     static constexpr cub::BlockLoadAlgorithm load_algorithm =
       (TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED)

diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
@@ -111,8 +111,6 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
   using output_it_t                          = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<T>;
   using dispatch_t = cub::DispatchSelectIf<
     input_it_t,
     flag_it_t,
@@ -122,20 +120,12 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
     equality_op_t,
     offset_t,
     keep_rejects,
-    may_alias,
-    policy_t>;
-#else // TUNE_BASE
-  using dispatch_t = cub::DispatchSelectIf<
-    input_it_t,
-    flag_it_t,
-    output_it_t,
-    num_selected_it_t,
-    select_op_t,
-    equality_op_t,
-    offset_t,
-    keep_rejects,
-    may_alias>;
+    may_alias
+#if !TUNE_BASE
+    ,
+    policy_hub_t<T>
 #endif // TUNE_BASE
+    >;
 
   // Retrieve axis parameters
   const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));

diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu
@@ -137,8 +137,6 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
   using output_it_t                          = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<T>;
   using dispatch_t = cub::DispatchSelectIf<
     input_it_t,
     flag_it_t,
@@ -148,20 +146,12 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
     equality_op_t,
     offset_t,
     keep_rejects,
-    may_alias,
-    policy_t>;
-#else // TUNE_BASE
-  using dispatch_t = cub::DispatchSelectIf<
-    input_it_t,
-    flag_it_t,
-    output_it_t,
-    num_selected_it_t,
-    select_op_t,
-    equality_op_t,
-    offset_t,
-    keep_rejects,
-    may_alias>;
+    may_alias
+#if !TUNE_BASE
+    ,
+    policy_hub_t<T>
 #endif // !TUNE_BASE
+    >;
 
   // Retrieve axis parameters
   const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));

diff --git a/cub/benchmarks/bench/partition/three_way.cu b/cub/benchmarks/bench/partition/three_way.cu
@@ -47,7 +47,7 @@
 template <typename InputT>
 struct policy_hub_t
 {
-  struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
+  struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t>
   {
     using ThreeWayPartitionPolicy = //
       cub::AgentThreeWayPartitionPolicy<TUNE_THREADS_PER_BLOCK,

diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
@@ -53,7 +53,7 @@
 
 struct reduce_by_key_policy_hub
 {
-  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
   {
     using ReduceByKeyPolicyT =
       cub::AgentReduceByKeyPolicy<TUNE_THREADS,
@@ -64,7 +64,7 @@ struct reduce_by_key_policy_hub
                                   delay_constructor_t>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 #endif // !TUNE_BASE
 

diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -55,7 +55,7 @@
 
 struct reduce_by_key_policy_hub
 {
-  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
   {
     using ReduceByKeyPolicyT =
       cub::AgentReduceByKeyPolicy<TUNE_THREADS,
@@ -66,7 +66,7 @@ struct reduce_by_key_policy_hub
                                   delay_constructor_t>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 #endif // !TUNE_BASE
 

diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
@@ -54,7 +54,7 @@
 
 struct device_rle_policy_hub
 {
-  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
   {
     using RleSweepPolicyT =
       cub::AgentRlePolicy<TUNE_THREADS,
@@ -66,7 +66,7 @@ struct device_rle_policy_hub
                           delay_constructor_t>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 #endif // !TUNE_BASE
 

diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu
@@ -109,7 +109,7 @@ struct device_seg_sort_policy_hub
 {
   using DominantT = KeyT;
 
-  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
   {
     static constexpr int BLOCK_THREADS          = TUNE_THREADS;
     static constexpr int RADIX_BITS             = TUNE_RADIX_BITS;
@@ -143,7 +143,7 @@ struct device_seg_sort_policy_hub
                                        TUNE_M_LOAD_MODIFIER>>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 #endif // !TUNE_BASE
 

diff --git a/cub/benchmarks/bench/select/unique_by_key.cu b/cub/benchmarks/bench/select/unique_by_key.cu
@@ -53,7 +53,7 @@
 
 struct policy_hub
 {
-  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
   {
     using UniqueByKeyPolicyT =
       cub::AgentUniqueByKeyPolicy<TUNE_THREADS,
@@ -64,7 +64,7 @@ struct policy_hub
                                   delay_constructor_t>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 #endif // !TUNE_BASE
 

diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h
@@ -31,7 +31,7 @@ using policy_hub_t = cub::detail::transform::policy_hub<false, ::cuda::std::tupl
 #else
 struct policy_hub_t
 {
-  struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
+  struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy>
   {
     static constexpr int min_bif    = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
     static constexpr auto algorithm = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);

diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -383,40 +383,6 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
   // Tuning policies
   //---------------------------------------------------------------------
 
-  /// SM35
-  struct Policy350
-  {
-    using SpmvPolicyT =
-      AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 96 : 128,
-                      (sizeof(ValueT) > 4) ? 4 : 7,
-                      LOAD_LDG,
-                      LOAD_CA,
-                      LOAD_LDG,
-                      LOAD_LDG,
-                      LOAD_LDG,
-                      (sizeof(ValueT) > 4) ? true : false,
-                      BLOCK_SCAN_WARP_SCANS>;
-
-    using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>;
-  };
-
-  /// SM37
-  struct Policy370
-  {
-    using SpmvPolicyT =
-      AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 128 : 128,
-                      (sizeof(ValueT) > 4) ? 9 : 14,
-                      LOAD_LDG,
-                      LOAD_CA,
-                      LOAD_LDG,
-                      LOAD_LDG,
-                      LOAD_LDG,
-                      false,
-                      BLOCK_SCAN_WARP_SCANS>;
-
-    using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>;
-  };
-
   /// SM50
   struct Policy500
   {
@@ -459,15 +425,8 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
 #if (CUB_PTX_ARCH >= 600)
   using PtxPolicy = Policy600;
 
-#elif (CUB_PTX_ARCH >= 500)
-  using PtxPolicy = Policy500;
-
-#elif (CUB_PTX_ARCH >= 370)
-  using PtxPolicy = Policy370;
-
 #else
-  using PtxPolicy = Policy350;
-
+  using PtxPolicy = Policy500;
 #endif
 
   // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
@@ -502,12 +461,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
         } else if (ptx_version >= 500) {
           spmv_config.template Init<typename Policy500::SpmvPolicyT>();
           segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
-        } else if (ptx_version >= 370) {
-          spmv_config.template Init<typename Policy370::SpmvPolicyT>();
-          segment_fixup_config.template Init<typename Policy370::SegmentFixupPolicyT>();
         } else {
-          spmv_config.template Init<typename Policy350::SpmvPolicyT>();
-          segment_fixup_config.template Init<typename Policy350::SegmentFixupPolicyT>();
+          spmv_config.template Init<typename Policy500::SpmvPolicyT>();
+          segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
         }));
   }
 

diff --git a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh
@@ -52,17 +52,7 @@ struct policy_hub
 {
   using ValueT = typename std::iterator_traits<InputIteratorT>::value_type;
 
-  struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
-  {
-    using AdjacentDifferencePolicy =
-      AgentAdjacentDifferencePolicy<128,
-                                    Nominal8BItemsToItems<ValueT>(7),
-                                    BLOCK_LOAD_WARP_TRANSPOSE,
-                                    LOAD_DEFAULT,
-                                    BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-  struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+  struct Policy500 : ChainedPolicy<500, Policy500, Policy500>
   {
     using AdjacentDifferencePolicy =
       AgentAdjacentDifferencePolicy<128,
@@ -72,7 +62,7 @@ struct policy_hub
                                     BLOCK_STORE_WARP_TRANSPOSE>;
   };
 
-  using MaxPolicy = Policy350;
+  using MaxPolicy = Policy500;
 };
 } // namespace adjacent_difference
 } // namespace detail

diff --git a/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh b/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh
@@ -75,8 +75,8 @@ struct policy_hub
   using buff_delay_constructor_t  = detail::default_delay_constructor_t<BufferOffsetT>;
   using block_delay_constructor_t = detail::default_delay_constructor_t<BlockOffsetT>;
 
-  /// SM35
-  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
+  /// SM50
+  struct Policy500 : ChainedPolicy<500, Policy500, Policy500>
   {
     static constexpr bool PREFER_POW2_BITS = true;
     using AgentSmallBufferPolicyT          = AgentBatchMemcpyPolicy<
@@ -95,7 +95,7 @@ struct policy_hub
   };
 
   /// SM70
-  struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
+  struct Policy700 : ChainedPolicy<700, Policy700, Policy500>
   {
     static constexpr bool PREFER_POW2_BITS = false;
     using AgentSmallBufferPolicyT          = AgentBatchMemcpyPolicy<

diff --git a/cub/cub/device/dispatch/tuning/tuning_for.cuh b/cub/cub/device/dispatch/tuning/tuning_for.cuh
@@ -49,12 +49,12 @@ namespace for_each
 
 struct policy_hub_t
 {
-  struct policy_350_t : ChainedPolicy<350, policy_350_t, policy_350_t>
+  struct policy_500_t : ChainedPolicy<500, policy_500_t, policy_500_t>
   {
     using for_policy_t = policy_t<256, 2>;
   };
 
-  using MaxPolicy = policy_350_t;
+  using MaxPolicy = policy_500_t;
 };
 
 } // namespace for_each

diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
@@ -136,15 +136,8 @@ struct policy_hub
     return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1);
   }
 
-  // SM35
-  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
-  {
-    // TODO This might be worth it to separate usual histogram and the multi one
-    using AgentHistogramPolicyT = AgentHistogramPolicy<128, t_scale(8), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>;
-  };
-
   // SM50
-  struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
+  struct Policy500 : ChainedPolicy<500, Policy500, Policy500>
   {
     // TODO This might be worth it to separate usual histogram and the multi one
     using AgentHistogramPolicyT =