Skip to content

Commit

Permalink
Merge branch 'main' into merge-sort-dynamic-cub-dispatch
Browse files Browse the repository at this point in the history
  • Loading branch information
NaderAlAwar authored Jan 29, 2025
2 parents 4cbee12 + d21e0c9 commit a3602cc
Show file tree
Hide file tree
Showing 268 changed files with 59,109 additions and 3,628 deletions.
19 changes: 10 additions & 9 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ workflows:
- {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
- {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
# Current CTK testing:
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']}
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']}
# Disabled until we figure out the issue with the TBB dll
#- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']}
# Split up cub tests:
- {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']}
- {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
Expand Down Expand Up @@ -256,13 +257,13 @@ projects:

# testing -> Runner with GPU is in a nv-gh-runners testing pool
gpus:
v100: { sm: 70 } # 32 GB, 40 runners
t4: { sm: 75, testing: true } # 16 GB, 8 runners
rtx2080: { sm: 75, testing: true } # 8 GB, 8 runners
rtxa6000: { sm: 86, testing: true } # 48 GB, 12 runners
l4: { sm: 89, testing: true } # 24 GB, 48 runners
rtx4090: { sm: 89, testing: true } # 24 GB, 10 runners
h100: { sm: 90, testing: true } # 80 GB, 16 runners
v100: { sm: 70 } # 32 GB, 40 runners
t4: { sm: 75 } # 16 GB, 10 runners
rtx2080: { sm: 75 } # 8 GB, 12 runners
rtxa6000: { sm: 86 } # 48 GB, 12 runners
l4: { sm: 89 } # 24 GB, 48 runners
rtx4090: { sm: 89 } # 24 GB, 10 runners
h100: { sm: 90 } # 80 GB, 16 runners

# Tags are used to define a `matrix job` in the workflow section.
#
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/adjacent_difference/subtract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#if !TUNE_BASE
struct policy_hub_t
{
struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
{
using AdjacentDifferencePolicy =
cub::AgentAdjacentDifferencePolicy<TUNE_THREADS_PER_BLOCK,
Expand All @@ -45,7 +45,7 @@ struct policy_hub_t
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
#endif // !TUNE_BASE

Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/copy/memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ using block_delay_constructor_t =

struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t>
{
using AgentSmallBufferPolicyT = cub::detail::AgentBatchMemcpyPolicy<
TUNE_THREADS,
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/histogram/histogram_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND;
template <typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS>
struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t>
{
static constexpr cub::BlockLoadAlgorithm load_algorithm =
(TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED)
Expand Down
20 changes: 5 additions & 15 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,6 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
Expand All @@ -122,20 +120,12 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
equality_op_t,
offset_t,
keep_rejects,
may_alias,
policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias>;
may_alias
#if !TUNE_BASE
,
policy_hub_t<T>
#endif // TUNE_BASE
>;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
20 changes: 5 additions & 15 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,6 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
Expand All @@ -148,20 +146,12 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
equality_op_t,
offset_t,
keep_rejects,
may_alias,
policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias>;
may_alias
#if !TUNE_BASE
,
policy_hub_t<T>
#endif // !TUNE_BASE
>;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/three_way.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
template <typename InputT>
struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t>
{
using ThreeWayPartitionPolicy = //
cub::AgentThreeWayPartitionPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/reduce/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@

struct reduce_by_key_policy_hub
{
struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
{
using ReduceByKeyPolicyT =
cub::AgentReduceByKeyPolicy<TUNE_THREADS,
Expand All @@ -64,7 +64,7 @@ struct reduce_by_key_policy_hub
delay_constructor_t>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
#endif // !TUNE_BASE

Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/run_length_encode/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@

struct reduce_by_key_policy_hub
{
struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
{
using ReduceByKeyPolicyT =
cub::AgentReduceByKeyPolicy<TUNE_THREADS,
Expand All @@ -66,7 +66,7 @@ struct reduce_by_key_policy_hub
delay_constructor_t>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
#endif // !TUNE_BASE

Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@

struct device_rle_policy_hub
{
struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
{
using RleSweepPolicyT =
cub::AgentRlePolicy<TUNE_THREADS,
Expand All @@ -66,7 +66,7 @@ struct device_rle_policy_hub
delay_constructor_t>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
#endif // !TUNE_BASE

Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/segmented_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ struct device_seg_sort_policy_hub
{
using DominantT = KeyT;

struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
{
static constexpr int BLOCK_THREADS = TUNE_THREADS;
static constexpr int RADIX_BITS = TUNE_RADIX_BITS;
Expand Down Expand Up @@ -143,7 +143,7 @@ struct device_seg_sort_policy_hub
TUNE_M_LOAD_MODIFIER>>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
#endif // !TUNE_BASE

Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/select/unique_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@

struct policy_hub
{
struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
{
using UniqueByKeyPolicyT =
cub::AgentUniqueByKeyPolicy<TUNE_THREADS,
Expand All @@ -64,7 +64,7 @@ struct policy_hub
delay_constructor_t>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
#endif // !TUNE_BASE

Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/transform/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ using policy_hub_t = cub::detail::transform::policy_hub<false, ::cuda::std::tupl
#else
struct policy_hub_t
{
struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy>
{
static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
static constexpr auto algorithm = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
Expand Down
50 changes: 3 additions & 47 deletions cub/cub/device/dispatch/dispatch_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -383,40 +383,6 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
// Tuning policies
//---------------------------------------------------------------------

/// SM35
struct Policy350
{
using SpmvPolicyT =
AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 96 : 128,
(sizeof(ValueT) > 4) ? 4 : 7,
LOAD_LDG,
LOAD_CA,
LOAD_LDG,
LOAD_LDG,
LOAD_LDG,
(sizeof(ValueT) > 4) ? true : false,
BLOCK_SCAN_WARP_SCANS>;

using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>;
};

/// SM37
struct Policy370
{
using SpmvPolicyT =
AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 128 : 128,
(sizeof(ValueT) > 4) ? 9 : 14,
LOAD_LDG,
LOAD_CA,
LOAD_LDG,
LOAD_LDG,
LOAD_LDG,
false,
BLOCK_SCAN_WARP_SCANS>;

using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>;
};

/// SM50
struct Policy500
{
Expand Down Expand Up @@ -459,15 +425,8 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
#if (CUB_PTX_ARCH >= 600)
using PtxPolicy = Policy600;

#elif (CUB_PTX_ARCH >= 500)
using PtxPolicy = Policy500;

#elif (CUB_PTX_ARCH >= 370)
using PtxPolicy = Policy370;

#else
using PtxPolicy = Policy350;

using PtxPolicy = Policy500;
#endif

// "Opaque" policies (whose parameterizations aren't reflected in the type signature)
Expand Down Expand Up @@ -502,12 +461,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
} else if (ptx_version >= 500) {
spmv_config.template Init<typename Policy500::SpmvPolicyT>();
segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
} else if (ptx_version >= 370) {
spmv_config.template Init<typename Policy370::SpmvPolicyT>();
segment_fixup_config.template Init<typename Policy370::SegmentFixupPolicyT>();
} else {
spmv_config.template Init<typename Policy350::SpmvPolicyT>();
segment_fixup_config.template Init<typename Policy350::SegmentFixupPolicyT>();
spmv_config.template Init<typename Policy500::SpmvPolicyT>();
segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
}));
}

Expand Down
14 changes: 2 additions & 12 deletions cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,7 @@ struct policy_hub
{
using ValueT = typename std::iterator_traits<InputIteratorT>::value_type;

struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
{
using AdjacentDifferencePolicy =
AgentAdjacentDifferencePolicy<128,
Nominal8BItemsToItems<ValueT>(7),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
struct Policy500 : ChainedPolicy<500, Policy500, Policy500>
{
using AdjacentDifferencePolicy =
AgentAdjacentDifferencePolicy<128,
Expand All @@ -72,7 +62,7 @@ struct policy_hub
BLOCK_STORE_WARP_TRANSPOSE>;
};

using MaxPolicy = Policy350;
using MaxPolicy = Policy500;
};
} // namespace adjacent_difference
} // namespace detail
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ struct policy_hub
using buff_delay_constructor_t = detail::default_delay_constructor_t<BufferOffsetT>;
using block_delay_constructor_t = detail::default_delay_constructor_t<BlockOffsetT>;

/// SM35
struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
/// SM50
struct Policy500 : ChainedPolicy<500, Policy500, Policy500>
{
static constexpr bool PREFER_POW2_BITS = true;
using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy<
Expand All @@ -95,7 +95,7 @@ struct policy_hub
};

/// SM70
struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
struct Policy700 : ChainedPolicy<700, Policy700, Policy500>
{
static constexpr bool PREFER_POW2_BITS = false;
using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy<
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/tuning/tuning_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ namespace for_each

struct policy_hub_t
{
struct policy_350_t : ChainedPolicy<350, policy_350_t, policy_350_t>
struct policy_500_t : ChainedPolicy<500, policy_500_t, policy_500_t>
{
using for_policy_t = policy_t<256, 2>;
};

using MaxPolicy = policy_350_t;
using MaxPolicy = policy_500_t;
};

} // namespace for_each
Expand Down
9 changes: 1 addition & 8 deletions cub/cub/device/dispatch/tuning/tuning_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,8 @@ struct policy_hub
return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1);
}

// SM35
struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
// TODO This might be worth it to separate usual histogram and the multi one
using AgentHistogramPolicyT = AgentHistogramPolicy<128, t_scale(8), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>;
};

// SM50
struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
struct Policy500 : ChainedPolicy<500, Policy500, Policy500>
{
// TODO This might be worth it to separate usual histogram and the multi one
using AgentHistogramPolicyT =
Expand Down
Loading

0 comments on commit a3602cc

Please sign in to comment.