Skip to content

Commit

Permalink
Add b200 tunings for scan.exclusive.sum (NVIDIA#3559)
Browse files Browse the repository at this point in the history
* Drop unused struct
* Refactor
* Clarify input type in scan benchmark
* Redesign scan policy selection after discussion with Georgii

Co-authored-by: Giannis Gonidelis <[email protected]>
Co-authored-by: Georgii Evtushenko <[email protected]>
  • Loading branch information
3 people authored and elstehle committed Feb 7, 2025
1 parent 87b3dae commit 071ddc9
Show file tree
Hide file tree
Showing 7 changed files with 271 additions and 41 deletions.
19 changes: 10 additions & 9 deletions cub/benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,18 @@ struct policy_hub_t
template <typename T, typename OffsetT>
static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using init_t = cub::detail::InputValue<T>;
using accum_t = ::cuda::std::__accumulator_t<op_t, T, T>;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using init_t = T;
using wrapped_init_t = cub::detail::InputValue<init_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, init_t, T>;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;

#if !TUNE_BASE
using policy_t = policy_hub_t<accum_t>;
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, init_t, offset_t, accum_t, policy_t>;
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, policy_t>;
#else
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, init_t, offset_t, accum_t>;
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand All @@ -114,7 +115,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)

size_t tmp_size;
dispatch_t::Dispatch(
nullptr, tmp_size, d_input, d_output, op_t{}, init_t{T{}}, static_cast<int>(input.size()), 0 /* stream */);
nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{T{}}, static_cast<int>(input.size()), 0 /* stream */);

thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
Expand All @@ -126,7 +127,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
d_input,
d_output,
op_t{},
init_t{T{}},
wrapped_init_t{T{}},
static_cast<int>(input.size()),
launch.get_stream());
});
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1321,7 +1321,8 @@ struct DeviceScan
detail::InputValue<InitValueT>,
OffsetT,
AccumT,
detail::scan::policy_hub<AccumT, ScanOpT>,
detail::scan::
policy_hub<detail::value_t<InputIteratorT>, detail::value_t<OutputIteratorT>, AccumT, OffsetT, ScanOpT>,
ForceInclusive>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
Expand Down
13 changes: 7 additions & 6 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -233,12 +233,13 @@ template <typename InputIteratorT,
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
cub::detail::value_t<InputIteratorT>,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>>,
typename PolicyHub = detail::scan::policy_hub<AccumT, ScanOpT>,
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
cub::detail::value_t<InputIteratorT>,
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>>,
typename PolicyHub = detail::scan::
policy_hub<detail::value_t<InputIteratorT>, detail::value_t<OutputIteratorT>, AccumT, OffsetT, ScanOpT>,
bool ForceInclusive = false>
struct DispatchScan
{
Expand Down
219 changes: 216 additions & 3 deletions cub/cub/device/dispatch/tuning/tuning_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include <cub/util_device.cuh>
#include <cub/util_type.cuh>

#include <cuda/functional>
#include <cuda/std/functional>

CUB_NAMESPACE_BEGIN
Expand All @@ -68,12 +69,26 @@ enum class primitive_op
no,
yes
};
// Category of the scan operator: only ::cuda::std::plus is recognized and
// specially tuned; every other operator falls back to generic tunings.
enum class op_type
{
  plus,
  unknown
};
// Width (in bytes) of the offset/index type, used to dispatch tunings.
enum class offset_size
{
  _4,
  _8,
  unknown
};
// Width (in bytes) of the input/output value type, used to dispatch tunings.
enum class value_size
{
  _1,
  _2,
  _4,
  _8,
  _16,
  unknown
};
enum class accum_size
{
_1,
Expand All @@ -96,6 +111,36 @@ constexpr primitive_op is_primitive_op()
return basic_binary_op_t<ScanOpT>::value ? primitive_op::yes : primitive_op::no;
}

// Trait detecting whether an operator type is ::cuda::std::plus.
// Primary template: anything not matched by the specialization below is not plus.
template <typename Op>
struct is_plus
{
  static constexpr bool value = false;
};

// Specialization: ::cuda::std::plus<T> for any T (including the transparent
// ::cuda::std::plus<> == plus<void>) is recognized as plus.
template <typename T>
struct is_plus<::cuda::std::plus<T>>
{
  static constexpr bool value = true;
};

// Maps a scan operator type onto the op_type tuning category:
// ::cuda::std::plus is recognized, every other operator is unknown.
template <class ScanOpT>
constexpr op_type classify_op()
{
  if (is_plus<ScanOpT>::value)
  {
    return op_type::plus;
  }
  return op_type::unknown;
}

// Buckets a value type by sizeof into the value_size tuning category.
// Sizes other than 1/2/4/8/16 bytes map to value_size::unknown.
template <class ValueT>
constexpr value_size classify_value_size()
{
  switch (sizeof(ValueT))
  {
    case 1:
      return value_size::_1;
    case 2:
      return value_size::_2;
    case 4:
      return value_size::_4;
    case 8:
      return value_size::_8;
    case 16:
      return value_size::_16;
    default:
      return value_size::unknown;
  }
}

template <class AccumT>
constexpr accum_size classify_accum_size()
{
Expand All @@ -108,6 +153,12 @@ constexpr accum_size classify_accum_size()
: accum_size::unknown;
}

// Buckets an offset type by sizeof into the offset_size tuning category:
// the tunings distinguish 4-byte from 8-byte offsets; anything else is unknown.
template <class OffsetT>
constexpr offset_size classify_offset_size()
{
  if (sizeof(OffsetT) == 4)
  {
    return offset_size::_4;
  }
  if (sizeof(OffsetT) == 8)
  {
    return offset_size::_8;
  }
  return offset_size::unknown;
}

template <class AccumT, int Threads, int Items, int L2B, int L2W>
struct tuning
{
Expand Down Expand Up @@ -227,7 +278,138 @@ struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si
#endif
// clang-format on

template <typename AccumT, typename ScanOpT>
// Primary template for SM100 (B200) scan tunings. Left undefined on purpose:
// specializations below are selected by operator category, accumulator
// primitiveness, offset size, and value size. When no specialization matches,
// the incomplete primary makes the tuned overload in the policy hub SFINAE
// away, falling back to the SM90 policy.
template <class ValueT,
          class AccumT,
          class OffsetT,
          op_type OpTypeT,
          primitive_accum PrimitiveAccumulator = is_primitive_accum<AccumT>(),
          offset_size OffsetSize = classify_offset_size<OffsetT>(),
          value_size ValueSize = classify_value_size<ValueT>()>
struct sm100_tuning;

// sum
// Tunings for ::cuda::std::plus with primitive accumulators. The comment in
// each struct is the autotuner's winning variant followed by its measured
// scores: ipt = items per thread, tpb = threads per block, ns/l2w = delay
// constructor parameters, trp_1 = warp-transposed load/store, ld = load
// modifier (0 = LOAD_DEFAULT, 1 = LOAD_CA).

// 1-byte values, 4-byte offsets
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_1>
{
  // ipt_18.tpb_512.ns_768.dcid_7.l2w_820.trp_1.ld_0 1.188818 1.005682 1.173041 1.305288
  static constexpr int items   = 18;
  static constexpr int threads = 512;
  using delay_constructor      = exponential_backon_constructor_t<768, 820>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_DEFAULT;
};

// 1-byte values, 8-byte offsets
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_1>
{
  // ipt_14.tpb_384.ns_228.dcid_7.l2w_775.trp_1.ld_1 1.107210 1.000000 1.100637 1.307692
  static constexpr int items   = 14;
  static constexpr int threads = 384;
  using delay_constructor      = exponential_backon_constructor_t<228, 775>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_CA;
};

// 2-byte values, 4-byte offsets (no 8-byte-offset counterpart yet; see the
// commented-out tuning below, which regressed for large inputs)
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_2>
{
  // ipt_13.tpb_512.ns_1384.dcid_7.l2w_720.trp_1.ld_0 1.128443 1.002841 1.119688 1.307692
  static constexpr int items   = 13;
  static constexpr int threads = 512;
  using delay_constructor      = exponential_backon_constructor_t<1384, 720>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_DEFAULT;
};

// todo(gonidelis): Regresses for large inputs. Find better tuning.
// template <class ValueT, class AccumT, class OffsetT>
// struct sm100_tuning<ValueT,
// AccumT,
// OffsetT,
// op_type::plus,
// primitive_value::yes,
// primitive_accum::yes,
// offset_size::_8,
// value_size::_2>
// {
// // ipt_13.tpb_288.ns_1520.dcid_5.l2w_895.trp_1.ld_1 1.080934 0.983509 1.077724 1.305288
// static constexpr int items = 13;
// static constexpr int threads = 288;
// using delay_constructor = exponential_backon_jitter_window_constructor_t<1520, 895>;
// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
// static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
// static constexpr CacheLoadModifier load_modifier = LOAD_CA;
// };

// 4-byte values, 4-byte offsets
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_4>
{
  // ipt_22.tpb_384.ns_1904.dcid_6.l2w_830.trp_1.ld_0 1.148442 0.997167 1.139902 1.462651
  static constexpr int items   = 22;
  static constexpr int threads = 384;
  using delay_constructor      = exponential_backon_jitter_constructor_t<1904, 830>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_DEFAULT;
};

// 4-byte values, 8-byte offsets
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_4>
{
  // ipt_19.tpb_416.ns_956.dcid_7.l2w_550.trp_1.ld_1 1.146142 0.994350 1.137459 1.455636
  static constexpr int items   = 19;
  static constexpr int threads = 416;
  using delay_constructor      = exponential_backon_constructor_t<956, 550>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_CA;
};

// 8-byte values, 4-byte offsets
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_8>
{
  // ipt_23.tpb_416.ns_772.dcid_5.l2w_710.trp_1.ld_0 1.089468 1.015581 1.085630 1.264583
  static constexpr int items   = 23;
  static constexpr int threads = 416;
  using delay_constructor      = exponential_backon_jitter_window_constructor_t<772, 710>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_DEFAULT;
};

// 8-byte values, 8-byte offsets (double is explicitly excluded from this
// tuning further below)
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_8>
{
  // ipt_22.tpb_320.ns_328.dcid_2.l2w_965.trp_1.ld_0 1.080133 1.000000 1.075577 1.248963
  static constexpr int items   = 22;
  static constexpr int threads = 320;
  using delay_constructor      = exponential_backoff_constructor_t<328, 965>;
  static constexpr BlockLoadAlgorithm load_algorithm   = BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
  static constexpr CacheLoadModifier load_modifier     = LOAD_DEFAULT;
};

// todo(gonidelis): Add tunings for i128, float and double.
// template <class OffsetT> struct sm100_tuning<float, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8,
// accum_size::_4>;

// double would otherwise match the generic <8-byte value, 8-byte offset>
// sm100 tuning above; pin it to the sm90 tuning instead.
// Default explicitly so it doesn't pick up the sm100<I64, I64> tuning.
template <class AccumT, class OffsetT>
struct sm100_tuning<double, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_8>
    : sm90_tuning<double, primitive_op::yes, primitive_accum::yes, accum_size::_8>
{};

#if CUB_IS_INT128_ENABLED
// template <class OffsetT> struct sm100_tuning<__int128_t, OffsetT, op_type::plus, primitive_accum::no,
// offset_size::_8, accum_size::_16> : tuning<576, 21, 860, 630> {}; template <class OffsetT> struct
// sm100_tuning<__uint128_t, OffsetT, op_type::plus, primitive_accum::no, offset_size::_8, accum_size::_16>
// : sm100_tuning<__int128_t, OffsetT, op_type::plus, primitive_accum::no, offset_size::_8, accum_size::_16>
// {};
#endif

template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
struct policy_hub
{
// For large values, use timesliced loads/stores to fit shared memory.
Expand Down Expand Up @@ -291,13 +473,44 @@ struct policy_hub
using ScanPolicyT = decltype(select_agent_policy<sm90_tuning<AccumT, is_primitive_op<ScanOpT>()>>(0));
};

using MaxPolicy = Policy900;
struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900>
{
  // Use values from tuning if a specialization exists that matches a benchmark, otherwise pick Policy900
  template <typename Tuning,
            typename IVT,
            // In the tuning benchmarks the Initial-, Input- and OutputType are the same. Let's check that the
            // accumulator type's size matches what we used during the benchmark since that has an impact (The
            // tunings also check later that it's a primitive type, so arithmetic impact is also comparable to the
            // benchmark). Input- and OutputType only impact loading and storing data (all arithmetic is done in the
            // accumulator type), so let's check that they are the same size and dispatch the size in the tunings.
            ::cuda::std::enable_if_t<sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t<ScanOpT, IVT, IVT>)
                                       && sizeof(IVT) == sizeof(OutputValueT),
                                     int> = 0>
  // Preferred overload (int parameter wins overload resolution for the
  // literal 0 argument below). Substitution fails — deferring to the long
  // overload — when the enable_if constraint doesn't hold or when Tuning is
  // an incomplete type, i.e. no matching sm100_tuning specialization exists.
  static auto select_agent_policy100(int)
    -> AgentScanPolicy<Tuning::threads,
                       Tuning::items,
                       AccumT,
                       Tuning::load_algorithm,
                       Tuning::load_modifier,
                       Tuning::store_algorithm,
                       BLOCK_SCAN_WARP_SCANS,
                       MemBoundScaling<Tuning::threads, Tuning::items, AccumT>,
                       typename Tuning::delay_constructor>;

  // Fallback overload: no usable sm100 tuning — reuse the SM90 scan policy.
  template <typename Tuning, typename IVT>
  static auto select_agent_policy100(long) -> typename Policy900::ScanPolicyT;

  // Resolved agent policy for SM100: the tuned policy when a specialization
  // matched, otherwise Policy900's.
  using ScanPolicyT =
    decltype(select_agent_policy100<sm100_tuning<InputValueT, AccumT, OffsetT, classify_op<ScanOpT>()>, InputValueT>(
      0));
};

using MaxPolicy = Policy1000;
};
} // namespace scan
} // namespace detail

template <typename AccumT, typename ScanOpT = ::cuda::std::plus<>>
using DeviceScanPolicy CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and it will be "
"removed.") = detail::scan::policy_hub<AccumT, ScanOpT>;
"removed.") = detail::scan::policy_hub<int, int, AccumT, int, ScanOpT>;

CUB_NAMESPACE_END
10 changes: 10 additions & 0 deletions thrust/testing/scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,16 @@ struct iterator_traits<only_set_when_expected_it>
};
THRUST_NAMESPACE_END

namespace std
{
// Make the test iterator usable with generic code that queries
// std::iterator_traits directly (specializing iterator_traits for a
// program-defined type is permitted by the standard). value_type matches the
// long long elements the big-index scan tests produce; reference mirrors the
// iterator's own self-returning behavior.
template <>
struct iterator_traits<only_set_when_expected_it>
{
  using value_type = long long;
  using reference  = only_set_when_expected_it;
};
} // namespace std

void TestInclusiveScanWithBigIndexesHelper(int magnitude)
{
thrust::constant_iterator<long long> begin(1);
Expand Down
Loading

0 comments on commit 071ddc9

Please sign in to comment.