Add b200 tunings for scan.exclusive.sum #3559

Merged 11 commits on Feb 7, 2025
19 changes: 10 additions & 9 deletions cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -87,17 +87,18 @@ struct policy_hub_t
template <typename T, typename OffsetT>
static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using init_t = cub::detail::InputValue<T>;
using accum_t = ::cuda::std::__accumulator_t<op_t, T, T>;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using init_t = T;
using wrapped_init_t = cub::detail::InputValue<init_t>;
using accum_t = ::cuda::std::__accumulator_t<op_t, init_t, T>;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;

#if !TUNE_BASE
using policy_t = policy_hub_t<accum_t>;
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, init_t, offset_t, accum_t, policy_t>;
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, policy_t>;
#else
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, init_t, offset_t, accum_t>;
using dispatch_t = cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
@@ -114,7 +115,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)

size_t tmp_size;
dispatch_t::Dispatch(
nullptr, tmp_size, d_input, d_output, op_t{}, init_t{T{}}, static_cast<int>(input.size()), 0 /* stream */);
nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{T{}}, static_cast<int>(input.size()), 0 /* stream */);

thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
@@ -126,7 +127,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
d_input,
d_output,
op_t{},
init_t{T{}},
wrapped_init_t{T{}},
static_cast<int>(input.size()),
launch.get_stream());
});
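For context, here is a hedged end-to-end sketch (not part of this PR) of the public call the benchmark exercises: `cub::DeviceScan::ExclusiveSum` with its standard two-phase temporary-storage protocol. The tunings in this PR only affect which internal policy such a call selects on newer hardware; input values and sizes below are illustrative.

```cpp
// Hedged usage sketch: exclusive sum via CUB's two-phase temp-storage protocol.
#include <cub/device/device_scan.cuh>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cstdio>

int main()
{
  thrust::host_vector<int> h_in(4);
  h_in[0] = 1; h_in[1] = 2; h_in[2] = 3; h_in[3] = 4;
  thrust::device_vector<int> d_in  = h_in;
  thrust::device_vector<int> d_out(h_in.size());

  // Phase 1: query how much temporary storage the scan needs.
  void* d_temp      = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceScan::ExclusiveSum(
    d_temp, temp_bytes,
    thrust::raw_pointer_cast(d_in.data()),
    thrust::raw_pointer_cast(d_out.data()),
    static_cast<int>(d_in.size()));

  // Phase 2: allocate the storage and run the scan.
  thrust::device_vector<unsigned char> temp(temp_bytes);
  d_temp = thrust::raw_pointer_cast(temp.data());
  cub::DeviceScan::ExclusiveSum(
    d_temp, temp_bytes,
    thrust::raw_pointer_cast(d_in.data()),
    thrust::raw_pointer_cast(d_out.data()),
    static_cast<int>(d_in.size()));

  thrust::host_vector<int> h_out = d_out; // expected: 0 1 3 6
  std::printf("%d %d %d %d\n", h_out[0], h_out[1], h_out[2], h_out[3]);
  return 0;
}
```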
3 changes: 2 additions & 1 deletion cub/cub/device/device_scan.cuh
@@ -1165,7 +1165,8 @@ struct DeviceScan
detail::InputValue<InitValueT>,
OffsetT,
AccumT,
detail::scan::policy_hub<AccumT, ScanOpT>,
detail::scan::
policy_hub<detail::value_t<InputIteratorT>, detail::value_t<OutputIteratorT>, AccumT, OffsetT, ScanOpT>,
ForceInclusive::Yes>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
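This hunk threads the input/output value types and the offset type into `policy_hub` alongside the accumulator type. A small standalone illustration (not part of the diff) of what those extra arguments carry, assuming `cub::detail::value_t` keeps its usual meaning (the iterator's `iterator_traits` value type) and is reachable via `<cub/util_type.cuh>`, where CUB has historically defined it:

```cpp
// Standalone illustration: value_t<It> is the iterator's element type, so the
// policy hub now also sees the load/store element types and the offset type,
// not just the accumulator type.
#include <cub/util_type.cuh>

#include <cuda/std/type_traits>

static_assert(::cuda::std::is_same_v<cub::detail::value_t<const float*>, float>, "");
static_assert(::cuda::std::is_same_v<cub::detail::value_t<int*>, int>, "");

int main() { return 0; }
```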
47 changes: 24 additions & 23 deletions cub/cub/device/dispatch/dispatch_scan.cuh
@@ -133,29 +133,30 @@ struct DeviceScanKernelSource
* Enum flag to specify whether to enforce inclusive scan.
*
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
cub::detail::value_t<InputIteratorT>,
::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>>,
typename PolicyHub = detail::scan::policy_hub<AccumT, ScanOpT>,
ForceInclusive EnforceInclusive = ForceInclusive::No,
typename KernelSource = detail::scan::DeviceScanKernelSource<
typename PolicyHub::MaxPolicy,
InputIteratorT,
OutputIteratorT,
ScanOpT,
InitValueT,
OffsetT,
AccumT,
EnforceInclusive>,
typename KernelLauncherFactory = detail::TripleChevronFactory>

template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename OffsetT,
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
cub::detail::value_t<InputIteratorT>,
::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
cub::detail::value_t<InputIteratorT>,
typename InitValueT::value_type>>,
typename PolicyHub =
detail::scan::policy_hub<detail::value_t<InputIteratorT>, detail::value_t<OutputIteratorT>, AccumT, OffsetT, ScanOpT>,
ForceInclusive EnforceInclusive = ForceInclusive::No,
typename KernelSource = detail::scan::DeviceScanKernelSource<
typename PolicyHub::MaxPolicy,
InputIteratorT,
OutputIteratorT,
ScanOpT,
InitValueT,
OffsetT,
AccumT,
EnforceInclusive>,
typename KernelLauncherFactory = detail::TripleChevronFactory>
struct DispatchScan
{
//---------------------------------------------------------------------
219 changes: 216 additions & 3 deletions cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -46,6 +46,7 @@
#include <cub/util_device.cuh>
#include <cub/util_type.cuh>

#include <cuda/functional>
#include <cuda/std/functional>

CUB_NAMESPACE_BEGIN
@@ -69,12 +70,26 @@ enum class primitive_op
no,
yes
};
enum class op_type
{
plus,
unknown
};
enum class offset_size
{
_4,
_8,
unknown
};
enum class value_size
{
_1,
_2,
_4,
_8,
_16,
unknown
};
enum class accum_size
{
_1,
@@ -97,6 +112,36 @@ constexpr primitive_op is_primitive_op()
return basic_binary_op_t<ScanOpT>::value ? primitive_op::yes : primitive_op::no;
}

template <typename Op>
struct is_plus
{
static constexpr bool value = false;
};

template <typename T>
struct is_plus<::cuda::std::plus<T>>
{
static constexpr bool value = true;
};

template <class ScanOpT>
constexpr op_type classify_op()
{
return is_plus<ScanOpT>::value ? op_type::plus : op_type::unknown;
}

template <class ValueT>
constexpr value_size classify_value_size()
{
return sizeof(ValueT) == 1 ? value_size::_1
: sizeof(ValueT) == 2 ? value_size::_2
: sizeof(ValueT) == 4 ? value_size::_4
: sizeof(ValueT) == 8 ? value_size::_8
: sizeof(ValueT) == 16
? value_size::_16
: value_size::unknown;
}

template <class AccumT>
constexpr accum_size classify_accum_size()
{
@@ -109,6 +154,12 @@ constexpr accum_size classify_accum_size()
: accum_size::unknown;
}

template <class OffsetT>
constexpr offset_size classify_offset_size()
{
return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown;
}
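
Taken together, these helpers reduce a concrete (operator, value type, offset type) combination to the enum tags that key the new sm100 tunings below. A compile-time illustration, not part of the diff, assuming the snippet is compiled inside `cub::detail::scan` so the names resolve unqualified:

```cpp
// Compile-time illustration of the classifiers (assumes cub::detail::scan scope).
static_assert(classify_op<::cuda::std::plus<>>() == op_type::plus, "");
static_assert(classify_op<::cuda::std::minus<>>() == op_type::unknown, "");
static_assert(classify_value_size<short>() == value_size::_2, "");
static_assert(classify_value_size<double>() == value_size::_8, "");
static_assert(classify_offset_size<unsigned int>() == offset_size::_4, "");
static_assert(classify_offset_size<unsigned long long>() == offset_size::_8, "");
```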

template <class AccumT,
primitive_op PrimitiveOp,
primitive_accum PrimitiveAccumulator = is_primitive_accum<AccumT>(),
@@ -230,6 +281,137 @@ struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si
#endif
// clang-format on

template <class ValueT,
class AccumT,
class OffsetT,
op_type OpTypeT,
primitive_accum PrimitiveAccumulator = is_primitive_accum<AccumT>(),
offset_size OffsetSize = classify_offset_size<OffsetT>(),
value_size ValueSize = classify_value_size<ValueT>()>
struct sm100_tuning;

// sum
template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_1>
{
// ipt_18.tpb_512.ns_768.dcid_7.l2w_820.trp_1.ld_0 1.188818 1.005682 1.173041 1.305288
static constexpr int items = 18;
static constexpr int threads = 512;
using delay_constructor = exponential_backon_constructor_t<768, 820>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
};

template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_1>
{
// ipt_14.tpb_384.ns_228.dcid_7.l2w_775.trp_1.ld_1 1.107210 1.000000 1.100637 1.307692
static constexpr int items = 14;
static constexpr int threads = 384;
using delay_constructor = exponential_backon_constructor_t<228, 775>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
};

template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_2>
{
// ipt_13.tpb_512.ns_1384.dcid_7.l2w_720.trp_1.ld_0 1.128443 1.002841 1.119688 1.307692
static constexpr int items = 13;
static constexpr int threads = 512;
using delay_constructor = exponential_backon_constructor_t<1384, 720>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
};

// todo(gonidelis): Regresses for large inputs. Find better tuning.
// template <class ValueT, class AccumT, class OffsetT>
// struct sm100_tuning<ValueT,
// AccumT,
// OffsetT,
// op_type::plus,
// primitive_value::yes,
// primitive_accum::yes,
// offset_size::_8,
// value_size::_2>
// {
// // ipt_13.tpb_288.ns_1520.dcid_5.l2w_895.trp_1.ld_1 1.080934 0.983509 1.077724 1.305288
// static constexpr int items = 13;
// static constexpr int threads = 288;
// using delay_constructor = exponential_backon_jitter_window_constructor_t<1520, 895>;
// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
// static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
// static constexpr CacheLoadModifier load_modifier = LOAD_CA;
// };

template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_4>
{
// ipt_22.tpb_384.ns_1904.dcid_6.l2w_830.trp_1.ld_0 1.148442 0.997167 1.139902 1.462651
static constexpr int items = 22;
static constexpr int threads = 384;
using delay_constructor = exponential_backon_jitter_constructor_t<1904, 830>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
};

template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_4>
{
// ipt_19.tpb_416.ns_956.dcid_7.l2w_550.trp_1.ld_1 1.146142 0.994350 1.137459 1.455636
static constexpr int items = 19;
static constexpr int threads = 416;
using delay_constructor = exponential_backon_constructor_t<956, 550>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
};

template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_4, value_size::_8>
{
// ipt_23.tpb_416.ns_772.dcid_5.l2w_710.trp_1.ld_0 1.089468 1.015581 1.085630 1.264583
static constexpr int items = 23;
static constexpr int threads = 416;
using delay_constructor = exponential_backon_jitter_window_constructor_t<772, 710>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
};

template <class ValueT, class AccumT, class OffsetT>
struct sm100_tuning<ValueT, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_8>
{
// ipt_22.tpb_320.ns_328.dcid_2.l2w_965.trp_1.ld_0 1.080133 1.000000 1.075577 1.248963
static constexpr int items = 22;
static constexpr int threads = 320;
using delay_constructor = exponential_backoff_constructor_t<328, 965>;
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
};
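
As an illustration of how these specializations are keyed (not part of the diff): a plus scan over 1-byte values with 4-byte offsets and a primitive accumulator resolves to the first specialization above, and the other combinations follow the same pattern. The check below assumes it is compiled inside `cub::detail::scan`:

```cpp
// Hedged resolution check: 1-byte values + 4-byte offsets + plus + primitive
// accumulator picks the (offset_size::_4, value_size::_1) tuning (18 items, 512 threads).
using u8_sum_tuning = sm100_tuning<unsigned char /* ValueT */,
                                   int           /* AccumT, any primitive type */,
                                   unsigned int  /* OffsetT */,
                                   op_type::plus>;
static_assert(u8_sum_tuning::items == 18, "");
static_assert(u8_sum_tuning::threads == 512, "");
```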

// todo(gonidelis): Add tunings for i128, float and double.
// template <class OffsetT> struct sm100_tuning<float, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8,
// accum_size::_4>;
// Default explicitly so it doesn't pick up the sm100<I64, I64> tuning.
template <class AccumT, class OffsetT>
struct sm100_tuning<double, AccumT, OffsetT, op_type::plus, primitive_accum::yes, offset_size::_8, value_size::_8>
: sm90_tuning<double, primitive_op::yes, primitive_accum::yes, accum_size::_8>
{};

#if CUB_IS_INT128_ENABLED
// template <class OffsetT> struct sm100_tuning<__int128_t, OffsetT, op_type::plus, primitive_accum::no,
// offset_size::_8, accum_size::_16> : tuning<576, 21, 860, 630> {}; template <class OffsetT> struct
// sm100_tuning<__uint128_t, OffsetT, op_type::plus, primitive_accum::no, offset_size::_8, accum_size::_16>
// : sm100_tuning<__int128_t, OffsetT, op_type::plus, primitive_accum::no, offset_size::_8, accum_size::_16>
// {};
#endif
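
The explicit `double` specialization above exists only to steer partial ordering: without it, an 8-byte value with an 8-byte offset would match the generic (value_size::_8, offset_size::_8) integer tuning. A hedged compile-time check of that routing, again assuming `cub::detail::scan` scope and `<cuda/std/type_traits>` for the trait:

```cpp
// The double specialization forwards to the SM90 tuning rather than the generic
// 8-byte SM100 tuning (illustrative check, not part of the diff).
static_assert(
  ::cuda::std::is_base_of_v<
    sm90_tuning<double, primitive_op::yes, primitive_accum::yes, accum_size::_8>,
    sm100_tuning<double, double, unsigned long long, op_type::plus>>,
  "");
```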

template <typename PolicyT, typename = void, typename = void>
struct ScanPolicyWrapper : PolicyT
{
@@ -263,7 +445,7 @@ CUB_RUNTIME_FUNCTION ScanPolicyWrapper<PolicyT> MakeScanPolicyWrapper(PolicyT po
return ScanPolicyWrapper<PolicyT>{policy};
}

template <typename AccumT, typename ScanOpT>
template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
struct policy_hub
{
// For large values, use timesliced loads/stores to fit shared memory.
@@ -327,13 +509,44 @@ struct policy_hub
using ScanPolicyT = decltype(select_agent_policy<sm90_tuning<AccumT, is_primitive_op<ScanOpT>()>>(0));
};

using MaxPolicy = Policy900;
struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900>
{
// Use values from tuning if a specialization exists that matches a benchmark, otherwise pick Policy900
template <typename Tuning,
typename IVT,
// In the tuning benchmarks the Initial-, Input- and OutputType are the same. Let's check that the
// accumulator type's size matches what we used during the benchmark since that has an impact (The
// tunings also check later that it's a primitive type, so arithmetic impact is also comparable to the
// benchmark). Input- and OutputType only impact loading and storing data (all arithmetic is done in the
// accumulator type), so let's check that they are the same size and dispatch the size in the tunings.
::cuda::std::enable_if_t<sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t<ScanOpT, IVT, IVT>)
&& sizeof(IVT) == sizeof(OutputValueT),
int> = 0>
static auto select_agent_policy100(int)
-> AgentScanPolicy<Tuning::threads,
Tuning::items,
AccumT,
Tuning::load_algorithm,
Tuning::load_modifier,
Tuning::store_algorithm,
BLOCK_SCAN_WARP_SCANS,
MemBoundScaling<Tuning::threads, Tuning::items, AccumT>,
typename Tuning::delay_constructor>;
template <typename Tuning, typename IVT>
static auto select_agent_policy100(long) -> typename Policy900::ScanPolicyT;

using ScanPolicyT =
decltype(select_agent_policy100<sm100_tuning<InputValueT, AccumT, OffsetT, classify_op<ScanOpT>()>, InputValueT>(
0));
};

using MaxPolicy = Policy1000;
};
} // namespace scan
} // namespace detail
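
The Policy1000 selection above relies on a small SFINAE trick: the `int` overload of `select_agent_policy100` is preferred, but if no `sm100_tuning` specialization exists (the primary template is only declared) or the `enable_if` constraint fails, substitution falls through to the `long` overload and Policy900's policy is reused. A minimal, self-contained sketch of that pattern with illustrative names (not CUB's), in plain standard C++:

```cpp
// Minimal sketch of the overload-ranking fallback used by Policy1000.
#include <type_traits>

struct previous_policy {}; // stands in for Policy900::ScanPolicyT

// Preferred overload: only viable when Tuning is a complete type with a usable
// `threads` member (i.e. a matching tuning specialization exists).
template <typename Tuning, std::enable_if_t<(Tuning::threads > 0), int> = 0>
auto select_policy(int) -> std::integral_constant<int, Tuning::threads>;

// Fallback overload: chosen when substitution into the preferred one fails.
template <typename Tuning>
auto select_policy(long) -> previous_policy;

struct tuned { static constexpr int threads = 512; };
struct untuned; // declared but never defined, like a missing tuning specialization

static_assert(std::is_same_v<decltype(select_policy<tuned>(0)),
                             std::integral_constant<int, 512>>, "");
static_assert(std::is_same_v<decltype(select_policy<untuned>(0)), previous_policy>, "");

int main() { return 0; }
```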

template <typename AccumT, typename ScanOpT = ::cuda::std::plus<>>
using DeviceScanPolicy CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and it will be "
"removed.") = detail::scan::policy_hub<AccumT, ScanOpT>;
"removed.") = detail::scan::policy_hub<int, int, AccumT, int, ScanOpT>;

CUB_NAMESPACE_END