Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extend CUB reduce benchmarks (NVIDIA#3401)
Browse files Browse the repository at this point in the history
* Rename max.cu to custom.cu, since it uses a custom operator
* Extend types covered by min.cu to all fundamental types
* Add some notes on how to collect tuning parameters

Fixes: NVIDIA#3283
bernhardmgruber authored and davebayer committed Jan 22, 2025
1 parent a4a5f7d commit afa4f38
Showing 5 changed files with 36 additions and 13 deletions.
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/base.cuh
Original file line number Diff line number Diff line change
@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
Original file line number Diff line number Diff line change
@@ -25,11 +25,18 @@
*
******************************************************************************/

// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific
// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path.

// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main
// use is to detect regressions.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

using op_t = max_t;
using value_types = all_types;
using op_t = max_t;
#include "base.cuh"
17 changes: 13 additions & 4 deletions cub/benchmarks/bench/reduce/min.cu
Original file line number Diff line number Diff line change
@@ -24,14 +24,23 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
// It specifically uses cuda::minimum<> instead of a user-defined operator.
#define TUNE_T int16_t

// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<>
// instead of a user-defined operator, which CUB recognizes to select an optimized code path.

// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<>
// Tuning parameters found for signed integer types apply equally for unsigned integer types
// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+?

#include <cuda/functional>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

using op_t = ::cuda::minimum<>;
// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them.
using value_types = fundamental_types;
using op_t = ::cuda::minimum<>;
#include "base.cuh"
9 changes: 8 additions & 1 deletion cub/benchmarks/bench/reduce/sum.cu
Original file line number Diff line number Diff line change
@@ -25,11 +25,18 @@
*
******************************************************************************/

// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses
// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path.

// Tuning parameters found for signed integer types apply equally for unsigned integer types

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

using op_t = ::cuda::std::plus<>;
// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them.
using value_types = all_types;
using op_t = ::cuda::std::plus<>;
#include "base.cuh"
12 changes: 6 additions & 6 deletions cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
Original file line number Diff line number Diff line change
@@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
namespace detail
{

template <class T, class List>
template <class List, class... Ts>
struct push_back
{};

template <class T, class... As>
struct push_back<T, nvbench::type_list<As...>>
template <class... As, class... Ts>
struct push_back<nvbench::type_list<As...>, Ts...>
{
using type = nvbench::type_list<As..., T>;
using type = nvbench::type_list<As..., Ts...>;
};

} // namespace detail

template <class T, class List>
using push_back_t = typename detail::push_back<T, List>::type;
template <class List, class... Ts>
using push_back_t = typename detail::push_back<List, Ts...>::type;

#ifdef TUNE_OffsetT
using offset_types = nvbench::type_list<TUNE_OffsetT>;

0 comments on commit afa4f38

Please sign in to comment.