diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu
index 87abdfef6ff..c3b9306398d 100644
--- a/cub/benchmarks/bench/transform/babelstream1.cu
+++ b/cub/benchmarks/bench/transform/babelstream1.cu
@@ -15,7 +15,7 @@
 #  endif
 #endif
 
-#include "babelstream.h"
+#include "common.h"
 
 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu
index c8fa017b788..61d4e905d92 100644
--- a/cub/benchmarks/bench/transform/babelstream2.cu
+++ b/cub/benchmarks/bench/transform/babelstream2.cu
@@ -15,7 +15,7 @@
 #  endif
 #endif
 
-#include "babelstream.h"
+#include "common.h"
 
 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu
index db541554210..a5c969764ae 100644
--- a/cub/benchmarks/bench/transform/babelstream3.cu
+++ b/cub/benchmarks/bench/transform/babelstream3.cu
@@ -15,7 +15,7 @@
 #  endif
 #endif
 
-#include "babelstream.h"
+#include "common.h"
 
 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/common.h
similarity index 100%
rename from cub/benchmarks/bench/transform/babelstream.h
rename to cub/benchmarks/bench/transform/common.h
diff --git a/cub/benchmarks/bench/transform/other.cu b/cub/benchmarks/bench/transform/other.cu
new file mode 100644
index 00000000000..94cc14a032c
--- /dev/null
+++ b/cub/benchmarks/bench/transform/other.cu
@@ -0,0 +1,195 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile the ublkcp algorithm below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "common.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+// This benchmark is compute intensive with diverging threads
+
+template <typename OutputT, typename IndexT>
+struct fib_t
+{
+  __device__ OutputT operator()(IndexT n)
+  {
+    OutputT t1 = 0;
+    OutputT t2 = 1;
+
+    if (n < 1)
+    {
+      return t1;
+    }
+    if (n == 1)
+    {
+      return t1;
+    }
+    if (n == 2)
+    {
+      return t2;
+    }
+    for (IndexT i = 3; i <= n; ++i)
+    {
+      const auto next = t1 + t2;
+      t1              = t2;
+      t2              = next;
+    }
+    return t2;
+  }
+};
+
+template <typename OffsetT>
+static void fibonacci(nvbench::state& state, nvbench::type_list<OffsetT>)
+{
+  using index_t  = int64_t;
+  using output_t = uint32_t;
+  const auto n   = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<index_t> in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42});
+  thrust::device_vector<output_t> out(n);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<index_t>(n);
+  state.add_global_memory_writes<output_t>(n);
+
+  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t<output_t, index_t>{});
+}
+
+// TODO(bgruber): hardcode OffsetT?
+NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types))
+  .set_name("fibonacci")
+  .set_type_axes_names({"OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+// This benchmark tests overlapping memory regions for reading and is compute intensive
+
+template <typename OffsetT>
+static void compare_complex(nvbench::state& state, nvbench::type_list<OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<complex> in = generate(n);
+  thrust::device_vector<bool> out(n - 1);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<complex>(n);
+  state.add_global_memory_writes<bool>(n);
+
+  // the complex comparison needs lots of compute and transform reads from overlapping input
+  using compare_op = less_t;
+  bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{});
+}
+
+// TODO(bgruber): hardcode OffsetT?
+NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types))
+  .set_name("compare_complex")
+  .set_type_axes_names({"OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+// This benchmark overwrites its inputs, has a uniform workload, and is compute intensive
+
+struct Transform
+{
+  float mat[3][4];
+
+  __device__ auto operator()(float3 v) const -> float3
+  {
+    float3 r;
+    r.x = mat[0][0] * v.x + mat[0][1] * v.y + mat[0][2] * v.z + mat[0][3];
+    r.y = mat[1][0] * v.x + mat[1][1] * v.y + mat[1][2] * v.z + mat[1][3];
+    r.z = mat[2][0] * v.x + mat[2][1] * v.y + mat[2][2] * v.z + mat[2][3];
+    return r;
+  }
+};
+
+template <typename OffsetT>
+static void vertex_transform(nvbench::state& state, nvbench::type_list<OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<float3> data = generate(n);
+  const auto transform = Transform{{{1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}}};
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<float3>(n);
+  state.add_global_memory_writes<float3>(n);
+
+  bench_transform(state, ::cuda::std::tuple{data.begin()}, data.begin(), n, transform);
+}
+
+// TODO(bgruber): hardcode OffsetT?
+NVBENCH_BENCH_TYPES(vertex_transform, NVBENCH_TYPE_AXES(offset_types))
+  .set_name("vertex_transform")
+  .set_type_axes_names({"OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+// This benchmark uses a LOT of registers and is compute intensive. It was gifted by ahendriksen. It is very expensive
+// to compile.
+
+template <int N>
+struct heavy_functor
+{
+  // we need to use an unsigned type so overflow in arithmetic wraps around
+  _CCCL_HOST_DEVICE std::uint32_t operator()(std::uint32_t data) const
+  {
+    std::uint32_t reg[N];
+    reg[0] = data;
+    for (int i = 1; i < N; ++i)
+    {
+      reg[i] = reg[i - 1] * reg[i - 1] + 1;
+    }
+    for (int i = 0; i < N; ++i)
+    {
+      reg[i] = (reg[i] * reg[i]) % 19;
+    }
+    for (int i = 0; i < N; ++i)
+    {
+      reg[i] = reg[N - i - 1] * reg[i];
+    }
+    std::uint32_t x = 0;
+    for (int i = 0; i < N; ++i)
+    {
+      x += reg[i];
+    }
+    return x;
+  }
+};
+
+template <typename Heaviness>
+static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
+{
+  using value_t  = std::uint32_t;
+  using offset_t = int;
+  const auto n   = narrow<offset_t>(state.get_int64("Elements{io}"));
+  thrust::device_vector<value_t> in = generate(n);
+  thrust::device_vector<value_t> out(n);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<value_t>(n);
+  state.add_global_memory_writes<value_t>(n);
+
+  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
+}
+
+template <int N>
+using ic = ::cuda::std::integral_constant<int, N>;
+
+// TODO(bgruber): hardcode OffsetT?
+NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
+  .set_name("heavy")
+  .set_type_axes_names({"Heaviness{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
index 88b189cf964..9c16bee3033 100644
--- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
+++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -32,6 +32,19 @@
 NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");
 
+template <typename T, T I>
+struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
+{
+  static std::string input_string()
+  {
+    return std::to_string(I);
+  }
+  static std::string description()
+  {
+    return "integral_constant<" + type_strings<T>::description() + ", " + std::to_string(I) + ">";
+  }
+};
+
 namespace detail
 {
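Reviewer note: below is a minimal, host-side sketch (not part of the diff) of what the `nvbench::type_strings` specialization added above produces for the `ic<N>` axis values used by the `heavy` benchmark. It assumes `nvbench_helper.cuh` is on the include path, as for the existing benchmark translation units, and the exact `description()` text depends on nvbench's built-in type string for `int`.

// Sketch only: exercises the new nvbench::type_strings<integral_constant<int, N>>
// specialization. Compile with nvcc like the other benchmark TUs, since
// nvbench_helper.cuh pulls in Thrust/CUDA headers.
#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>

#include <iostream>

// same alias as in other.cu
template <int N>
using ic = ::cuda::std::integral_constant<int, N>;

int main()
{
  // input_string() is what nvbench shows on the "Heaviness{ct}" axis: the bare value.
  std::cout << nvbench::type_strings<ic<256>>::input_string() << '\n'; // prints "256"
  // description() spells out the full type, e.g. "integral_constant<int32_t, 256>";
  // the inner name comes from nvbench's type string for int.
  std::cout << nvbench::type_strings<ic<256>>::description() << '\n';
  return 0;
}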