Skip to content

Commit

Permalink
Replace CUB macros in tunings and benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Feb 25, 2025
1 parent 32c246b commit a7d093a
Show file tree
Hide file tree
Showing 8 changed files with 17 additions and 14 deletions.
5 changes: 3 additions & 2 deletions c/parallel/src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

+ #include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/cstdint>
#include <cuda/std/functional> // ::cuda::std::identity
#include <cuda/std/variant>
Expand Down Expand Up @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);

// Implement part of MemBoundScaling
- items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
- block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+ items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
+ block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);

return {block_size, items_per_thread, vector_load_length};
}
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
- CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+ _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
- CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+ _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
- CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+ _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
- CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+ _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/select/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ struct policy_hub_t
{
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

- static constexpr int ITEMS_PER_THREAD =
- CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+ static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
+ NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
9 changes: 5 additions & 4 deletions cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
#include <cub/util_device.cuh>
#include <cub/util_type.cuh>

+ #include <cuda/cmath>
+ #include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/__algorithm/max.h>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -315,9 +317,8 @@ struct policy_hub
static constexpr int items =
(max_input_bytes <= 8)
? 6
- // TODO(bgruber): use clamp() and ceil_div in C++14
- : CUB_MIN(nominal_4B_items_per_thread,
- CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes));
+ : ::cuda::std::clamp(
+ ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
using ReduceByKeyPolicyT =
AgentReduceByKeyPolicy<128,
items,
Expand Down Expand Up @@ -603,7 +604,7 @@ struct policy_hub
static constexpr int nominal_4B_items_per_thread = 15;
// TODO(bgruber): use clamp() in C++14
static constexpr int ITEMS_PER_THREAD =
- CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT))));
+ _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread);
using RleSweepPolicyT =
AgentRlePolicy<96,
ITEMS_PER_THREAD,
Expand Down
5 changes: 3 additions & 2 deletions cub/cub/device/dispatch/tuning/tuning_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

+ #include <cuda/std/__algorithm/clamp.h>
+
CUB_NAMESPACE_BEGIN

namespace detail
Expand Down Expand Up @@ -1498,9 +1500,8 @@ struct policy_hub
struct DefaultPolicy
{
static constexpr int nominal_4B_items_per_thread = 10;
- // TODO(bgruber): use cuda::std::clamp() in C++14
static constexpr int items_per_thread =
- CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT))));
+ ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread);
using SelectIfPolicyT =
AgentSelectIfPolicy<128,
items_per_thread,
Expand Down

0 comments on commit a7d093a

Please sign in to comment.