Skip to content

Commit

Permalink
Replace CUB macros in tunings and benchmarks (#3931)
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber authored Feb 26, 2025
1 parent 7fbbd24 commit a52e626
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 31 deletions.
5 changes: 3 additions & 2 deletions c/parallel/src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

#include <cuda/std/__algorithm_>
#include <cuda/std/cstdint>
#include <cuda/std/functional> // ::cuda::std::identity
#include <cuda/std/variant>
Expand Down Expand Up @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);

// Implement part of MemBoundScaling
items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);

return {block_size, items_per_thread, vector_load_length};
}
Expand Down
3 changes: 2 additions & 1 deletion cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>
#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -63,7 +64,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
3 changes: 2 additions & 1 deletion cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>
#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -63,7 +64,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/select/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -61,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/select/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>

#include <limits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -63,7 +65,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
6 changes: 4 additions & 2 deletions cub/benchmarks/bench/select/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <cub/device/device_select.cuh>

#include <cuda/std/__algorithm_>

#include <limits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -36,8 +38,8 @@ struct policy_hub_t
{
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
#include <cub/util_device.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm/max.h>
#include <cuda/cmath>
#include <cuda/std/__algorithm_>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -315,9 +316,8 @@ struct policy_hub
static constexpr int items =
(max_input_bytes <= 8)
? 6
// TODO(bgruber): use clamp() and ceil_div in C++14
: CUB_MIN(nominal_4B_items_per_thread,
CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes));
: ::cuda::std::clamp(
::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
using ReduceByKeyPolicyT =
AgentReduceByKeyPolicy<128,
items,
Expand Down Expand Up @@ -603,7 +603,7 @@ struct policy_hub
static constexpr int nominal_4B_items_per_thread = 15;
// TODO(bgruber): use clamp() in C++14
static constexpr int ITEMS_PER_THREAD =
CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT))));
_CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread);
using RleSweepPolicyT =
AgentRlePolicy<96,
ITEMS_PER_THREAD,
Expand Down
5 changes: 3 additions & 2 deletions cub/cub/device/dispatch/tuning/tuning_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm_>

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down Expand Up @@ -1498,9 +1500,8 @@ struct policy_hub
struct DefaultPolicy
{
static constexpr int nominal_4B_items_per_thread = 10;
// TODO(bgruber): use cuda::std::clamp() in C++14
static constexpr int items_per_thread =
CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT))));
::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread);
using SelectIfPolicyT =
AgentSelectIfPolicy<128,
items_per_thread,
Expand Down
32 changes: 16 additions & 16 deletions thrust/testing/zip_iterator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -332,25 +332,25 @@ struct TestZipIteratorTransform
device_vector<T> d_result(n);

// Tuples with 2 elements
transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
make_zip_iterator(h_data0.end(), h_data1.end()),
h_result.begin(),
SumTwoTuple());
transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
make_zip_iterator(d_data0.end(), d_data1.end()),
d_result.begin(),
SumTwoTuple());
thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
make_zip_iterator(h_data0.end(), h_data1.end()),
h_result.begin(),
SumTwoTuple());
thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
make_zip_iterator(d_data0.end(), d_data1.end()),
d_result.begin(),
SumTwoTuple());
ASSERT_EQUAL(h_result, d_result);

// Tuples with 3 elements
transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
h_result.begin(),
SumThreeTuple());
transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
d_result.begin(),
SumThreeTuple());
thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
h_result.begin(),
SumThreeTuple());
thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
d_result.begin(),
SumThreeTuple());
ASSERT_EQUAL(h_result, d_result);
}
};
Expand Down

0 comments on commit a52e626

Please sign in to comment.