Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move remaining CUB policy hubs to tuning headers #3141

Merged
merged 5 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 2 additions & 51 deletions cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#endif // no system header

#include <cub/agent/agent_merge.cuh>
#include <cub/device/dispatch/tuning/tuning_merge.cuh>
#include <cub/util_device.cuh>
#include <cub/util_type.cuh>
#include <cub/util_vsmem.cuh>
Expand Down Expand Up @@ -155,56 +156,6 @@ __launch_bounds__(
vsmem_helper_t::discard_temp_storage(temp_storage);
}

template <typename KeyT, typename ValueT>
struct device_merge_policy_hub
{
static constexpr bool has_values = !::cuda::std::is_same<ValueT, NullType>::value;

using tune_type = char[has_values ? sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)];

struct policy300 : ChainedPolicy<300, policy300, policy300>
{
using merge_policy =
agent_policy_t<128,
Nominal4BItemsToItems<tune_type>(7),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct policy350 : ChainedPolicy<350, policy350, policy300>
{
using merge_policy =
agent_policy_t<256,
Nominal4BItemsToItems<tune_type>(11),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_LDG,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct policy520 : ChainedPolicy<520, policy520, policy350>
{
using merge_policy =
agent_policy_t<512,
Nominal4BItemsToItems<tune_type>(13),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_LDG,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct policy600 : ChainedPolicy<600, policy600, policy520>
{
using merge_policy =
agent_policy_t<512,
Nominal4BItemsToItems<tune_type>(15),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_STORE_WARP_TRANSPOSE>;
};

using max_policy = policy600;
};

template <typename KeyIt1,
typename ValueIt1,
typename KeyIt2,
Expand All @@ -213,7 +164,7 @@ template <typename KeyIt1,
typename ValueIt3,
typename Offset,
typename CompareOp,
typename PolicyHub = device_merge_policy_hub<value_t<KeyIt1>, value_t<ValueIt1>>>
typename PolicyHub = detail::merge::policy_hub<value_t<KeyIt1>, value_t<ValueIt1>>>
struct dispatch_t
{
void* d_temp_storage;
Expand Down
51 changes: 2 additions & 49 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#endif // no system header

#include <cub/agent/agent_merge_sort.cuh>
#include <cub/device/dispatch/tuning/tuning_merge_sort.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>
Expand Down Expand Up @@ -327,61 +328,13 @@ __launch_bounds__(
* Policy
******************************************************************************/

template <typename KeyIteratorT>
struct DeviceMergeSortPolicy
{
using KeyT = cub::detail::value_t<KeyIteratorT>;

//----------------------------------------------------------------------------
// Architecture-specific tuning policies
//----------------------------------------------------------------------------

struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
using MergeSortPolicy =
AgentMergeSortPolicy<256,
Nominal4BItemsToItems<KeyT>(11),
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_LDG,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};

// NVBug 3384810
#if defined(_NVHPC_CUDA)
using Policy520 = Policy350;
#else
struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
{
using MergeSortPolicy =
AgentMergeSortPolicy<512,
Nominal4BItemsToItems<KeyT>(15),
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_LDG,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};
#endif

struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
{
using MergeSortPolicy =
AgentMergeSortPolicy<256,
Nominal4BItemsToItems<KeyT>(17),
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_DEFAULT,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};

/// MaxPolicy
using MaxPolicy = Policy600;
};

template <typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyIteratorT,
typename ValueIteratorT,
typename OffsetT,
typename CompareOpT,
typename SelectedPolicy = DeviceMergeSortPolicy<KeyIteratorT>>
typename SelectedPolicy = detail::merge_sort::policy_hub<KeyIteratorT>>
struct DispatchMergeSort : SelectedPolicy
{
using KeyT = cub::detail::value_t<KeyIteratorT>;
Expand Down
131 changes: 4 additions & 127 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

#include <cub/agent/agent_reduce.cuh>
#include <cub/device/dispatch/kernels/reduce.cuh>
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/launcher/cuda_runtime.cuh>
Expand Down Expand Up @@ -191,130 +192,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
}
}

/******************************************************************************
* Policy
******************************************************************************/

template <typename PolicyT, typename = void>
struct ReducePolicyWrapper : PolicyT
{
CUB_RUNTIME_FUNCTION ReducePolicyWrapper(PolicyT base)
: PolicyT(base)
{}
};

template <typename StaticPolicyT>
struct ReducePolicyWrapper<StaticPolicyT,
_CUDA_VSTD::void_t<typename StaticPolicyT::ReducePolicy,
typename StaticPolicyT::SingleTilePolicy,
typename StaticPolicyT::SegmentedReducePolicy>> : StaticPolicyT
{
CUB_RUNTIME_FUNCTION ReducePolicyWrapper(StaticPolicyT base)
: StaticPolicyT(base)
{}

CUB_DEFINE_SUB_POLICY_GETTER(Reduce)
CUB_DEFINE_SUB_POLICY_GETTER(SingleTile)
CUB_DEFINE_SUB_POLICY_GETTER(SegmentedReduce)
};

template <typename PolicyT>
CUB_RUNTIME_FUNCTION ReducePolicyWrapper<PolicyT> MakeReducePolicyWrapper(PolicyT policy)
{
return ReducePolicyWrapper<PolicyT>{policy};
}

/**
* @tparam AccumT
* Accumulator data type
*
* OffsetT
* Signed integer type for global offsets
*
* ReductionOpT
* Binary reduction functor type having member
* `auto operator()(const T &a, const U &b)`
*/
template <typename AccumT, typename OffsetT, typename ReductionOpT>
struct DeviceReducePolicy
{
//---------------------------------------------------------------------------
// Architecture-specific tuning policies
//---------------------------------------------------------------------------

/// SM30
struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 20;
static constexpr int items_per_vec_load = 2;

// ReducePolicy (GTX670: 154.0 @ 48M 4B items)
using ReducePolicy =
AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
BLOCK_REDUCE_WARP_REDUCTIONS,
LOAD_DEFAULT>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

/// SM35
struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 20;
static constexpr int items_per_vec_load = 4;

// ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B
// items)
using ReducePolicy =
AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
BLOCK_REDUCE_WARP_REDUCTIONS,
LOAD_LDG>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

/// SM60
struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 16;
static constexpr int items_per_vec_load = 4;

// ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
using ReducePolicy =
AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
BLOCK_REDUCE_WARP_REDUCTIONS,
LOAD_LDG>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

using MaxPolicy = Policy600;
};

template <typename MaxPolicyT,
typename InputIteratorT,
typename OutputIteratorT,
Expand Down Expand Up @@ -385,7 +262,7 @@ template <typename InputIteratorT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
typename SelectedPolicy = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename TransformOpT = ::cuda::std::__identity,
typename KernelSource = DeviceReduceKernelSource<
typename SelectedPolicy::MaxPolicy,
Expand Down Expand Up @@ -867,7 +744,7 @@ template <
typename InitT,
typename AccumT = ::cuda::std::
__accumulator_t<ReductionOpT, cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>, InitT>,
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
typename SelectedPolicyT = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename KernelSource = DeviceReduceKernelSource<
typename SelectedPolicyT::MaxPolicy,
InputIteratorT,
Expand Down Expand Up @@ -930,7 +807,7 @@ template <typename InputIteratorT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
typename SelectedPolicy = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>>
struct DispatchSegmentedReduce : SelectedPolicy
{
//---------------------------------------------------------------------------
Expand Down
Loading
Loading