Skip to content

Commit

Permalink
Move some CUB tunings to dedicated headers
Browse files Browse the repository at this point in the history
Move the policy hubs of
* adjacent_difference
* batch_memcpy
* dispatch_radix_sort
to dedicated headers.
  • Loading branch information
bernhardmgruber committed Dec 9, 2024
1 parent cdb714f commit 37d8764
Show file tree
Hide file tree
Showing 8 changed files with 1,117 additions and 951 deletions.
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/copy/memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ struct policy_hub_t
block_delay_constructor_t>;

using AgentLargeBufferPolicyT =
cub::detail::AgentBatchMemcpyLargeBuffersPolicy<TUNE_LARGE_THREADS, TUNE_LARGE_BUFFER_BYTES_PER_THREAD>;
cub::detail::batch_memcpy::agent_large_buffer_policy<TUNE_LARGE_THREADS, TUNE_LARGE_BUFFER_BYTES_PER_THREAD>;
};

using MaxPolicy = policy_t;
Expand Down Expand Up @@ -189,7 +189,7 @@ void copy(nvbench::state& state,
#if !TUNE_BASE
using policy_t = policy_hub_t;
#else
using policy_t = cub::detail::DeviceBatchMemcpyPolicy<buffer_offset_t, block_offset_t>;
using policy_t = cub::detail::batch_memcpy::policy_hub<buffer_offset_t, block_offset_t>;
#endif

using dispatch_t = cub::detail::DispatchBatchMemcpy<
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/device_copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ struct DeviceCopy
SizeIteratorT,
RangeOffsetT,
BlockOffsetT,
detail::DeviceBatchMemcpyPolicy<RangeOffsetT, BlockOffsetT>,
detail::batch_memcpy::policy_hub<RangeOffsetT, BlockOffsetT>,
false>::Dispatch(d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
}
};
Expand Down
33 changes: 1 addition & 32 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

#include <cub/agent/agent_adjacent_difference.cuh>
#include <cub/detail/type_traits.cuh>
#include <cub/device/dispatch/tuning/tuning_adjacent_difference.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
Expand Down Expand Up @@ -101,38 +102,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
agent.Process(tile_idx, tile_base);
}

template <typename InputIteratorT, bool MayAlias = true>
struct DeviceAdjacentDifferencePolicy
{
using ValueT = typename std::iterator_traits<InputIteratorT>::value_type;

//------------------------------------------------------------------------------
// Architecture-specific tuning policies
//------------------------------------------------------------------------------

struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
{
using AdjacentDifferencePolicy =
AgentAdjacentDifferencePolicy<128,
Nominal8BItemsToItems<ValueT>(7),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
{
using AdjacentDifferencePolicy =
AgentAdjacentDifferencePolicy<128,
Nominal8BItemsToItems<ValueT>(7),
BLOCK_LOAD_WARP_TRANSPOSE,
MayAlias ? LOAD_CA : LOAD_LDG,
BLOCK_STORE_WARP_TRANSPOSE>;
};

using MaxPolicy = Policy350;
};

template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT,
Expand Down
73 changes: 2 additions & 71 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include <cub/agent/agent_batch_memcpy.cuh>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/detail/temporary_storage.cuh>
#include <cub/device/dispatch/tuning/tuning_batch_memcpy.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_device.cuh>
Expand All @@ -61,19 +62,6 @@ CUB_NAMESPACE_BEGIN

namespace detail
{

/**
* Parameterizable tuning policy type for AgentBatchMemcpy
*/
template <uint32_t _BLOCK_THREADS, uint32_t _BYTES_PER_THREAD>
struct AgentBatchMemcpyLargeBuffersPolicy
{
/// Threads per thread block
static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS;
/// The number of bytes each thread copies
static constexpr uint32_t BYTES_PER_THREAD = _BYTES_PER_THREAD;
};

/**
* Initialization kernel for tile status initialization (multi-block)
*/
Expand Down Expand Up @@ -281,63 +269,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO
.ConsumeTile(blockIdx.x);
}

template <class BufferOffsetT, class BlockOffsetT>
struct DeviceBatchMemcpyPolicy
{
static constexpr uint32_t BLOCK_THREADS = 128U;
static constexpr uint32_t BUFFERS_PER_THREAD = 4U;
static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U;

static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS = 256U;
static constexpr uint32_t LARGE_BUFFER_BYTES_PER_THREAD = 32U;

static constexpr uint32_t WARP_LEVEL_THRESHOLD = 128;
static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = 8 * 1024;

using buff_delay_constructor_t = detail::default_delay_constructor_t<BufferOffsetT>;
using block_delay_constructor_t = detail::default_delay_constructor_t<BlockOffsetT>;

/// SM35
struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
static constexpr bool PREFER_POW2_BITS = true;
using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy<
BLOCK_THREADS,
BUFFERS_PER_THREAD,
TLEV_BYTES_PER_THREAD,
PREFER_POW2_BITS,
LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD,
WARP_LEVEL_THRESHOLD,
BLOCK_LEVEL_THRESHOLD,
buff_delay_constructor_t,
block_delay_constructor_t>;

using AgentLargeBufferPolicyT =
AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
};

/// SM70
struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
{
static constexpr bool PREFER_POW2_BITS = false;
using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy<
BLOCK_THREADS,
BUFFERS_PER_THREAD,
TLEV_BYTES_PER_THREAD,
PREFER_POW2_BITS,
LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD,
WARP_LEVEL_THRESHOLD,
BLOCK_LEVEL_THRESHOLD,
buff_delay_constructor_t,
block_delay_constructor_t>;

using AgentLargeBufferPolicyT =
AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
};

using MaxPolicy = Policy700;
};

/**
* @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers
* to the source memory buffers
Expand All @@ -354,7 +285,7 @@ template <typename InputBufferIt,
typename BufferSizeIteratorT,
typename BufferOffsetT,
typename BlockOffsetT,
typename SelectedPolicy = DeviceBatchMemcpyPolicy<BufferOffsetT, BlockOffsetT>,
typename SelectedPolicy = batch_memcpy::policy_hub<BufferOffsetT, BlockOffsetT>,
bool IsMemcpy = true>
struct DispatchBatchMemcpy : SelectedPolicy
{
Expand Down
Loading

0 comments on commit 37d8764

Please sign in to comment.