Commit 2f05ef3

Add prefetching kernel as new fallback for cub::DeviceTransform (#2396)
Fixes: #2363
1 parent ae8f51d commit 2f05ef3
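
For context, a minimal usage sketch of the algorithm this commit extends. This is illustrative only: the operator, sizes, and raw-pointer inputs are made up, error handling is omitted, and the exact Transform overloads may differ between CCCL versions.

#include <cub/device/device_transform.cuh>
#include <thrust/device_vector.h>
#include <cuda/std/tuple>

// Hypothetical transform op, for illustration only.
struct sum_op
{
  __host__ __device__ float operator()(float a, float b) const
  {
    return a + b;
  }
};

int main()
{
  const int num_items = 1 << 20;
  thrust::device_vector<float> a(num_items, 1.0f);
  thrust::device_vector<float> b(num_items, 2.0f);
  thrust::device_vector<float> c(num_items);

  // Applies sum_op elementwise over the two inputs.
  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(a.data().get(), b.data().get()), c.data().get(), num_items, sum_op{});
  cudaDeviceSynchronize();
  return 0;
}

On devices without the bulk-copy path (ublkcp, SM90+), dispatch of this call now selects the prefetching kernel added in this commit instead of the plain fallback_for loop.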

File tree (6 files changed, +224 -35 lines changed)

  cub/benchmarks/bench/transform/babelstream.h
  cub/benchmarks/bench/transform/babelstream1.cu
  cub/benchmarks/bench/transform/babelstream2.cu
  cub/benchmarks/bench/transform/babelstream3.cu
  cub/cub/device/dispatch/dispatch_transform.cuh
  cub/test/catch2_test_device_transform.cu

cub/benchmarks/bench/transform/babelstream.h (+3 -1)

@@ -23,7 +23,9 @@ struct policy_hub_t
     using algo_policy =
       ::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::fallback_for,
                        cub::detail::transform::fallback_for_policy,
-                       cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>;
+                       ::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::prefetch,
+                                        cub::detail::transform::prefetch_policy_t<TUNE_THREADS>,
+                                        cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>>;
   };
 };
 #endif
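
The nested ::cuda::std::_If above is a type-level if/else-if chain over the Algorithm enum. A standalone sketch of the idiom follows; the policy structs here are stand-in declarations for illustration, not the real CUB types, and it assumes _If is reachable via <cuda/std/type_traits> as it is in the CUB sources.

#include <cuda/std/type_traits>

enum class Algorithm { fallback_for, prefetch, ublkcp };

// Stand-in policy types, for illustration only.
struct fallback_for_policy {};
template <int BlockThreads> struct prefetch_policy_t {};
template <int BlockThreads> struct async_copy_policy_t {};

// Resolves to exactly one policy type, mirroring the benchmark's policy_hub_t.
template <Algorithm Alg, int Threads>
using algo_policy =
  ::cuda::std::_If<Alg == Algorithm::fallback_for,
                   fallback_for_policy,
                   ::cuda::std::_If<Alg == Algorithm::prefetch,
                                    prefetch_policy_t<Threads>,
                                    async_copy_policy_t<Threads>>>;

static_assert(::cuda::std::is_same<algo_policy<Algorithm::prefetch, 256>, prefetch_policy_t<256>>::value,
              "prefetch selects the prefetch policy");

Each additional algorithm adds one more nesting level, which is exactly what this commit does for prefetch.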

cub/benchmarks/bench/transform/babelstream1.cu (+3 -3)

@@ -2,15 +2,15 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:1:1
+// %RANGE% TUNE_ALGORITHM alg 0:2:1
 
 // keep checks at the top so compilation of discarded variants fails really fast
 #if !TUNE_BASE
-#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#  if TUNE_ALGORITHM == 2 && (__CUDA_ARCH_LIST__) < 900
 #    error "Cannot compile algorithm 4 (ublkcp) below sm90"
 #  endif
 
-#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#  if TUNE_ALGORITHM == 2 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
 #    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
 #  endif
 #endif
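
The same two edits are repeated for babelstream2.cu and babelstream3.cu below. With prefetch inserted into the Algorithm enum between fallback_for and ublkcp, the tuning variable now spans three values (presumably 0 = fallback_for, 1 = prefetch, 2 = ublkcp), so the sm90 and _CUB_HAS_TRANSFORM_UBLKCP guards move from TUNE_ALGORITHM == 1 to TUNE_ALGORITHM == 2.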

cub/benchmarks/bench/transform/babelstream2.cu (+3 -3)

@@ -2,15 +2,15 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:1:1
+// %RANGE% TUNE_ALGORITHM alg 0:2:1
 
 // keep checks at the top so compilation of discarded variants fails really fast
 #if !TUNE_BASE
-#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#  if TUNE_ALGORITHM == 2 && (__CUDA_ARCH_LIST__) < 900
 #    error "Cannot compile algorithm 4 (ublkcp) below sm90"
 #  endif
 
-#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#  if TUNE_ALGORITHM == 2 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
 #    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
 #  endif
 #endif

cub/benchmarks/bench/transform/babelstream3.cu (+3 -3)

@@ -2,15 +2,15 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 // %RANGE% TUNE_THREADS tpb 128:1024:128
-// %RANGE% TUNE_ALGORITHM alg 0:1:1
+// %RANGE% TUNE_ALGORITHM alg 0:2:1
 
 // keep checks at the top so compilation of discarded variants fails really fast
 #if !TUNE_BASE
-#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#  if TUNE_ALGORITHM == 2 && (__CUDA_ARCH_LIST__) < 900
 #    error "Cannot compile algorithm 4 (ublkcp) below sm90"
 #  endif
 
-#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#  if TUNE_ALGORITHM == 2 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
 #    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
 #  endif
 #endif

cub/cub/device/dispatch/dispatch_transform.cuh (+207 -23)

@@ -89,6 +89,7 @@ _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int
 enum class Algorithm
 {
   fallback_for,
+  prefetch,
 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
   ublkcp,
 #endif // _CUB_HAS_TRANSFORM_UBLKCP
@@ -133,6 +134,116 @@ _CCCL_DEVICE void transform_kernel_impl(
   }
 }
 
+template <typename T>
+_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
+{
+#if _CCCL_STD_VER > 2011
+  _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
+#endif // _CCCL_STD_VER > 2011
+  return reinterpret_cast<const char*>(
+    reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
+}
+
+template <int BlockThreads>
+struct prefetch_policy_t
+{
+  static constexpr int block_threads = BlockThreads;
+  // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy
+  static constexpr int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling
+  static constexpr int min_items_per_thread = 1;
+  static constexpr int max_items_per_thread = 32;
+};
+
+// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects
+// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit
+// as we need to create less instructions to prefetch the same amount of data.
+template <typename T>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
+{
+  // TODO(bgruber): prefetch to L1 may be even better
+  asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory");
+}
+
+template <int BlockDim, typename T>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(const T* addr, int tile_size)
+{
+  constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to be
+                                            // architecture dependent?
+  const int tile_size_bytes = tile_size * sizeof(T);
+  // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
+#pragma unroll 1
+  for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
+       offset += BlockDim * prefetch_byte_stride)
+  {
+    prefetch(reinterpret_cast<const char*>(addr) + offset);
+  }
+}
+
+// TODO(miscco): we should probably constrain It to not be a contiguous iterator in C++17 (and change the overload
+// above to accept any contiguous iterator)
+// overload for any iterator that is not a pointer, do nothing
+template <int, typename It, ::cuda::std::__enable_if_t<!::cuda::std::is_pointer<It>::value, int> = 0>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It, int)
+{}
+
+// This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in
+// global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the
+// parameter yields a global memory address.
+template <typename PrefetchPolicy,
+          typename Offset,
+          typename F,
+          typename RandomAccessIteratorOut,
+          typename... RandomAccessIteratorIn>
+_CCCL_DEVICE void transform_kernel_impl(
+  ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>,
+  Offset num_items,
+  int num_elem_per_thread,
+  F f,
+  RandomAccessIteratorOut out,
+  RandomAccessIteratorIn... ins)
+{
+  constexpr int block_dim = PrefetchPolicy::block_threads;
+  const int tile_stride = block_dim * num_elem_per_thread;
+  const Offset offset = static_cast<Offset>(blockIdx.x) * tile_stride;
+  const int tile_size = static_cast<int>(::cuda::std::min(num_items - offset, Offset{tile_stride}));
+
+  // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
+  {
+    int dummy[] = {(ins += offset, 0)..., 0};
+    (void) &dummy;
+    out += offset;
+  }
+
+  {
+    // TODO(bgruber): replace by fold over comma in C++17
+    int dummy[] = {(prefetch_tile<block_dim>(ins, tile_size), 0)..., 0}; // extra zero to handle empty packs
+    (void) &dummy; // nvcc 11.1 needs extra strong unused warning suppression
+  }
+
+#define PREFETCH_AGENT(full_tile)                                                                                  \
+  /* ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost */                      \
+  /* bgruber: but A6000 and H100 show small gains without pragma */                                                \
+  /*_Pragma("unroll 1")*/ for (int j = 0; j < num_elem_per_thread; ++j)                                            \
+  {                                                                                                                \
+    const int idx = j * block_dim + threadIdx.x;                                                                   \
+    if (full_tile || idx < tile_size)                                                                              \
+    {                                                                                                              \
+      /* we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) */ \
+      out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...);                                          \
+    }                                                                                                              \
+  }
+
+  if (tile_stride == tile_size)
+  {
+    PREFETCH_AGENT(true);
+  }
+  else
+  {
+    PREFETCH_AGENT(false);
+  }
+#undef PREFETCH_AGENT
+}
+
 template <int BlockThreads>
 struct async_copy_policy_t
 {
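
For intuition on prefetch_tile above: each thread starts at byte offset threadIdx.x * 128 and strides by block_threads * 128, so one pass of a 256-thread block covers 32 KiB. With hypothetical numbers, say 256 threads, 8 items per thread, and 4-byte elements, the tile is only 8 KiB, so threads 64 and above exit the loop immediately and no thread iterates more than once, which matches the comment that unrolling the loop would only add computation and predicate handling.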
@@ -173,16 +284,6 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Inte
   return (x + mult - 1) & ~(mult - 1);
 }
 
-template <typename T>
-_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
-{
-#if _CCCL_STD_VER > 2011
-  _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
-#endif // _CCCL_STD_VER > 2011
-  return reinterpret_cast<const char*>(
-    reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
-}
-
 // Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding
 //
 // For performance considerations of memcpy_async:

@@ -543,8 +644,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
   {
     static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
     // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
-    static constexpr auto algorithm = Algorithm::fallback_for;
-    using algo_policy = fallback_for_policy;
+    static constexpr auto algorithm = Algorithm::prefetch;
+    using algo_policy = prefetch_policy_t<256>;
   };
 
 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
@@ -566,8 +667,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
 
     static constexpr bool use_fallback =
      RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged;
-    static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp;
-    using algo_policy = ::cuda::std::_If<use_fallback, fallback_for_policy, async_policy>;
+    static constexpr auto algorithm = use_fallback ? Algorithm::prefetch : Algorithm::ublkcp;
+    using algo_policy = ::cuda::std::_If<use_fallback, prefetch_policy_t<256>, async_policy>;
   };
 
   using max_policy = policy900;
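
Note that use_fallback still covers RequiresStableAddress, and the prefetch kernel is a valid replacement there: as its comment above states, arguments passed to f reside in global memory and no intermediate copies are taken, so the stable-address guarantee of the old fallback_for path is preserved while still gaining the prefetching.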
@@ -647,13 +748,38 @@ _CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
   return max_smem;
 }
 
+_CCCL_HOST_DEVICE inline PoorExpected<int> get_sm_count()
+{
+  int device = 0;
+  auto error = CubDebug(cudaGetDevice(&device));
+  if (error != cudaSuccess)
+  {
+    return error;
+  }
+
+  int sm_count = 0;
+  error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));
+  if (error != cudaSuccess)
+  {
+    return error;
+  }
+
+  return sm_count;
+}
+
 struct elem_counts
 {
   int elem_per_thread;
   int tile_size;
   int smem_size;
 };
 
+struct prefetch_config
+{
+  int max_occupancy;
+  int sm_count;
+};
+
 template <bool RequiresStableAddress,
           typename Offset,
           typename RandomAccessIteratorTupleIn,

@@ -758,15 +884,11 @@ struct dispatch_t<RequiresStableAddress,
       return last_counts;
     };
     PoorExpected<elem_counts> config = [&]() {
-      NV_IF_TARGET(
-        NV_IS_HOST,
-        (
-          // this static variable exists for each template instantiation of the surrounding function and class, on which
-          // the chosen element count solely depends (assuming max SMEM is constant during a program execution)
-          static auto cached_config = determine_element_counts(); return cached_config;),
-        (
-          // we cannot cache the determined element count in device code
-          return determine_element_counts();));
+      NV_IF_TARGET(NV_IS_HOST,
+                   (static auto cached_config = determine_element_counts(); return cached_config;),
+                   (
+                     // we cannot cache the determined element count in device code
+                     return determine_element_counts();));
     }();
     if (!config)
     {
@@ -828,6 +950,68 @@ struct dispatch_t<RequiresStableAddress,
           make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
   }
 
+  template <typename ActivePolicy, std::size_t... Is>
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
+  invoke_algorithm(cuda::std::index_sequence<Is...>, ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>)
+  {
+    using policy_t = typename ActivePolicy::algo_policy;
+    constexpr int block_dim = policy_t::block_threads;
+
+    auto determine_config = [&]() -> PoorExpected<prefetch_config> {
+      int max_occupancy = 0;
+      const auto error = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
+      if (error != cudaSuccess)
+      {
+        return error;
+      }
+      const auto sm_count = get_sm_count();
+      if (!sm_count)
+      {
+        return sm_count.error;
+      }
+      return prefetch_config{max_occupancy, *sm_count};
+    };
+
+    PoorExpected<prefetch_config> config = [&]() {
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        (
+          // this static variable exists for each template instantiation of the surrounding function and class, on which
+          // the chosen element count solely depends (assuming max SMEM is constant during a program execution)
+          static auto cached_config = determine_config(); return cached_config;),
+        (
+          // we cannot cache the determined element count in device code
+          return determine_config();));
+    }();
+    if (!config)
+    {
+      return config.error;
+    }
+
+    const int items_per_thread =
+      loaded_bytes_per_iter == 0
+        ? +policy_t::items_per_thread_no_input
+        : ::cuda::ceil_div(ActivePolicy::min_bif, config->max_occupancy * block_dim * loaded_bytes_per_iter);
+
+    // Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements).
+    const int items_per_thread_evenly_spread =
+      static_cast<int>(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim)));
+
+    const int items_per_thread_clamped = ::cuda::std::clamp(
+      items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
+    const int tile_size = block_dim * items_per_thread_clamped;
+    const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{tile_size}));
+    return CubDebug(
+      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, 0, stream)
+        .doit(
+          CUB_DETAIL_TRANSFORM_KERNEL_PTR,
+          num_items,
+          items_per_thread_clamped,
+          op,
+          out,
+          make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
+  }
+
   template <typename ActivePolicy>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
   {
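
To see how the launch heuristic above plays out, plug in purely hypothetical numbers (assumptions, not CUB's real tuning values): with min_bif = 64 KiB, max_occupancy = 8 blocks/SM, block_dim = 256, and 8 loaded bytes per iteration (e.g. two float inputs), items_per_thread = ceil(65536 / (8 * 256 * 8)) = 4. For a small problem of 2^16 items on a hypothetical 108-SM GPU, num_items / (sm_count * block_dim) = 65536 / 27648 = 2, so the evenly-spread value wins: 2 items per thread after clamping to [1, 32], tile_size = 512, and grid_dim = ceil(65536 / 512) = 128 blocks, i.e. at least one block per SM instead of only 64 larger tiles.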

cub/test/catch2_test_device_transform.cu (+5 -2)

@@ -34,7 +34,9 @@ struct policy_hub_for_alg
     using algo_policy =
       ::cuda::std::_If<Alg == Algorithm::fallback_for,
                        cub::detail::transform::fallback_for_policy,
-                       cub::detail::transform::async_copy_policy_t<256>>;
+                       ::cuda::std::_If<Alg == Algorithm::prefetch,
+                                        cub::detail::transform::prefetch_policy_t<256>,
+                                        cub::detail::transform::async_copy_policy_t<256>>>;
   };
 };
 
@@ -77,7 +79,8 @@ DECLARE_TMPL_LAUNCH_WRAPPER(transform_many_with_alg_entry_point,
 
 using algorithms =
   c2h::enum_type_list<Algorithm,
-                      Algorithm::fallback_for
+                      Algorithm::fallback_for,
+                      Algorithm::prefetch
 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
                       ,
                       Algorithm::ublkcp
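
With Algorithm::prefetch added to this c2h::enum_type_list, the existing DeviceTransform test cases parameterized over the list now also exercise the new prefetching kernel, alongside fallback_for and, where available, ublkcp.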
