Skip to content

Commit 1b337c0

Browse files
Add prefetch transform kernel
Fixes: #2363
1 parent 51339ce commit 1b337c0

File tree

6 files changed

+136
-15
lines changed

6 files changed

+136
-15
lines changed

cub/benchmarks/bench/transform/babelstream.h

+3-1
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,9 @@ struct policy_hub_t
2323
using algo_policy =
2424
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::fallback_for,
2525
cub::detail::transform::fallback_for_policy,
26-
cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>;
26+
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::prefetch,
27+
cub::detail::transform::prefetch_policy_t<TUNE_THREADS>,
28+
cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>>;
2729
};
2830
};
2931
#endif

cub/benchmarks/bench/transform/babelstream1.cu

+3-3
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,15 @@
22
// SPDX-License-Identifier: BSD-3-Clause
33

44
// %RANGE% TUNE_THREADS tpb 128:1024:128
5-
// %RANGE% TUNE_ALGORITHM alg 0:1:1
5+
// %RANGE% TUNE_ALGORITHM alg 0:2:1
66

77
// keep checks at the top so compilation of discarded variants fails really fast
88
#if !TUNE_BASE
9-
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
9+
# if TUNE_ALGORITHM == 2 && (__CUDA_ARCH_LIST__) < 900
1010
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
1111
# endif
1212

13-
# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
13+
# if TUNE_ALGORITHM == 2 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
1414
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
1515
# endif
1616
#endif

cub/benchmarks/bench/transform/babelstream2.cu

+3-3
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,15 @@
22
// SPDX-License-Identifier: BSD-3-Clause
33

44
// %RANGE% TUNE_THREADS tpb 128:1024:128
5-
// %RANGE% TUNE_ALGORITHM alg 0:1:1
5+
// %RANGE% TUNE_ALGORITHM alg 0:2:1
66

77
// keep checks at the top so compilation of discarded variants fails really fast
88
#if !TUNE_BASE
9-
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
9+
# if TUNE_ALGORITHM == 2 && (__CUDA_ARCH_LIST__) < 900
1010
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
1111
# endif
1212

13-
# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
13+
# if TUNE_ALGORITHM == 2 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
1414
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
1515
# endif
1616
#endif

cub/benchmarks/bench/transform/babelstream3.cu

+3-3
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,15 @@
22
// SPDX-License-Identifier: BSD-3-Clause
33

44
// %RANGE% TUNE_THREADS tpb 128:1024:128
5-
// %RANGE% TUNE_ALGORITHM alg 0:1:1
5+
// %RANGE% TUNE_ALGORITHM alg 0:2:1
66

77
// keep checks at the top so compilation of discarded variants fails really fast
88
#if !TUNE_BASE
9-
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
9+
# if TUNE_ALGORITHM == 2 && (__CUDA_ARCH_LIST__) < 900
1010
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
1111
# endif
1212

13-
# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
13+
# if TUNE_ALGORITHM == 2 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
1414
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
1515
# endif
1616
#endif

cub/cub/device/dispatch/dispatch_transform.cuh

+121-4
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,7 @@ _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int
8989
enum class Algorithm
9090
{
9191
fallback_for,
92+
prefetch,
9293
#ifdef _CUB_HAS_TRANSFORM_UBLKCP
9394
ublkcp,
9495
#endif // _CUB_HAS_TRANSFORM_UBLKCP
@@ -133,6 +134,90 @@ _CCCL_DEVICE void transform_kernel_impl(
133134
}
134135
}
135136

137+
// Tuning policy for the prefetch transform algorithm.
// BlockThreads is the number of threads per block; it is re-exported as block_threads and read by both the kernel
// implementation and the dispatch code that launches it.
template <int BlockThreads>
138+
struct prefetch_policy_t
139+
{
140+
// threads per block used for the kernel launch and for index computation inside the kernel
static constexpr int block_threads = BlockThreads;
141+
// items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy
142+
static constexpr int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling
143+
// lower bound applied when the dispatch code clamps the runtime-computed items per thread
static constexpr int min_items_per_thread = 1;
144+
// upper bound applied when the dispatch code clamps the runtime-computed items per thread
static constexpr int max_items_per_thread = 32;
145+
};
146+
147+
// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects
148+
// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit
149+
// as we need to emit fewer instructions to prefetch the same amount of data.
150+
// Pointer overload: issues a single prefetch hint for the address `addr`; nothing is returned and no data is read
// into registers.
template <typename T>
151+
_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
152+
{
153+
// debug-only check that addr refers to global memory, matching the prefetch.global instruction below
assert(__isGlobal(addr));
154+
// TODO(bgruber): prefetch to L1 may be even better
155+
// PTX prefetch into L2 for the given 64-bit address operand ("l" constraint)
asm volatile("prefetch.global.L2 [%0];" : : "l"(addr) : "memory");
156+
}
157+
158+
// overload for any iterator that is not a pointer, do nothing
// Selected through the enable_if condition whenever It is not a pointer type. The argument is unnamed and unused and
// the body is empty, so calls with such iterators do not issue any prefetch.
159+
template <typename It, ::cuda::std::__enable_if_t<!::cuda::std::is_pointer<It>::value, int> = 0>
160+
_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(It)
161+
{}
162+
163+
// this kernel guarantees stable addresses for the parameters of the user provided function
//
// Prefetch variant of the transform kernel body, selected by the integral_constant<Algorithm, Algorithm::prefetch> tag.
// Each block handles one tile of up to block_threads * num_elem_per_thread consecutive items: it first issues
// prefetches for its inputs and then writes out[i] = f(ins[i]...) for every item of the tile.
//
// Parameters:
//   num_items           total number of items to transform
//   num_elem_per_thread number of items each thread processes (chosen at runtime by the caller)
//   f                   user-provided function applied to one element of every input
//   out                 output iterator, indexed with operator[]
//   ins                 input iterators (may be an empty pack)
163+
164+
template <typename PrefetchPolicy,
165+
typename Offset,
166+
typename F,
167+
typename RandomAccessIteratorOut,
168+
typename... RandomAccessIteratorIn>
169+
_CCCL_DEVICE void transform_kernel_impl(
170+
::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>,
171+
Offset num_items,
172+
int num_elem_per_thread,
173+
F f,
174+
RandomAccessIteratorOut out,
175+
RandomAccessIteratorIn... ins)
176+
{
177+
constexpr int block_dim = PrefetchPolicy::block_threads;
178+
// number of items a block processes when its tile is completely filled
const int tile_stride = block_dim * num_elem_per_thread;
179+
// index of the first item of this block's tile; computed in Offset so the product is not limited to int range
const Offset offset = static_cast<Offset>(blockIdx.x) * tile_stride;
180+
// number of items actually in this tile: tile_stride, or fewer when less than that remains after offset
const int tile_size = static_cast<int>(::cuda::std::min(num_items - offset, Offset{tile_stride}));
181+
182+
// move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
183+
{
184+
// pack expansion inside an array initializer advances every input iterator by offset
int dummy[] = {(ins += offset, 0)..., 0};
185+
(void) &dummy;
186+
out += offset;
187+
}
188+
189+
// first pass: every thread issues prefetches for the input elements it processes in the second pass.
// idx is not compared against tile_size here, so a partially filled tile also prefetches past its last item.
for (int j = 0; j < num_elem_per_thread; ++j)
190+
{
191+
// consecutive threads map to consecutive elements; each iteration j advances by one block width
const int idx = j * block_dim + threadIdx.x;
192+
// TODO(bgruber): replace by fold over comma in C++17
193+
int dummy[] = {(prefetch(ins + idx), 0)..., 0}; // extra zero to handle empty packs
194+
(void) &dummy; // nvcc 11.1 needs extra strong unused warning suppression
195+
}
196+
197+
// second pass: PREFETCH_AGENT expands to the loop that applies f and stores the results. With full_tile == true the
// condition `full_tile || idx < tile_size` is always true; with false the per-element bounds check is applied.
#define PREFETCH_AGENT(full_tile) \
198+
/* ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost */ \
199+
/* TODO(bgruber): A6000 disagrees */ \
200+
_Pragma("unroll 1") for (int j = 0; j < num_elem_per_thread; ++j) \
201+
{ \
202+
const int idx = j * block_dim + threadIdx.x; \
203+
if (full_tile || idx < tile_size) \
204+
{ \
205+
/* we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) */ \
206+
out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...); \
207+
} \
208+
}
209+
210+
// a completely filled tile uses the expansion whose bounds condition is constant true
if (tile_stride == tile_size)
211+
{
212+
PREFETCH_AGENT(true);
213+
}
214+
else
215+
{
216+
PREFETCH_AGENT(false);
217+
}
218+
#undef PREFETCH_AGENT
219+
}
220+
136221
template <int BlockThreads>
137222
struct async_copy_policy_t
138223
{
@@ -543,8 +628,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
543628
{
544629
static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
545630
// TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
546-
static constexpr auto algorithm = Algorithm::fallback_for;
547-
using algo_policy = fallback_for_policy;
631+
static constexpr auto algorithm = Algorithm::prefetch;
632+
using algo_policy = prefetch_policy_t<256>;
548633
};
549634

550635
#ifdef _CUB_HAS_TRANSFORM_UBLKCP
@@ -566,8 +651,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
566651

567652
static constexpr bool use_fallback =
568653
RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged;
569-
static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp;
570-
using algo_policy = ::cuda::std::_If<use_fallback, fallback_for_policy, async_policy>;
654+
static constexpr auto algorithm = use_fallback ? Algorithm::prefetch : Algorithm::ublkcp;
655+
using algo_policy = ::cuda::std::_If<use_fallback, prefetch_policy_t<256>, async_policy>;
571656
};
572657

573658
using max_policy = policy900;
@@ -828,6 +913,38 @@ struct dispatch_t<RequiresStableAddress,
828913
make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
829914
}
830915

916+
// Launches the transform kernel for the prefetch algorithm (overload selected by the Algorithm::prefetch tag).
// Queries the occupancy of the kernel, derives the number of items per thread from it, clamps that value to the
// bounds of the active policy and launches one block per tile.
// Returns the error from the occupancy query if it fails, otherwise the result of the kernel launch.
// NOTE(review): loaded_bytes_per_iter, num_items, stream, op, out and in are not declared in this function;
// presumably they are members of the enclosing dispatch_t -- confirm against the full file.
template <typename ActivePolicy, std::size_t... Is>
917+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
918+
invoke_algorithm(cuda::std::index_sequence<Is...>, ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>)
919+
{
920+
using policy_t = typename ActivePolicy::algo_policy;
921+
constexpr int block_dim = policy_t::block_threads;
922+
int max_occupancy = 0;
923+
// occupancy is queried for block_dim threads and 0 bytes of dynamic shared memory
const auto error = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
924+
if (error != cudaSuccess)
925+
{
926+
return error;
927+
}
928+
929+
// Without input bytes the policy's fixed items_per_thread_no_input is used. Otherwise pick the smallest count for
// which max_occupancy * block_dim * items_per_thread * loaded_bytes_per_iter reaches ActivePolicy::min_bif.
// NOTE(review): if the occupancy query can succeed with max_occupancy == 0, the divisor below is zero -- confirm.
// The unary + on the policy constants turns them into prvalues, presumably to avoid odr-using the static members.
const int items_per_thread =
930+
loaded_bytes_per_iter == 0
931+
? +policy_t::items_per_thread_no_input
932+
: ::cuda::ceil_div(ActivePolicy::min_bif, max_occupancy * block_dim * loaded_bytes_per_iter);
933+
// keep the runtime value inside the inclusive [min_items_per_thread, max_items_per_thread] range of the policy
const int items_per_thread_clamped =
934+
::cuda::std::clamp(items_per_thread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
935+
const int tile_size = block_dim * items_per_thread_clamped;
936+
// one block per tile, rounding up so a partially filled last tile still gets a block
const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{tile_size}));
937+
// launch with grid_dim blocks of block_dim threads, 0 bytes of dynamic shared memory, on `stream`
return CubDebug(
938+
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, 0, stream)
939+
.doit(
940+
CUB_DETAIL_TRANSFORM_KERNEL_PTR,
941+
num_items,
942+
items_per_thread_clamped,
943+
op,
944+
out,
945+
make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
946+
}
947+
831948
template <typename ActivePolicy>
832949
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
833950
{

cub/test/catch2_test_device_transform.cu

+3-1
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,9 @@ struct policy_hub_for_alg
3434
using algo_policy =
3535
::cuda::std::_If<Alg == Algorithm::fallback_for,
3636
cub::detail::transform::fallback_for_policy,
37-
cub::detail::transform::async_copy_policy_t<256>>;
37+
::cuda::std::_If<Alg == Algorithm::prefetch,
38+
cub::detail::transform::prefetch_policy_t<256>,
39+
cub::detail::transform::async_copy_policy_t<256>>>;
3840
};
3941
};
4042

0 commit comments

Comments
 (0)