@@ -89,6 +89,7 @@ _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int
 enum class Algorithm
 {
   fallback_for,
+  prefetch,
 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
   ublkcp,
 #endif // _CUB_HAS_TRANSFORM_UBLKCP
@@ -133,6 +134,90 @@ _CCCL_DEVICE void transform_kernel_impl(
   }
 }

+template <int BlockThreads>
+struct prefetch_policy_t
+{
+  static constexpr int block_threads = BlockThreads;
+  // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy
+  static constexpr int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling
+  static constexpr int min_items_per_thread      = 1;
+  static constexpr int max_items_per_thread      = 32;
+};
+
+// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects
+// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit
+// as we need to create less instructions to prefetch the same amount of data.
+template <typename T>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
+{
+  assert(__isGlobal(addr));
+  // TODO(bgruber): prefetch to L1 may be even better
+  asm volatile("prefetch.global.L2 [%0];" : : "l"(addr) : "memory");
+}
+
+// overload for any iterator that is not a pointer, do nothing
+template <typename It, ::cuda::std::__enable_if_t<!::cuda::std::is_pointer<It>::value, int> = 0>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(It)
+{}
+
+// this kernel guarantees stable addresses for the parameters of the user provided function
+template <typename PrefetchPolicy,
+          typename Offset,
+          typename F,
+          typename RandomAccessIteratorOut,
+          typename... RandomAccessIteratorIn>
+_CCCL_DEVICE void transform_kernel_impl(
+  ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>,
+  Offset num_items,
+  int num_elem_per_thread,
+  F f,
+  RandomAccessIteratorOut out,
+  RandomAccessIteratorIn... ins)
+{
+  constexpr int block_dim = PrefetchPolicy::block_threads;
+  const int tile_stride   = block_dim * num_elem_per_thread;
+  const Offset offset     = static_cast<Offset>(blockIdx.x) * tile_stride;
+  const int tile_size     = static_cast<int>(::cuda::std::min(num_items - offset, Offset{tile_stride}));
+
+  // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
+  {
+    int dummy[] = {(ins += offset, 0)..., 0};
+    (void) &dummy;
+    out += offset;
+  }
+
+  for (int j = 0; j < num_elem_per_thread; ++j)
+  {
+    const int idx = j * block_dim + threadIdx.x;
+    // TODO(bgruber): replace by fold over comma in C++17
+    int dummy[] = {(prefetch(ins + idx), 0)..., 0}; // extra zero to handle empty packs
+    (void) &dummy; // nvcc 11.1 needs extra strong unused warning suppression
+  }
+
+#define PREFETCH_AGENT(full_tile) \
+  /* ahendriksen: various unrolling yields less than 1% gains at much higher compile-time cost */ \
+  /* TODO(bgruber): A6000 disagrees */ \
+  _Pragma("unroll 1") for (int j = 0; j < num_elem_per_thread; ++j) \
+  { \
+    const int idx = j * block_dim + threadIdx.x; \
+    if (full_tile || idx < tile_size) \
+    { \
+      /* we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) */ \
+      out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...); \
+    } \
+  }
+
+  if (tile_stride == tile_size)
+  {
+    PREFETCH_AGENT(true);
+  }
+  else
+  {
+    PREFETCH_AGENT(false);
+  }
+#undef PREFETCH_AGENT
+}
+
 template <int BlockThreads>
 struct async_copy_policy_t
 {
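For readers skimming the diff, the standalone sketch below shows the same prefetch-then-compute pattern outside of CUB. It is illustrative only: the helper name prefetch_l2, the kernel scale_with_prefetch, the float doubling, and the launch parameters are assumptions made for this example and are not part of the change, and unlike the CUB kernel above it bounds-checks the prefetch loop instead of relying on out-of-bounds prefetches being harmless.

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Issue an L2 prefetch hint for a global-memory address; no data is moved into registers.
__device__ __forceinline__ void prefetch_l2(const void* addr)
{
  assert(__isGlobal(addr)); // prefetch.global requires a global-space address
  asm volatile("prefetch.global.L2 [%0];" : : "l"(addr) : "memory");
}

// First pass: prefetch every item this thread will touch. Second pass: do the element-wise work.
__global__ void scale_with_prefetch(const float* in, float* out, int n, int items_per_thread)
{
  const int tile_stride = blockDim.x * items_per_thread;
  const int tile_begin  = blockIdx.x * tile_stride;

  for (int j = 0; j < items_per_thread; ++j)
  {
    const int idx = tile_begin + j * blockDim.x + threadIdx.x;
    if (idx < n)
    {
      prefetch_l2(in + idx);
    }
  }

  for (int j = 0; j < items_per_thread; ++j)
  {
    const int idx = tile_begin + j * blockDim.x + threadIdx.x;
    if (idx < n)
    {
      out[idx] = in[idx] * 2.0f;
    }
  }
}

int main()
{
  // Inputs are left uninitialized; this only exercises the launch, not the numerics.
  const int n = 1 << 20;
  float *in = nullptr, *out = nullptr;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));

  const int block_dim        = 256;
  const int items_per_thread = 4;
  const int tile_size        = block_dim * items_per_thread;
  const int grid_dim         = (n + tile_size - 1) / tile_size;
  scale_with_prefetch<<<grid_dim, block_dim>>>(in, out, n, items_per_thread);
  std::printf("launch status: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));

  cudaFree(in);
  cudaFree(out);
}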
@@ -543,8 +628,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
 {
   static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
   // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
-  static constexpr auto algorithm = Algorithm::fallback_for;
-  using algo_policy               = fallback_for_policy;
+  static constexpr auto algorithm = Algorithm::prefetch;
+  using algo_policy               = prefetch_policy_t<256>;
 };

 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
@@ -566,8 +651,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator

   static constexpr bool use_fallback =
     RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged;
-  static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp;
-  using algo_policy               = ::cuda::std::_If<use_fallback, fallback_for_policy, async_policy>;
+  static constexpr auto algorithm = use_fallback ? Algorithm::prefetch : Algorithm::ublkcp;
+  using algo_policy               = ::cuda::std::_If<use_fallback, prefetch_policy_t<256>, async_policy>;
 };

 using max_policy = policy900;
@@ -828,6 +913,38 @@ struct dispatch_t<RequiresStableAddress,
       make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
   }

+  template <typename ActivePolicy, std::size_t... Is>
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
+  invoke_algorithm(cuda::std::index_sequence<Is...>, ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>)
+  {
+    using policy_t          = typename ActivePolicy::algo_policy;
+    constexpr int block_dim = policy_t::block_threads;
+    int max_occupancy       = 0;
+    const auto error = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
+    if (error != cudaSuccess)
+    {
+      return error;
+    }
+
+    const int items_per_thread =
+      loaded_bytes_per_iter == 0
+        ? +policy_t::items_per_thread_no_input
+        : ::cuda::ceil_div(ActivePolicy::min_bif, max_occupancy * block_dim * loaded_bytes_per_iter);
+    const int items_per_thread_clamped =
+      ::cuda::std::clamp(items_per_thread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
+    const int tile_size = block_dim * items_per_thread_clamped;
+    const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{tile_size}));
+    return CubDebug(
+      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, 0, stream)
+        .doit(
+          CUB_DETAIL_TRANSFORM_KERNEL_PTR,
+          num_items,
+          items_per_thread_clamped,
+          op,
+          out,
+          make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
+  }
+
   template <typename ActivePolicy>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
   {
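The tile size chosen by invoke_algorithm is easiest to see with concrete numbers. The host-side sketch below redoes the same arithmetic with made-up values for the bytes-in-flight target, occupancy, and bytes loaded per iteration (none of these figures come from the PR); it only illustrates how the min_bif target is spread over the threads of the resident blocks and then clamped by the prefetch policy's bounds.

#include <algorithm>
#include <cstdio>

int main()
{
  const int min_bif               = 65536; // assumed bytes-in-flight target for this illustration
  const int block_dim             = 256;   // prefetch_policy_t<256>::block_threads
  const int max_occupancy         = 8;     // assumed resident blocks per SM reported by MaxSmOccupancy
  const int loaded_bytes_per_iter = 8;     // e.g. two float inputs loaded per output element
  const int min_items_per_thread  = 1;     // prefetch_policy_t bounds
  const int max_items_per_thread  = 32;

  // same formula as invoke_algorithm, with ::cuda::ceil_div written out as a manual ceiling division
  const int denom            = max_occupancy * block_dim * loaded_bytes_per_iter;
  const int items_per_thread = (min_bif + denom - 1) / denom;
  const int clamped   = std::min(std::max(items_per_thread, min_items_per_thread), max_items_per_thread);
  const int tile_size = block_dim * clamped;

  std::printf("items_per_thread = %d, clamped = %d, tile size = %d elements\n",
              items_per_thread, clamped, tile_size); // 4, 4, 1024 with the assumed numbers
}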