@@ -89,6 +89,7 @@ _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int
enum class Algorithm
{
  fallback_for,
+  prefetch,
#ifdef _CUB_HAS_TRANSFORM_UBLKCP
  ublkcp,
#endif // _CUB_HAS_TRANSFORM_UBLKCP
@@ -133,6 +134,116 @@ _CCCL_DEVICE void transform_kernel_impl(
  }
}

+template <typename T>
+_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
+{
+#if _CCCL_STD_VER > 2011
+  _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
+#endif // _CCCL_STD_VER > 2011
+  return reinterpret_cast<const char*>(
+    reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
+}
+
+template <int BlockThreads>
+struct prefetch_policy_t
+{
+  static constexpr int block_threads = BlockThreads;
+  // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy
+  static constexpr int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling
+  static constexpr int min_items_per_thread      = 1;
+  static constexpr int max_items_per_thread      = 32;
+};
+
+// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects
+// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit
+// as we need to create less instructions to prefetch the same amount of data.
+template <typename T>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
+{
+  // TODO(bgruber): prefetch to L1 may be even better
+  asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory");
+}
+
+template <int BlockDim, typename T>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(const T* addr, int tile_size)
+{
+  constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to be
+                                            // architecture dependent?
+  const int tile_size_bytes = tile_size * sizeof(T);
+  // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
+#pragma unroll 1
+  for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
+       offset += BlockDim * prefetch_byte_stride)
+  {
+    prefetch(reinterpret_cast<const char*>(addr) + offset);
+  }
+}
+
+// TODO(miscco): we should probably constrain It to not be a contiguous iterator in C++17 (and change the overload
+// above to accept any contiguous iterator)
+// overload for any iterator that is not a pointer, do nothing
+template <int, typename It, ::cuda::std::__enable_if_t<!::cuda::std::is_pointer<It>::value, int> = 0>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It, int)
+{}
+
+// This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in
+// global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the
+// parameter yields a global memory address.
+template <typename PrefetchPolicy,
+          typename Offset,
+          typename F,
+          typename RandomAccessIteratorOut,
+          typename... RandomAccessIteratorIn>
+_CCCL_DEVICE void transform_kernel_impl(
+  ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>,
+  Offset num_items,
+  int num_elem_per_thread,
+  F f,
+  RandomAccessIteratorOut out,
+  RandomAccessIteratorIn... ins)
+{
+  constexpr int block_dim = PrefetchPolicy::block_threads;
+  const int tile_stride   = block_dim * num_elem_per_thread;
+  const Offset offset     = static_cast<Offset>(blockIdx.x) * tile_stride;
+  const int tile_size     = static_cast<int>(::cuda::std::min(num_items - offset, Offset{tile_stride}));
+
+  // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
+  {
+    int dummy[] = {(ins += offset, 0)..., 0};
+    (void) &dummy;
+    out += offset;
+  }
+
+  {
+    // TODO(bgruber): replace by fold over comma in C++17
+    int dummy[] = {(prefetch_tile<block_dim>(ins, tile_size), 0)..., 0}; // extra zero to handle empty packs
+    (void) &dummy; // nvcc 11.1 needs extra strong unused warning suppression
+  }
+
+#define PREFETCH_AGENT(full_tile)                                                                                  \
+  /* ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost */                      \
+  /* bgruber: but A6000 and H100 show small gains without pragma */                                                \
+  /*_Pragma("unroll 1")*/ for (int j = 0; j < num_elem_per_thread; ++j)                                            \
+  {                                                                                                                \
+    const int idx = j * block_dim + threadIdx.x;                                                                   \
+    if (full_tile || idx < tile_size)                                                                              \
+    {                                                                                                              \
+      /* we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) */ \
+      out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...);                                          \
+    }                                                                                                              \
+  }
+
+  if (tile_stride == tile_size)
+  {
+    PREFETCH_AGENT(true);
+  }
+  else
+  {
+    PREFETCH_AGENT(false);
+  }
+#undef PREFETCH_AGENT
+}
+
template <int BlockThreads>
struct async_copy_policy_t
{
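Aside (not part of the diff): the prefetch pattern above can be reproduced in a minimal standalone CUDA sketch. The names prefetch_l2 and scale_kernel below are hypothetical and only for illustration; the only parts taken from the change are the prefetch.global.L2 PTX instruction and the __cvta_generic_to_global intrinsic. Each thread hints one 128-byte cache line per loop iteration, block-strided across the tile, before the regular element loop issues the actual loads.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative wrapper around the same PTX as in the diff: ask the L2 cache to fetch
// the 128-byte line containing addr. Prefetching out-of-bounds addresses is harmless.
__device__ __forceinline__ void prefetch_l2(const void* addr)
{
  asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory");
}

// Hypothetical kernel mirroring the diff's structure: first prefetch the whole tile
// (one cache line per thread per iteration), then run the element-wise transform.
__global__ void scale_kernel(const float* in, float* out, int n, int items_per_thread)
{
  const int tile_stride = blockDim.x * items_per_thread;
  const int tile_begin  = blockIdx.x * tile_stride;
  const int remaining   = n - tile_begin;
  const int tile_size   = remaining < tile_stride ? remaining : tile_stride;

  for (int offset = threadIdx.x * 128; offset < tile_size * int(sizeof(float)); offset += blockDim.x * 128)
  {
    prefetch_l2(reinterpret_cast<const char*>(in + tile_begin) + offset);
  }

  for (int j = 0; j < items_per_thread; ++j)
  {
    const int idx = j * blockDim.x + threadIdx.x;
    if (idx < tile_size)
    {
      out[tile_begin + idx] = 2.0f * in[tile_begin + idx];
    }
  }
}

int main()
{
  const int n = 1 << 20;
  float *in, *out;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));
  cudaMemset(in, 0, n * sizeof(float));
  const int block_dim = 256, items_per_thread = 4;
  const int grid_dim  = (n + block_dim * items_per_thread - 1) / (block_dim * items_per_thread);
  scale_kernel<<<grid_dim, block_dim>>>(in, out, n, items_per_thread);
  cudaDeviceSynchronize();
  cudaFree(in);
  cudaFree(out);
  printf("done\n");
  return 0;
}

The upstream kernel additionally chooses items_per_thread at runtime, shifts the iterators to the tile start, and unwraps Thrust proxy references; the sketch omits all of that.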
@@ -173,16 +284,6 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Inte
  return (x + mult - 1) & ~(mult - 1);
}

-template <typename T>
-_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
-{
-#if _CCCL_STD_VER > 2011
-  _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
-#endif // _CCCL_STD_VER > 2011
-  return reinterpret_cast<const char*>(
-    reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
-}
-
// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding
//
// For performance considerations of memcpy_async:
@@ -543,8 +644,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator
  {
    static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
    // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
-    static constexpr auto algorithm = Algorithm::fallback_for;
-    using algo_policy               = fallback_for_policy;
+    static constexpr auto algorithm = Algorithm::prefetch;
+    using algo_policy               = prefetch_policy_t<256>;
  };

#ifdef _CUB_HAS_TRANSFORM_UBLKCP
@@ -566,8 +667,8 @@ struct policy_hub<RequiresStableAddress, ::cuda::std::tuple<RandomAccessIterator

    static constexpr bool use_fallback =
      RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged;
-    static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp;
-    using algo_policy               = ::cuda::std::_If<use_fallback, fallback_for_policy, async_policy>;
+    static constexpr auto algorithm = use_fallback ? Algorithm::prefetch : Algorithm::ublkcp;
+    using algo_policy               = ::cuda::std::_If<use_fallback, prefetch_policy_t<256>, async_policy>;
  };

  using max_policy = policy900;
@@ -647,13 +748,38 @@ _CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
  return max_smem;
}

+_CCCL_HOST_DEVICE inline PoorExpected<int> get_sm_count()
+{
+  int device = 0;
+  auto error = CubDebug(cudaGetDevice(&device));
+  if (error != cudaSuccess)
+  {
+    return error;
+  }
+
+  int sm_count = 0;
+  error        = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));
+  if (error != cudaSuccess)
+  {
+    return error;
+  }
+
+  return sm_count;
+}
+
struct elem_counts
{
  int elem_per_thread;
  int tile_size;
  int smem_size;
};

+struct prefetch_config
+{
+  int max_occupancy;
+  int sm_count;
+};
+
template <bool RequiresStableAddress,
          typename Offset,
          typename RandomAccessIteratorTupleIn,
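Aside (not part of the diff): the same device query can be exercised in a plain host program, with the PoorExpected<int> wrapper replaced by a cudaError_t out-parameter. The helper name sm_count_of_current_device is hypothetical; cudaGetDevice and cudaDeviceGetAttribute with cudaDevAttrMultiProcessorCount are the real runtime calls used above.

#include <cstdio>
#include <cuda_runtime.h>

// Host-only sketch of what get_sm_count() queries: the number of multiprocessors
// of the current device, with errors propagated as a plain cudaError_t.
cudaError_t sm_count_of_current_device(int& sm_count)
{
  int device = 0;
  cudaError_t error = cudaGetDevice(&device);
  if (error != cudaSuccess)
  {
    return error;
  }
  return cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
}

int main()
{
  int sms = 0;
  if (sm_count_of_current_device(sms) == cudaSuccess)
  {
    printf("multiprocessors: %d\n", sms);
  }
  return 0;
}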
@@ -758,15 +884,11 @@ struct dispatch_t<RequiresStableAddress,
      return last_counts;
    };
    PoorExpected<elem_counts> config = [&]() {
-      NV_IF_TARGET(
-        NV_IS_HOST,
-        (
-          // this static variable exists for each template instantiation of the surrounding function and class, on which
-          // the chosen element count solely depends (assuming max SMEM is constant during a program execution)
-          static auto cached_config = determine_element_counts(); return cached_config;),
-        (
-          // we cannot cache the determined element count in device code
-          return determine_element_counts();));
+      NV_IF_TARGET(NV_IS_HOST,
+                   (static auto cached_config = determine_element_counts(); return cached_config;),
+                   (
+                     // we cannot cache the determined element count in device code
+                     return determine_element_counts();));
    }();
    if (!config)
    {
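Aside (not part of the diff): the NV_IF_TARGET shape used here, memoizing the expensive query in a function-local static on the host while recomputing it in device code, can be shown in isolation. The names expensive_query and cached_query are hypothetical; the sketch assumes the <nv/target> header that ships with libcu++/the CUDA Toolkit and is compiled as CUDA code.

#include <cstdio>
#include <nv/target>

// stand-in for an expensive runtime query (e.g. an occupancy calculation)
__host__ __device__ inline int expensive_query()
{
  return 42;
}

// Host path: run the query once per instantiation and cache it in a function-local static.
// Device path: such caching is not available, so simply recompute.
__host__ __device__ inline int cached_query()
{
  NV_IF_TARGET(NV_IS_HOST,
               (static int cached = expensive_query(); return cached;),
               (return expensive_query();));
}

int main()
{
  printf("%d\n", cached_query());
  return 0;
}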
@@ -828,6 +950,68 @@ struct dispatch_t<RequiresStableAddress,
          make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
  }

+  template <typename ActivePolicy, std::size_t... Is>
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
+  invoke_algorithm(cuda::std::index_sequence<Is...>, ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>)
+  {
+    using policy_t          = typename ActivePolicy::algo_policy;
+    constexpr int block_dim = policy_t::block_threads;
+
+    auto determine_config = [&]() -> PoorExpected<prefetch_config> {
+      int max_occupancy = 0;
+      const auto error  = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
+      if (error != cudaSuccess)
+      {
+        return error;
+      }
+      const auto sm_count = get_sm_count();
+      if (!sm_count)
+      {
+        return sm_count.error;
+      }
+      return prefetch_config{max_occupancy, *sm_count};
+    };
+
+    PoorExpected<prefetch_config> config = [&]() {
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        (
+          // this static variable exists for each template instantiation of the surrounding function and class, on which
+          // the chosen element count solely depends (assuming max SMEM is constant during a program execution)
+          static auto cached_config = determine_config(); return cached_config;),
+        (
+          // we cannot cache the determined element count in device code
+          return determine_config();));
+    }();
+    if (!config)
+    {
+      return config.error;
+    }
+
+    const int items_per_thread =
+      loaded_bytes_per_iter == 0
+        ? +policy_t::items_per_thread_no_input
+        : ::cuda::ceil_div(ActivePolicy::min_bif, config->max_occupancy * block_dim * loaded_bytes_per_iter);
+
+    // Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements).
+    const int items_per_thread_evenly_spread =
+      static_cast<int>(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim)));
+
+    const int items_per_thread_clamped = ::cuda::std::clamp(
+      items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
+    const int tile_size = block_dim * items_per_thread_clamped;
+    const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{tile_size}));
+    return CubDebug(
+      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, 0, stream)
+        .doit(
+          CUB_DETAIL_TRANSFORM_KERNEL_PTR,
+          num_items,
+          items_per_thread_clamped,
+          op,
+          out,
+          make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get<Is>(in)))...));
+  }
+
  template <typename ActivePolicy>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
  {
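Aside (not part of the diff): the launch configuration chosen above reduces to a few integer formulas: pick enough items per thread to keep min_bif bytes in flight per SM, but never so many that fewer blocks than SMs are launched, then clamp to the policy bounds. The host-only sketch below walks through that arithmetic with hypothetical numbers (CUB's actual min_bif values and the occupancy returned by MaxSmOccupancy will differ).

#include <algorithm>
#include <cstdio>

// ceiling division, mirroring ::cuda::ceil_div for this sketch
int ceil_div(long long a, long long b)
{
  return static_cast<int>((a + b - 1) / b);
}

int main()
{
  const long long num_items       = 1 << 16;   // 65536 elements
  const int block_dim             = 256;
  const int loaded_bytes_per_iter = 8;         // e.g. two 4-byte inputs per output element
  const int min_bif               = 32 * 1024; // hypothetical "min bytes in flight" target
  const int max_occupancy         = 8;         // hypothetical resident blocks per SM
  const int sm_count              = 108;       // e.g. an A100

  // enough items per thread to keep min_bif bytes in flight per SM
  int items_per_thread = ceil_div(min_bif, max_occupancy * block_dim * loaded_bytes_per_iter); // -> 2

  // but not so many that we launch fewer blocks than there are SMs
  items_per_thread =
    static_cast<int>(std::min<long long>(items_per_thread, num_items / (sm_count * block_dim))); // -> 2
  items_per_thread = std::clamp(items_per_thread, 1, 32); // prefetch_policy_t bounds            // -> 2

  const int tile_size = block_dim * items_per_thread;   // 512
  const int grid_dim  = ceil_div(num_items, tile_size); // 128 blocks, i.e. more than 108 SMs
  printf("items_per_thread=%d tile_size=%d grid_dim=%d\n", items_per_thread, tile_size, grid_dim);
  return 0;
}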