diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index 96d6ee31fb2..606a6151dc7 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -74,28 +74,24 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
   asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory");
 }
 
-template <int BlockDim, typename T>
-_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(const T* addr, int tile_size)
+template <int BlockDim, typename It>
+_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int tile_size)
 {
-  constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to be
-                                            // architecture dependent?
-  const int tile_size_bytes = tile_size * sizeof(T);
-  // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
-#pragma unroll 1
-  for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
-       offset += BlockDim * prefetch_byte_stride)
+  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>)
   {
-    prefetch(reinterpret_cast<const char*>(addr) + offset);
+    constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to
+                                              // be architecture dependent?
+    const int tile_size_bytes = tile_size * sizeof(value_t<It>);
+    // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
+#pragma unroll 1
+    for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
+         offset += BlockDim * prefetch_byte_stride)
+    {
+      prefetch(reinterpret_cast<const char*>(::cuda::std::to_address(begin)) + offset);
+    }
   }
 }
 
-// TODO(miscco): we should probably constrain It to not be a contiguous iterator in C++17 (and change the overload
-// above to accept any contiguous iterator)
-// overload for any iterator that is not a pointer, do nothing
-template <int BlockDim, typename It, ::cuda::std::enable_if_t<!::cuda::std::is_pointer_v<It>, int> = 0>
-_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It, int)
-{}
-
 // This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in
 // global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the
 // parameter yields a global memory address.
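// [Illustrative aside, not part of the patch] The hunk above folds an enable_if-constrained overload
// pair into a single function template: with C++17 `if constexpr`, the prefetch body is simply
// discarded for iterators without contiguous storage, so the separate do-nothing overload disappears.
// A minimal host-only sketch of the same shape, using std::is_pointer_v as a stand-in for
// THRUST_NS_QUALIFIER::is_contiguous_iterator_v (all names below are invented for the sketch):
#include <type_traits>
#include <vector>

template <typename It>
int prefetchable_bytes(It begin, int tile_size)
{
  if constexpr (std::is_pointer_v<It>) // stand-in trait for "may I touch raw addresses?"
  {
    using value_t = std::remove_pointer_t<It>;
    (void) begin; // a real implementation would issue prefetch instructions per cache line
    return tile_size * static_cast<int>(sizeof(value_t));
  }
  else
  {
    return 0; // non-contiguous iterators: nothing to prefetch, no second overload required
  }
}

int main()
{
  std::vector<float> v(32);
  return prefetchable_bytes(v.data(), 32) == 128 ? 0 : 1; // pointer path: 32 * sizeof(float)
}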
@@ -119,20 +115,13 @@ _CCCL_DEVICE void transform_kernel_impl(
   // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
   {
-    int dummy[] = {(ins += offset, 0)..., 0};
-    (void) &dummy;
+    (..., (ins += offset));
     out += offset;
   }
 
-  {
-    // TODO(bgruber): replace by fold over comma in C++17
-    // extra zero at the end handles empty packs
-    int dummy[] = {(prefetch_tile<block_dim>(THRUST_NS_QUALIFIER::raw_reference_cast(ins), tile_size), 0)..., 0};
-    (void) &dummy; // nvcc 11.1 needs extra strong unused warning suppression
-  }
+  (..., prefetch_tile<block_dim>(THRUST_NS_QUALIFIER::raw_reference_cast(ins), tile_size));
 
-  // TODO(bgruber): use `auto full_tile` and pass true_type/false_type in C++14 to strengthen the compile-time intent
-  auto process_tile = [&](bool full_tile) {
+  auto process_tile = [&](auto full_tile, auto... ins2 /* nvcc fails to compile when just using the captured ins */) {
     // ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost
     // bgruber: but A6000 and H100 show small gains without pragma
     //_Pragma("unroll 1")
@@ -142,41 +131,20 @@ _CCCL_DEVICE void transform_kernel_impl(
       if (full_tile || idx < tile_size)
       {
         // we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test)
-        out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins[idx])...);
+        out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins2[idx])...);
       }
     }
   };
 
-  // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
   if (tile_stride == tile_size)
   {
-    process_tile(true);
+    process_tile(::cuda::std::true_type{}, ins...);
   }
   else
   {
-    process_tile(false);
+    process_tile(::cuda::std::false_type{}, ins...);
  }
 }
 
-// TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17.
-template <typename F, typename Tuple, ::cuda::std::size_t... Is>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence<Is...>)
-  -> decltype(::cuda::std::forward<F>(f)(::cuda::std::get<Is>(::cuda::std::forward<Tuple>(t))...))
-{
-  return ::cuda::std::forward<F>(f)(::cuda::std::get<Is>(::cuda::std::forward<Tuple>(t))...);
-}
-
-template <typename F, typename Tuple>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl(
-  ::cuda::std::forward<F>(f),
-  ::cuda::std::forward<Tuple>(t),
-  ::cuda::std::make_index_sequence<::cuda::std::tuple_size_v<::cuda::std::remove_reference_t<Tuple>>>{}))
-{
-  return poor_apply_impl(
-    ::cuda::std::forward<F>(f),
-    ::cuda::std::forward<Tuple>(t),
-    ::cuda::std::make_index_sequence<::cuda::std::tuple_size_v<::cuda::std::remove_reference_t<Tuple>>>{});
-}
-
 // Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding
 //
 // For performance considerations of memcpy_async:
@@ -254,35 +222,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one()
   return threadIdx.x < 32 && static_cast<bool>(is_elected);
 }
 
-// TODO(bgruber): inline this as lambda in C++14
-template <typename Offset, typename T>
-_CCCL_DEVICE void bulk_copy_tile(
-  ::cuda::std::uint64_t& bar,
-  int tile_stride,
-  char* smem,
-  int& smem_offset,
-  ::cuda::std::uint32_t& total_bytes_bulk_copied,
-  Offset global_offset,
-  const aligned_base_ptr<T>& aligned_ptr)
-{
-  static_assert(alignof(T) <= bulk_copy_alignment, "");
-
-  const char* src = aligned_ptr.ptr + global_offset * sizeof(T);
-  char* dst       = smem + smem_offset;
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
-
-  // TODO(bgruber): we could precompute bytes_to_copy on the host
-  const int bytes_to_copy = round_up_to_po2_multiple(
-    aligned_ptr.head_padding + static_cast<int>(sizeof(T)) * tile_stride, bulk_copy_size_multiple);
-
-  ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar);
-  total_bytes_bulk_copied += bytes_to_copy;
-
-  // add bulk_copy_alignment to make space for the next tile's head padding
-  smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
-}
-
 template <typename Offset, typename T>
 _CCCL_DEVICE void bulk_copy_tile_fallback(
   int tile_size,
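// [Illustrative aside, not part of the patch] The C++14 array-expander idiom removed above and the
// C++17 fold expression that replaces it, shown side by side on a toy pack. Both apply the operation
// once per pack element, left to right; the fold simply drops the dummy array, the trailing zero for
// empty packs, and the unused-variable suppression. Names are invented for the sketch.
#include <cstdio>

template <typename... Its>
void advance_all_cxx14(int offset, Its&... its)
{
  int dummy[] = {(its += offset, 0)..., 0}; // extra 0 handles empty packs
  (void) dummy;
}

template <typename... Its>
void advance_all_cxx17(int offset, Its&... its)
{
  (..., (its += offset)); // left fold over the comma operator, evaluated left to right
}

int main()
{
  int a = 1, b = 2;
  advance_all_cxx14(10, a, b);
  advance_all_cxx17(10, a, b);
  std::printf("%d %d\n", a, b); // prints: 21 22
}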
@@ -304,16 +243,6 @@ _CCCL_DEVICE void bulk_copy_tile_fallback(
   smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
 }
 
-// TODO(bgruber): inline this as lambda in C++14
-template <typename T>
-_CCCL_DEVICE _CCCL_FORCEINLINE const T&
-fetch_operand(int tile_stride, const char* smem, int& smem_offset, int smem_idx, const aligned_base_ptr<T>& aligned_ptr)
-{
-  const T* smem_operand_tile_base = reinterpret_cast<const T*>(smem + smem_offset + aligned_ptr.head_padding);
-  smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment;
-  return smem_operand_tile_base[smem_idx];
-}
-
 template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs>
 _CCCL_DEVICE void transform_kernel_ublkcp(
   Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr<InTs>... aligned_ptrs)
@@ -340,11 +269,28 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
       int smem_offset = 0;
       ::cuda::std::uint32_t total_copied = 0;
 
-      // TODO(bgruber): use a fold over comma in C++17
+      auto bulk_copy_tile = [&](auto aligned_ptr) {
+        using T = typename decltype(aligned_ptr)::value_type;
+        static_assert(alignof(T) <= bulk_copy_alignment, "");
+
+        const char* src = aligned_ptr.ptr + offset * sizeof(T);
+        char* dst       = smem + smem_offset;
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
+
+        // TODO(bgruber): we could precompute bytes_to_copy on the host
+        const int bytes_to_copy = round_up_to_po2_multiple(
+          aligned_ptr.head_padding + static_cast<int>(sizeof(T)) * tile_stride, bulk_copy_size_multiple);
+
+        ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar);
+        total_copied += bytes_to_copy;
+
+        // add bulk_copy_alignment to make space for the next tile's head padding
+        smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
+      };
+
       // Order of evaluation is left-to-right
-      int dummy[] = {(bulk_copy_tile(bar, tile_stride, smem, smem_offset, total_copied, offset, aligned_ptrs), 0)...,
-                     0};
-      (void) dummy;
+      (..., bulk_copy_tile(aligned_ptrs));
 
       // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet
       ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied);
@@ -360,10 +306,22 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
       // use all threads to schedule an async_memcpy
       int smem_offset = 0;
 
-      // TODO(bgruber): use a fold over comma in C++17
+      auto bulk_copy_tile_fallback = [&](auto aligned_ptr) {
+        using T      = typename decltype(aligned_ptr)::value_type;
+        const T* src = aligned_ptr.ptr_to_elements() + offset;
+        T* dst       = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding);
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
+        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");
+
+        const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size;
+        cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy);
+
+        // add bulk_copy_alignment to make space for the next tile's head padding
+        smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
+      };
+
       // Order of evaluation is left-to-right
-      int dummy[] = {(bulk_copy_tile_fallback(tile_size, tile_stride, smem, smem_offset, offset, aligned_ptrs), 0)..., 0};
-      (void) dummy;
+      (..., bulk_copy_tile_fallback(aligned_ptrs));
 
       cooperative_groups::wait(cooperative_groups::this_thread_block());
     }
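// [Illustrative aside, not part of the patch] Sketch of the refactoring applied above: a file-scope
// helper that threads `smem_offset` through a reference parameter becomes a local lambda that
// captures the running offset, and the parameter pack is walked with a comma fold, which guarantees
// left-to-right evaluation (matching the "Order of evaluation is left-to-right" comment).
// The `tile` type, byte counts, and function names are invented for the sketch.
#include <cstdio>

struct tile
{
  int bytes;
};

template <typename... Ts>
void stage_tiles(Ts... tiles)
{
  int smem_offset = 0;
  auto copy_bytes = [&](auto t) {
    std::printf("copying %d bytes at offset %d\n", t.bytes, smem_offset);
    smem_offset += t.bytes; // the capture replaces the `int& smem_offset` out-parameter
  };
  (..., copy_bytes(tiles)); // one call per kernel argument, strictly left to right
}

int main()
{
  stage_tiles(tile{128}, tile{256}); // copies 128 at offset 0, then 256 at offset 128
}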
@@ -371,8 +329,7 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
   // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
   out += offset;
 
-  // TODO(bgruber): use `auto full_tile` and pass true_type/false_type in C++14 to strengthen the compile-time intent
-  auto process_tile = [&](bool full_tile) {
+  auto process_tile = [&](auto full_tile) {
     // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
     _CCCL_PRAGMA(unroll 1)
     for (int j = 0; j < num_elem_per_thread; ++j)
@@ -380,24 +337,31 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
       const int idx = j * block_dim + threadIdx.x;
       if (full_tile || idx < tile_size)
       {
-        int smem_offset = 0;
+        int smem_offset    = 0;
+        auto fetch_operand = [&](auto aligned_ptr) {
+          using T                         = typename decltype(aligned_ptr)::value_type;
+          const T* smem_operand_tile_base = reinterpret_cast<const T*>(smem + smem_offset + aligned_ptr.head_padding);
+          smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment;
+          return smem_operand_tile_base[idx];
+        };
+
         // need to expand into a tuple for guaranteed order of evaluation
-        out[idx] = poor_apply(
-          [&](const InTs&... values) {
+        out[idx] = ::cuda::std::apply(
+          [&](auto... values) {
             return f(values...);
           },
-          ::cuda::std::tuple<const InTs&...>{fetch_operand(tile_stride, smem, smem_offset, idx, aligned_ptrs)...});
+          ::cuda::std::tuple{fetch_operand(aligned_ptrs)...});
       }
     }
   };
 
   // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
   if (tile_stride == tile_size)
   {
-    process_tile(true);
+    process_tile(::cuda::std::true_type{});
   }
   else
   {
-    process_tile(false);
+    process_tile(::cuda::std::false_type{});
   }
 }
@@ -458,30 +422,26 @@ _CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg<It>
   return arg;
 }
 
-// TODO(bgruber): make a variable template in C++14
 template <Algorithm Alg>
-using needs_aligned_ptr_t =
-  ::cuda::std::bool_constant<false
+constexpr bool needs_aligned_ptr_v =
+  false
 #ifdef _CUB_HAS_TRANSFORM_UBLKCP
-                             || Alg == Algorithm::ublkcp
+  || Alg == Algorithm::ublkcp
 #endif // _CUB_HAS_TRANSFORM_UBLKCP
-                             >;
+  ;
 
-#ifdef _CUB_HAS_TRANSFORM_UBLKCP
-template <Algorithm Alg, typename It, ::cuda::std::enable_if_t<needs_aligned_ptr_t<Alg>::value, int> = 0>
+template <Algorithm Alg, typename It>
 _CCCL_DEVICE _CCCL_FORCEINLINE auto
 select_kernel_arg(::cuda::std::integral_constant<Algorithm, Alg>, kernel_arg<It>&& arg)
-  -> aligned_base_ptr<value_t<It>>&&
 {
-  return ::cuda::std::move(arg.aligned_ptr);
-}
+#ifdef _CUB_HAS_TRANSFORM_UBLKCP
+  if constexpr (needs_aligned_ptr_v<Alg>)
+  {
+    return ::cuda::std::move(arg.aligned_ptr);
+  }
+  else
 #endif // _CUB_HAS_TRANSFORM_UBLKCP
-
-template <Algorithm Alg, typename It, ::cuda::std::enable_if_t<!needs_aligned_ptr_t<Alg>::value, int> = 0>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto
-select_kernel_arg(::cuda::std::integral_constant<Algorithm, Alg>, kernel_arg<It>&& arg) -> It&&
-{
-  return ::cuda::std::move(arg.iterator);
+    return ::cuda::std::move(arg.iterator);
 }
 
 // There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated
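// [Illustrative aside, not part of the patch] Why the operands above are expanded into a tuple before
// calling f: function-argument evaluation order is unspecified, but the elements of a braced tuple are
// constructed left to right. Since each fetch advances a shared shared-memory offset, the tuple plus
// ::cuda::std::apply (std::apply in this host-only sketch) pins the order. Names are invented here.
#include <cstdio>
#include <tuple>

int main()
{
  int offset     = 0;
  auto fetch_one = [&](int size) {
    const int my_offset = offset; // this call's slot depends on the calls made before it
    offset += size;
    return my_offset;
  };

  // braced tuple construction evaluates fetch_one(4), fetch_one(8), fetch_one(2) left to right
  auto offsets = std::tuple{fetch_one(4), fetch_one(8), fetch_one(2)};
  std::apply([](auto... o) { std::printf("%d %d %d\n", o...); }, offsets); // prints: 0 4 12
}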
@@ -510,57 +470,11 @@ __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads)
       select_kernel_arg(alg, ::cuda::std::move(ins))...);
 }
 
-// TODO(bgruber): replace by ::cuda::std::expected in C++14
 template <typename T>
-struct PoorExpected
-{
-  alignas(T) char storage[sizeof(T)];
-  cudaError_t error;
-
-  _CCCL_HOST_DEVICE PoorExpected(T value)
-      : error(cudaSuccess)
-  {
-    new (storage) T(::cuda::std::move(value));
-  }
-
-  _CCCL_HOST_DEVICE PoorExpected(cudaError_t error)
-      : error(error)
-  {}
-
-  _CCCL_HOST_DEVICE explicit operator bool() const
-  {
-    return error == cudaSuccess;
-  }
-
-  _CCCL_HOST_DEVICE T& operator*()
-  {
-    _CCCL_DIAG_PUSH
-    _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
-    return reinterpret_cast<T&>(storage);
-    _CCCL_DIAG_POP
-  }
-
-  _CCCL_HOST_DEVICE const T& operator*() const
-  {
-    _CCCL_DIAG_PUSH
-    _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
-    return reinterpret_cast<const T&>(storage);
-    _CCCL_DIAG_POP
-  }
-
-  _CCCL_HOST_DEVICE T* operator->()
-  {
-    return &**this;
-  }
-
-  _CCCL_HOST_DEVICE const T* operator->() const
-  {
-    return &**this;
-  }
-};
+using cuda_expected = ::cuda::std::expected<T, cudaError_t>;
 
 // TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this.
-_CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
+_CCCL_HOST_DEVICE inline cuda_expected<int> get_max_shared_memory()
 {
   // gevtushenko promised me that I can assume that the stream passed to the CUB API entry point (where the kernels
   // will later be launched on) belongs to the currently active device. So we can just query the active device here.
@@ -581,7 +495,7 @@ _CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
   return max_smem;
 }
 
-_CCCL_HOST_DEVICE inline PoorExpected<int> get_sm_count()
+_CCCL_HOST_DEVICE inline cuda_expected<int> get_sm_count()
 {
   int device = 0;
   auto error = CubDebug(cudaGetDevice(&device));
@@ -657,7 +571,7 @@ struct dispatch_t
 
-  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> PoorExpected<
+  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> cuda_expected<
     ::cuda::std::
       tuple<THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron, decltype(CUB_DETAIL_TRANSFORM_KERNEL_PTR), int>>
   {
@@ -667,11 +581,11 @@ struct dispatch_t
-    auto determine_element_counts = [&]() -> PoorExpected<elem_counts> {
+    auto determine_element_counts = [&]() -> cuda_expected<elem_counts> {
       const auto max_smem = get_max_shared_memory();
       if (!max_smem)
       {
-        return max_smem.error;
+        return ::cuda::std::unexpected(max_smem.error());
       }
 
       elem_counts last_counts{};
@@ -700,7 +614,7 @@ struct dispatch_t
         if (error != cudaSuccess)
         {
-          return error;
+          return ::cuda::std::unexpected(error);
         }
 
         const int bytes_in_flight_SM = max_occupancy * tile_size * loaded_bytes_per_iter;
@@ -713,7 +627,7 @@ struct dispatch_t
 
-    PoorExpected<elem_counts> config = [&]() {
+    cuda_expected<elem_counts> config = [&]() {
      NV_IF_TARGET(NV_IS_HOST,
                   (static auto cached_config = determine_element_counts(); return cached_config;),
                   (
@@ -722,7 +636,7 @@ struct dispatch_t
    if (!config)
    {
-      return config.error;
+      return ::cuda::std::unexpected(config.error());
    }
    _CCCL_ASSERT_HOST(config->elem_per_thread > 0, "");
    _CCCL_ASSERT_HOST(config->tile_size > 0, "");
@@ -743,15 +657,13 @@ struct dispatch_t
    auto ret = configure_ublkcp_kernel();
    if (!ret)
    {
-      return ret.error;
+      return ret.error();
    }
-    // TODO(bgruber): use a structured binding in C++17
-    // auto [launcher, kernel, elem_per_thread] = *ret;
-
-    return ::cuda::std::get<0>(*ret).doit(
-      ::cuda::std::get<1>(*ret),
+    auto [launcher, kernel, elem_per_thread] = *ret;
+    return launcher.doit(
+      kernel,
      num_items,
-      ::cuda::std::get<2>(*ret),
+      elem_per_thread,
      op,
      out,
      make_aligned_base_ptr_kernel_arg(
@@ -766,22 +678,22 @@ struct dispatch_t
-    auto determine_config = [&]() -> PoorExpected<prefetch_config> {
+    auto determine_config = [&]() -> cuda_expected<prefetch_config> {
      int max_occupancy = 0;
      const auto error  = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
      if (error != cudaSuccess)
      {
-        return error;
+        return ::cuda::std::unexpected(error);
      }
 
      const auto sm_count = get_sm_count();
      if (!sm_count)
      {
-        return sm_count.error;
+        return ::cuda::std::unexpected(sm_count.error());
      }
 
      return prefetch_config{max_occupancy, *sm_count};
    };
 
-    PoorExpected<prefetch_config> config = [&]() {
+    cuda_expected<prefetch_config> config = [&]() {
      NV_IF_TARGET(
        NV_IS_HOST,
        (
@@ -794,7 +706,7 @@ struct dispatch_t
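// [Illustrative aside, not part of the patch] The error-handling shape after the PoorExpected ->
// ::cuda::std::expected switch, in isolation: cudaError_t travels in the error channel, failures are
// returned as ::cuda::std::unexpected, callers test with operator bool and read .error(), and tuple
// results are unpacked with a structured binding. The functions below are invented for the sketch;
// only the expected/unexpected usage mirrors the patch.
#include <cuda/std/expected>
#include <cuda/std/tuple>
#include <cuda_runtime_api.h>

template <typename T>
using cuda_expected = ::cuda::std::expected<T, cudaError_t>;

cuda_expected<int> query_sm_count()
{
  int device = 0;
  if (const auto error = cudaGetDevice(&device); error != cudaSuccess)
  {
    return ::cuda::std::unexpected(error); // error channel instead of a sentinel member
  }
  int sm_count = 0;
  if (const auto error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
      error != cudaSuccess)
  {
    return ::cuda::std::unexpected(error);
  }
  return sm_count;
}

cudaError_t use_config()
{
  const auto sm_count = query_sm_count();
  if (!sm_count)
  {
    return sm_count.error(); // propagate, as the dispatcher does after configure_ublkcp_kernel()
  }
  const auto [grid, block] = ::cuda::std::tuple{*sm_count * 2, 256}; // structured binding on a tuple result
  return grid > 0 && block > 0 ? cudaSuccess : cudaErrorInvalidValue;
}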