From 502daa656b1426e4377ccc9dcd80526fae0bd67d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 06:49:49 -0500 Subject: [PATCH 01/27] Make thread_store.cuh NVRTC compilable --- cub/cub/thread/thread_store.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index 0fb29526cad..d4859c2c174 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- From 3022bd9bd310c492fa548570db05a2ca38d02674 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 06:51:20 -0500 Subject: [PATCH 02/27] Get TileState from KernelSource --- cub/cub/device/dispatch/dispatch_scan.cuh | 31 ++++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 18b4c4f61c7..93f36dc3b7f 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -83,7 +83,7 @@ struct DeviceScanKernelSource { using ScanTileStateT = typename cub::ScanTileState; - CUB_DEFINE_KERNEL_GETTER(ScanInitKernel, DeviceScanInitKernel) + CUB_DEFINE_KERNEL_GETTER(InitKernel, DeviceScanInitKernel) CUB_DEFINE_KERNEL_GETTER( ScanKernel, @@ -101,6 +101,11 @@ struct DeviceScanKernelSource { return sizeof(AccumT); } + + CUB_RUNTIME_FUNCTION ScanTileStateT TileState() + { + return ScanTileStateT(); + } }; } // namespace detail::scan @@ -165,9 +170,6 @@ struct DispatchScan static constexpr int INIT_KERNEL_THREADS = 128; - // The input value type - using InputT = cub::detail::value_t; - /// Device-accessible allocation of temporary storage. When nullptr, the /// required allocation size is written to \p temp_storage_bytes and no work /// is done. @@ -258,13 +260,12 @@ struct DispatchScan CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel, ActivePolicyT policy = {}) { - using ScanTileStateT = typename KernelSource::ScanTileStateT; - + // TODO(ashwin): Does this now need to be a runtime check? // `LOAD_LDG` makes in-place execution UB and doesn't lead to better // performance. - static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG, - "The memory consistency model does not apply to texture " - "accesses"); + // static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG, + // "The memory consistency model does not apply to texture " + // "accesses"); cudaError error = cudaSuccess; do @@ -281,9 +282,12 @@ struct DispatchScan int tile_size = policy.Scan().BlockThreads() * policy.Scan().ItemsPerThread(); int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + auto tile_state = kernel_source.TileState(); + // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + + error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors @@ -313,7 +317,6 @@ struct DispatchScan } // Construct the tile status interface - ScanTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { @@ -397,17 +400,15 @@ struct DispatchScan } } } while (0); - return error; } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) { - using ScanTileStateT = typename KernelSource::ScanTileStateT; - auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy); + auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy); // Ensure kernels are instantiated. - return Invoke(kernel_source.ScanInitKernel(), kernel_source.ScanKernel(), wrapped_policy); + return Invoke(kernel_source.InitKernel(), kernel_source.ScanKernel(), wrapped_policy); } /** From 418a76340d35c1cb5219a44ff0742a52a5154e3d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 06:51:52 -0500 Subject: [PATCH 03/27] Use launcher_factory to get the SM occupancy, PTX version. Implement MaxgridDimX --- cub/cub/detail/launcher/cuda_driver.cuh | 5 +++++ cub/cub/device/dispatch/dispatch_scan.cuh | 18 ++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cub/cub/detail/launcher/cuda_driver.cuh b/cub/cub/detail/launcher/cuda_driver.cuh index 52c643f9707..66e2e8a87a9 100644 --- a/cub/cub/detail/launcher/cuda_driver.cuh +++ b/cub/cub/detail/launcher/cuda_driver.cuh @@ -103,6 +103,11 @@ struct CudaDriverLauncherFactory cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, kernel_fn, block_size, dynamic_smem_bytes)); } + _CCCL_HIDE_FROM_ABI cudaError_t MaxGridDimX(int& max_grid_dim_x) const + { + return static_cast(cuDeviceGetAttribute(&max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device)); + } + CUdevice device; int cc; }; diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 93f36dc3b7f..e56f3f74dcd 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -349,9 +349,10 @@ struct DispatchScan // Get SM occupancy for scan_kernel int scan_sm_occupancy; - error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - policy.Scan().BlockThreads())); + error = CubDebug(launcher_factory.MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + policy.Scan().BlockThreads())); if (cudaSuccess != error) { break; @@ -452,20 +453,20 @@ struct DispatchScan InitValueT init_value, OffsetT num_items, cudaStream_t stream, - KernelSource kernel_source = {}, - MaxPolicyT max_policy = {}) + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + MaxPolicyT max_policy = {}) { cudaError_t error; do { // Get PTX version int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); + error = CubDebug(launcher_factory.PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } - // Create dispatch functor DispatchScan dispatch( d_temp_storage, @@ -477,7 +478,8 @@ struct DispatchScan init_value, stream, ptx_version, - kernel_source); + kernel_source, + launcher_factory); // Dispatch to chained policy error = CubDebug(max_policy.Invoke(ptx_version, dispatch)); From 6b386058a6c7a53d23ca541fc62e8e8b05935a5f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 06:54:02 -0500 Subject: [PATCH 04/27] Put reduce stuff inside `reduce` namespace --- c/parallel/src/reduce.cu | 110 +++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index da866bd6a76..f39c8a9f944 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -40,6 +40,12 @@ static_assert(std::is_same_v, OffsetT>, "O struct nothing_t {}; +struct input_iterator_state_t; +struct output_iterator_t; + +namespace reduce +{ + struct reduce_runtime_tuning_policy { int block_size; @@ -107,9 +113,6 @@ static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*i return init.type; } -struct input_iterator_state_t; -struct output_iterator_t; - std::string get_input_iterator_name() { std::string iterator_t; @@ -201,6 +204,44 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it transform_op_t); } +template +struct dynamic_reduce_policy_t +{ + using MaxPolicy = dynamic_reduce_policy_t; + + template + cudaError_t Invoke(int device_ptx_version, F& op) + { + return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); + } + + cccl_type_info accumulator_type; +}; + +struct reduce_kernel_source +{ + cccl_device_reduce_build_result_t& build; + + std::size_t AccumSize() const + { + return build.accumulator_size; + } + CUkernel SingleTileKernel() const + { + return build.single_tile_kernel; + } + CUkernel SingleTileSecondKernel() const + { + return build.single_tile_second_kernel; + } + CUkernel ReductionKernel() const + { + return build.reduction_kernel; + } +}; + +} // namespace reduce + extern "C" CCCL_C_API CUresult cccl_device_reduce_build( cccl_device_reduce_build_result_t* build, cccl_iterator_t input_it, @@ -220,12 +261,12 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; - const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); - const reduce_runtime_tuning_policy policy = get_policy(cc, accum_t); - const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); - const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); - const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = reduce::get_accumulator_type(op, input_it, init); + const auto policy = reduce::get_policy(cc, accum_t); + const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); + const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); const std::string input_iterator_src = make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it); @@ -271,9 +312,10 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( fflush(stdout); #endif - std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false); - std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true); - std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init); + std::string single_tile_kernel_name = reduce::get_single_tile_kernel_name(input_it, output_it, op, init, false); + std::string single_tile_second_kernel_name = + reduce::get_single_tile_kernel_name(input_it, output_it, op, init, true); + std::string reduction_kernel_name = reduce::get_device_reduce_kernel_name(op, input_it, init); std::string single_tile_kernel_lowered_name; std::string single_tile_second_kernel_lowered_name; std::string reduction_kernel_lowered_name; @@ -342,42 +384,6 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( return error; } -template -struct dynamic_reduce_policy_t -{ - using MaxPolicy = dynamic_reduce_policy_t; - - template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); - } - - cccl_type_info accumulator_type; -}; - -struct reduce_kernel_source -{ - cccl_device_reduce_build_result_t& build; - - std::size_t AccumSize() const - { - return build.accumulator_size; - } - CUkernel SingleTileKernel() const - { - return build.single_tile_kernel; - } - CUkernel SingleTileSecondKernel() const - { - return build.single_tile_second_kernel; - } - CUkernel ReductionKernel() const - { - return build.reduction_kernel; - } -}; - extern "C" CCCL_C_API CUresult cccl_device_reduce( cccl_device_reduce_build_result_t build, void* d_temp_storage, @@ -404,9 +410,9 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( indirect_arg_t, indirect_arg_t, void, - dynamic_reduce_policy_t<&get_policy>, + reduce::dynamic_reduce_policy_t<&reduce::get_policy>, ::cuda::std::__identity, - reduce_kernel_source, + reduce::reduce_kernel_source, cub::detail::CudaDriverLauncherFactory>:: Dispatch( d_temp_storage, @@ -420,7 +426,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( {}, {build}, cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - {get_accumulator_type(op, d_in, init)}); + {reduce::get_accumulator_type(op, d_in, init)}); } catch (const std::exception& exc) { @@ -439,7 +445,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( return error; } -extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) +extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept { try { From 6365d5ca5a53ff11136553ea0cbf925f9b95e5aa Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 07:25:09 -0500 Subject: [PATCH 05/27] Handle PTX compilation in command list --- c/parallel/src/nvrtc/command_list.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index 77d83e86d47..719d9c5bf88 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -156,9 +156,23 @@ struct nvrtc_command_list_visitor check(jitlink_error); - check(nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size)); + bool ptx = false; + auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size); + if (result != NVJITLINK_SUCCESS) + { + ptx = true; + result = nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size); + } cleanup.cubin_ref.cubin = std::unique_ptr(new char[cleanup.cubin_ref.size]); - check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get())); + + if (ptx) + { + check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.cubin_ref.cubin.get())); + } + else + { + check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get())); + } } }; From f3454c58a50f7eb7857ad53455bd5bbf3e06fb57 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 07:25:36 -0500 Subject: [PATCH 06/27] Missing noexcept --- c/parallel/include/cccl/c/reduce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/parallel/include/cccl/c/reduce.h b/c/parallel/include/cccl/c/reduce.h index f1e875087a7..2366024a532 100644 --- a/c/parallel/include/cccl/c/reduce.h +++ b/c/parallel/include/cccl/c/reduce.h @@ -55,4 +55,4 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( cccl_value_t init, CUstream stream) noexcept; -extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr); +extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept; From 3fe0dedeecd2bc13fce0664ed6ee5646ed105907 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 11:21:00 -0500 Subject: [PATCH 07/27] Allow passing InitValueT without wrapping in InputValue --- cub/cub/device/dispatch/kernels/scan.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh index cc3034638bc..7cb71ca49e7 100644 --- a/cub/cub/device/dispatch/kernels/scan.cuh +++ b/cub/cub/device/dispatch/kernels/scan.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -159,7 +161,9 @@ template + bool ForceInclusive, + typename RealInitValueT = ::cuda::std:: + _If<::cuda::std::is_void_v, InitValueT, typename InitValueT::value_type>> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( InputIteratorT d_in, @@ -170,8 +174,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) InitValueT init_value, OffsetT num_items) { - using RealInitValueT = typename InitValueT::value_type; - using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; + using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; // Thread block type for scanning input tiles using AgentScanT = detail::scan:: From e4a1aeaf3bb975a5168de95fa09bf05e1f13ec09 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 07:26:17 -0500 Subject: [PATCH 08/27] Add scan c.parallel API --- c/parallel/include/cccl/c/scan.h | 57 +++ c/parallel/src/scan.cu | 585 +++++++++++++++++++++++++++++++ 2 files changed, 642 insertions(+) create mode 100644 c/parallel/include/cccl/c/scan.h create mode 100644 c/parallel/src/scan.cu diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h new file mode 100644 index 00000000000..df9f09588a5 --- /dev/null +++ b/c/parallel/include/cccl/c/scan.h @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA Core Compute Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." +#endif // !CCCL_C_EXPERIMENTAL + +#include + +#include + +struct cccl_device_scan_build_result_t +{ + int cc; + void* cubin; + size_t cubin_size; + CUlibrary library; + cccl_type_info accumulator_type; + CUkernel init_kernel; + CUkernel scan_kernel; + void* tile_state; +}; + +extern "C" CCCL_C_API CUresult cccl_device_scan_build( + cccl_device_scan_build_result_t* build, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept; + +extern "C" CCCL_C_API CUresult cccl_device_scan( + cccl_device_scan_build_result_t build, + void* d_temp_storage, + size_t* temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + CUstream stream) noexcept; + +extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept; diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu new file mode 100644 index 00000000000..2c259cbcd27 --- /dev/null +++ b/c/parallel/src/scan.cu @@ -0,0 +1,585 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cub/util_device.cuh" +#include "kernels/iterators.h" +#include "kernels/operators.h" +#include "util/context.h" +#include "util/errors.h" +#include "util/indirect_arg.h" +#include "util/types.h" +#include +#include +#include + +struct op_wrapper; +struct device_scan_policy; +using OffsetT = unsigned long long; +static_assert(std::is_same_v, OffsetT>, "OffsetT must be size_t"); + +struct input_iterator_state_t; +struct output_iterator_t; + +namespace scan +{ + +struct scan_runtime_tuning_policy +{ + int block_size; + int items_per_thread; + cub::CacheLoadModifier load_modifier; + + scan_runtime_tuning_policy Scan() const + { + return *this; + } + + int ItemsPerThread() const + { + return items_per_thread; + } + + int BlockThreads() const + { + return block_size; + } + + cub::CacheLoadModifier LoadModifier() const + { + return load_modifier; + } +}; + +template +Tuning find_tuning(int cc, const Tuning (&tunings)[N]) +{ + for (const Tuning& tuning : tunings) + { + if (cc >= tuning.cc) + { + return tuning; + } + } + + return tunings[N - 1]; +} + +scan_runtime_tuning_policy get_policy(int /*cc*/, cccl_type_info /*accumulator_type*/) +{ + // TODO(ashwin): returning some default for now: + return {128, 15, cub::LOAD_CA}; +} + +static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init) +{ + // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet + // so switching back to the old accumulator type logic for now + return init.type; +} + +std::string get_input_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName(&iterator_t)); + return iterator_t; +} + +std::string get_output_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName(&iterator_t)); + return iterator_t; +} + +std::string +get_init_kernel_name(cccl_iterator_t input_it, cccl_iterator_t /*output_it*/, cccl_op_t op, cccl_value_t init) +{ + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + return std::format("cub::detail::scan::DeviceScanInitKernel>", accum_cpp_t); +} + +std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName(&chained_policy_t)); + + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + const std::string input_iterator_t = + (input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : scan::get_input_iterator_name()); + const std::string output_iterator_t = + output_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(output_it.value_type.type, true) // + : scan::get_output_iterator_name(); + const std::string init_t = cccl_type_enum_to_name(init.type.type); + + std::string offset_t; + check(nvrtcGetTypeName(&offset_t)); + + std::string scan_op_t; + check(nvrtcGetTypeName(&scan_op_t)); + + auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); + return std::format( + "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", + chained_policy_t, + input_iterator_t, + output_iterator_t, + tile_state_t, + scan_op_t, + init_t, + offset_t, + accum_cpp_t, + "false", // for now, always exclusive + init_t); +} + +size_t find_size_t(char* ptx, std::string_view name) +{ + std::regex regex(std::format(R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)", name)); + std::cmatch match; + if (std::regex_search(ptx, match, regex)) + { + auto result = std::stoi(match[1].str()); + return result; + } + else + { + throw std::runtime_error(std::format("Could not find {} in PTX code", name)); + } +} + +struct scan_tile_state +{ + // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except + // that it accepts the acummulator type as a runtime parameter rather than being + // templated on it. + // + // Both specializations ScanTileStateT and ScanTileStateT - where the + // bool parameter indicates whether `T` is primitive - are combined into a single type. + + void* d_tile_status; // d_tile_descriptors + void* d_tile_partial; + void* d_tile_inclusive; + + bool is_primitive; + size_t status_size; + size_t uninitialized_size; + + scan_tile_state(bool is_primitive, size_t status_size, size_t uninitialized_size) + : d_tile_status(nullptr) + , d_tile_partial(nullptr) + , d_tile_inclusive(nullptr) + , is_primitive(is_primitive) + , status_size(status_size) + , uninitialized_size(uninitialized_size) + {} + + cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) + { + return is_primitive ? InitPrimitive(num_tiles, d_temp_storage, temp_storage_bytes) + : InitStorage(num_tiles, d_temp_storage, temp_storage_bytes); + } + + cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const + { + return is_primitive ? AllocationSizePrimitive(num_tiles, temp_storage_bytes) + : AllocationSizeStorage(num_tiles, temp_storage_bytes); + } + + cudaError_t InitPrimitive(int, void* d_temp_storage, size_t) + { + d_tile_status = d_temp_storage; + return cudaSuccess; + } + + cudaError_t AllocationSizePrimitive(int num_tiles, size_t& d_temp_storage_bytes) const + { + d_temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * status_size; + return cudaSuccess; + } + + cudaError_t InitStorage(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3] = {}; + size_t allocation_sizes[3]; + + // bytes needed for tile status descriptors + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size; + + // bytes needed for partials + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; + + // bytes needed for inclusives + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; + + // Compute allocation pointers into the single storage blob + error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + + if (cudaSuccess != error) + { + break; + } + + // Alias the offsets + d_tile_status = allocations[0]; + d_tile_partial = allocations[1]; + d_tile_inclusive = allocations[2]; + } while (0); + + return error; + } + + cudaError_t AllocationSizeStorage(int num_tiles, size_t& d_temp_storage_bytes) const + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + + // bytes needed for tile status descriptors + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size; + + // bytes needed for partials + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; + + // bytes needed for inclusives + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; + + // Set the necessary size of the blob + void* allocations[3] = {}; + return CubDebug(cub::AliasTemporaries(nullptr, d_temp_storage_bytes, allocations, allocation_sizes)); + } +}; + +template +struct dynamic_scan_policy_t +{ + using MaxPolicy = dynamic_scan_policy_t; + + template + cudaError_t Invoke(int device_ptx_version, F& op) + { + return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); + } + + cccl_type_info accumulator_type; +}; + +struct scan_kernel_source +{ + cccl_device_scan_build_result_t& build; + + std::size_t AccumSize() const + { + return build.accumulator_type.size; + } + CUkernel InitKernel() const + { + return build.init_kernel; + } + CUkernel ScanKernel() const + { + return build.scan_kernel; + } + scan_tile_state TileState() + { + auto result = (reinterpret_cast(build.tile_state)); + return *result; + } +}; + +} // namespace scan + +extern "C" CCCL_C_API CUresult cccl_device_scan_build( + cccl_device_scan_build_result_t* build, + cccl_iterator_t input_it, + cccl_iterator_t output_it, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept +{ + CUresult error = CUDA_SUCCESS; + + try + { + const char* name = "test"; + + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const auto policy = scan::get_policy(cc, accum_t); + const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); + const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + + const std::string input_iterator_src = make_kernel_input_iterator(offset_t, input_it_value_t, input_it); + const std::string output_iterator_src = make_kernel_output_iterator(offset_t, accum_cpp, output_it); + + const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); + + const std::string src = std::format( + "#include \n" + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "{4}\n" + "{5}\n" + "struct agent_policy_t {{\n" + " static constexpr int ITEMS_PER_THREAD = {2};\n" + " static constexpr int BLOCK_THREADS = {3};\n" + " static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_WARP_TRANSPOSE;\n" + " static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;\n" + " static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = cub::BLOCK_STORE_WARP_TRANSPOSE;\n" + " static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS;\n" + " struct detail {{\n" + " using delay_constructor_t = cub::detail::default_delay_constructor_t<{7}>;\n" + " }};\n" + "}};\n" + "struct device_scan_policy {{\n" + " struct ActivePolicy {{\n" + " using ScanPolicyT = agent_policy_t;\n" + " }};\n" + "}};\n" + "{6};\n", + input_it.value_type.size, // 0 + input_it.value_type.alignment, // 1 + policy.items_per_thread, // 2 + policy.block_size, // 3 + input_iterator_src, // 4 + output_iterator_src, // 5 + op_src, // 6 + accum_cpp); // 7 + +#if false // CCCL_DEBUGGING_SWITCH + fflush(stderr); + printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", src.c_str()); + fflush(stdout); +#endif + + std::string init_kernel_name = scan::get_init_kernel_name(input_it, output_it, op, init); + std::string scan_kernel_name = scan::get_scan_kernel_name(input_it, output_it, op, init); + std::string init_kernel_lowered_name; + std::string scan_kernel_lowered_name; + + const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor); + + constexpr size_t num_args = 7; + const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"}; + + constexpr size_t num_lto_args = 2; + const char* lopts[num_lto_args] = {"-lto", arch.c_str()}; + + // Collect all LTO-IRs to be linked. + nvrtc_ltoir_list ltoir_list; + auto ltoir_list_append = [<oir_list](nvrtc_ltoir lto) { + if (lto.ltsz) + { + ltoir_list.push_back(std::move(lto)); + } + }; + ltoir_list_append({op.ltoir, op.ltoir_size}); + if (cccl_iterator_kind_t::iterator == input_it.type) + { + ltoir_list_append({input_it.advance.ltoir, input_it.advance.ltoir_size}); + ltoir_list_append({input_it.dereference.ltoir, input_it.dereference.ltoir_size}); + } + if (cccl_iterator_kind_t::iterator == output_it.type) + { + ltoir_list_append({output_it.advance.ltoir, output_it.advance.ltoir_size}); + ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); + } + + nvrtc_cubin result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{src.c_str(), name}) + .add_expression({init_kernel_name}) + .add_expression({scan_kernel_name}) + .compile_program({args, num_args}) + .get_name({init_kernel_name, init_kernel_lowered_name}) + .get_name({scan_kernel_name, scan_kernel_lowered_name}) + .cleanup_program() + .add_link_list(ltoir_list) + .finalize_program(num_lto_args, lopts); + + cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str())); + check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str())); + + constexpr size_t num_ptx_args = 5; + const char* ptx_args[num_ptx_args] = {arch.c_str(), cub_path, libcudacxx_path, "-rdc=true", "-dlto"}; + constexpr size_t num_ptx_lto_args = 3; + const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"}; + + size_t status_size{0}; + size_t uninitialized_size{0}; + if (accum_t.type == cccl_type_enum::STORAGE) + { + std::string src = std::format( + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{2}>::StatusWord);\n" + "__device__ size_t uninitialized_size = sizeof(cub::Uninitialized<{2}>);\n", + accum_t.size, + accum_t.alignment, + accum_cpp); + auto compile_result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"}) + .compile_program({ptx_args, num_ptx_args}) + .cleanup_program() + .finalize_program(num_ptx_lto_args, ptx_lopts); + auto ptx_code = compile_result.cubin.get(); + status_size = scan::find_size_t(ptx_code, "status_size"); + uninitialized_size = scan::find_size_t(ptx_code, "uninitialized_size"); + } + else + { + std::string src = std::format( + "#include \n" + "#include \n" + "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{0}>::TxnWord);\n", + accum_cpp); + auto compile_result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"}) + .compile_program({ptx_args, num_ptx_args}) + .cleanup_program() + .finalize_program(num_ptx_lto_args, ptx_lopts); + auto ptx_code = compile_result.cubin.get(); + status_size = scan::find_size_t(ptx_code, "status_size"); + } + + bool is_primitive = not(accum_t.type == cccl_type_enum::STORAGE); + auto tile_state = std::make_unique(is_primitive, status_size, uninitialized_size); + + build->cc = cc; + build->cubin = (void*) result.cubin.release(); + build->cubin_size = result.size; + build->accumulator_type = accum_t; + build->tile_state = (void*) tile_state.release(); + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan_build(): %s\n", exc.what()); + fflush(stdout); + error = CUDA_ERROR_UNKNOWN; + } + + return error; +} + +extern "C" CCCL_C_API CUresult cccl_device_scan( + cccl_device_scan_build_result_t build, + void* d_temp_storage, + size_t* temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + CUstream stream) noexcept +{ + bool pushed = false; + CUresult error = CUDA_SUCCESS; + try + { + pushed = try_push_context(); + + CUdevice cu_device; + check(cuCtxGetDevice(&cu_device)); + auto cuda_error = cub::DispatchScan< + indirect_arg_t, + indirect_arg_t, + indirect_arg_t, + indirect_arg_t, + ::cuda::std::size_t, + void, + scan::dynamic_scan_policy_t<&scan::get_policy>, + false, + scan::scan_kernel_source, + cub::detail::CudaDriverLauncherFactory>:: + Dispatch( + d_temp_storage, + *temp_storage_bytes, + d_in, + d_out, + op, + init, + num_items, + stream, + {build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, + {scan::get_accumulator_type(op, d_in, init)}); + if (cuda_error != cudaSuccess) + { + const char* errorString = cudaGetErrorString(cuda_error); // Get the error string + std::cerr << "CUDA error: " << errorString << std::endl; + } + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan(): %s\n", exc.what()); + fflush(stdout); + error = CUDA_ERROR_UNKNOWN; + } + if (pushed) + { + CUcontext cu_context; + cuCtxPopCurrent(&cu_context); + } + return error; +} + +extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept +{ + try + { + if (bld_ptr == nullptr) + { + return CUDA_ERROR_INVALID_VALUE; + } + + std::unique_ptr cubin(reinterpret_cast(bld_ptr->cubin)); + check(cuLibraryUnload(bld_ptr->library)); + + std::unique_ptr tile_state(reinterpret_cast(bld_ptr->tile_state)); + tile_state.reset(); + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan_cleanup(): %s\n", exc.what()); + fflush(stdout); + return CUDA_ERROR_UNKNOWN; + } + + return CUDA_SUCCESS; +} From c953d6ed9dc4909798e7323741e8e6220fef6ba0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Jan 2025 07:26:34 -0500 Subject: [PATCH 09/27] Add tests for scan c.parallel API --- c/parallel/test/test_scan.cpp | 237 ++++++++++++++++++++++++++++++++++ c/parallel/test/test_util.h | 1 + 2 files changed, 238 insertions(+) create mode 100644 c/parallel/test/test_scan.cpp diff --git a/c/parallel/test/test_scan.cpp b/c/parallel/test/test_scan.cpp new file mode 100644 index 00000000000..f7373416d74 --- /dev/null +++ b/c/parallel/test/test_scan.cpp @@ -0,0 +1,237 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +#include "test_util.h" + +void scan(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int cc_major = deviceProp.major; + const int cc_minor = deviceProp.minor; + + const char* cub_path = TEST_CUB_PATH; + const char* thrust_path = TEST_THRUST_PATH; + const char* libcudacxx_path = TEST_LIBCUDACXX_PATH; + const char* ctk_path = TEST_CTK_PATH; + + cccl_device_scan_build_result_t build; + REQUIRE(CUDA_SUCCESS + == cccl_device_scan_build( + &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path)); + + const std::string sass = inspect_sass(build.cubin, build.cubin_size); + // TODO(ashwin): do we need the below? + // REQUIRE(sass.find("LDL") == std::string::npos); + // REQUIRE(sass.find("STL") == std::string::npos); + + size_t temp_storage_bytes = 0; + REQUIRE(CUDA_SUCCESS == cccl_device_scan(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + + pointer_t temp_storage(temp_storage_bytes); + + REQUIRE(CUDA_SUCCESS + == cccl_device_scan(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + REQUIRE(CUDA_SUCCESS == cccl_device_scan_cleanup(&build)); +} + +using integral_types = std::tuple; +TEMPLATE_LIST_TEST_CASE("Scan works with integral types", "[scan]", integral_types) +{ + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + const std::vector input = generate(num_items); + const std::vector output(num_items, 0); + pointer_t input_ptr(input); + pointer_t output_ptr(output); + value_t init{TestType{42}}; + + scan(input_ptr, output_ptr, num_items, op, init); + + std::vector expected(num_items, 0); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_ptr)); + } +} + +struct pair +{ + short a; + size_t b; + + bool operator==(const pair& other) const + { + return a == other.a && b == other.b; + } +}; + +TEST_CASE("Scan works with custom types", "[scan]") +{ + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + + operation_t op = make_operation( + "op", + "struct pair { short a; size_t b; };\n" + "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n" + " return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n" + "}"); + const std::vector a = generate(num_items); + const std::vector b = generate(num_items); + std::vector input(num_items); + std::vector output(num_items); + for (std::size_t i = 0; i < num_items; ++i) + { + input[i] = pair{a[i], b[i]}; + } + pointer_t input_ptr(input); + pointer_t output_ptr(output); + value_t init{pair{4, 2}}; + + scan(input_ptr, output_ptr, num_items, op, init); + + std::vector expected(num_items, {0, 0}); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value, [](const pair& lhs, const pair& rhs) { + return pair{short(lhs.a + rhs.a), lhs.b + rhs.b}; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_ptr)); + } +} + +struct counting_iterator_state_t +{ + int value; +}; + +TEST_CASE("Scan works with input iterators", "[scan]") +{ + const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t input_it = make_iterator( + "struct counting_iterator_state_t { int value; };\n", + {"advance", + "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n" + " state->value += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 0; + pointer_t output_it(num_items); + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + // vector storing a sequence of values 0, 1, 2, ..., num_items - 1 + std::vector input(num_items); + std::iota(input.begin(), input.end(), 0); + + std::vector expected(num_items); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_it)); + } +} + +struct transform_output_iterator_state_t +{ + int* d_output; +}; + +TEST_CASE("Scan works with output iterators", "[scan]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"advance", + "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + const std::vector input = generate(num_items); + pointer_t input_it(input); + pointer_t inner_output_it(num_items); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + std::vector expected(num_items); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + + std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) { + return x * 2; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(inner_output_it)); + } +} + +template +struct constant_iterator_state_t +{ + T value; +}; + +TEST_CASE("Scan works with input and output iterators", "[scan]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { int value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"out_advance", + "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"out_dereference", + "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + pointer_t inner_output_it(num_items); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + std::vector expected(num_items, 1); + std::exclusive_scan(expected.begin(), expected.end(), expected.begin(), init.value); + std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) { + return x * 2; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(inner_output_it)); + } +} diff --git a/c/parallel/test/test_util.h b/c/parallel/test/test_util.h index 3f9a25dc43e..cf7fb2851a3 100644 --- a/c/parallel/test/test_util.h +++ b/c/parallel/test/test_util.h @@ -27,6 +27,7 @@ #include #include #include +#include #include static std::string inspect_sass(const void* cubin, size_t cubin_size) From 59a125abc775a269a9f86a8566a101b21040e468 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 30 Jan 2025 10:41:25 -0800 Subject: [PATCH 10/27] Use fewer items per thread and reinstate LDL/STL check --- c/parallel/src/scan.cu | 6 ++++-- c/parallel/test/test_scan.cpp | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 2c259cbcd27..7317dcd377d 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -78,8 +78,10 @@ Tuning find_tuning(int cc, const Tuning (&tunings)[N]) scan_runtime_tuning_policy get_policy(int /*cc*/, cccl_type_info /*accumulator_type*/) { - // TODO(ashwin): returning some default for now: - return {128, 15, cub::LOAD_CA}; + // TODO: we should update this once we figure out a way to reuse + // tuning logic from C++. Alternately, we should implement + // something better than a hardcoded default: + return {128, 4, cub::LOAD_DEFAULT}; } static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init) diff --git a/c/parallel/test/test_scan.cpp b/c/parallel/test/test_scan.cpp index f7373416d74..9ce88835caf 100644 --- a/c/parallel/test/test_scan.cpp +++ b/c/parallel/test/test_scan.cpp @@ -33,9 +33,9 @@ void scan(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_ &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path)); const std::string sass = inspect_sass(build.cubin, build.cubin_size); - // TODO(ashwin): do we need the below? - // REQUIRE(sass.find("LDL") == std::string::npos); - // REQUIRE(sass.find("STL") == std::string::npos); + + REQUIRE(sass.find("LDL") == std::string::npos); + REQUIRE(sass.find("STL") == std::string::npos); size_t temp_storage_bytes = 0; REQUIRE(CUDA_SUCCESS == cccl_device_scan(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); From 9925e113e6e71e253c5f6dae125edcd71c8e709b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 30 Jan 2025 10:58:04 -0800 Subject: [PATCH 11/27] Move load modifier check to policy --- c/parallel/src/scan.cu | 9 +++++++++ cub/cub/device/dispatch/dispatch_scan.cuh | 5 +---- cub/cub/device/dispatch/tuning/tuning_scan.cuh | 9 ++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 7317dcd377d..33e5ea9cbb9 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -60,6 +60,15 @@ struct scan_runtime_tuning_policy { return load_modifier; } + + void CheckLoadModifier() const + { + if (LoadModifier() == cub::CacheLoadModifier::LOAD_LDG) + { + throw std::runtime_error("The memory consistency model does not apply to texture " + "accesses"); + } + } }; template diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index e56f3f74dcd..85834b10a35 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -260,12 +260,9 @@ struct DispatchScan CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel, ActivePolicyT policy = {}) { - // TODO(ashwin): Does this now need to be a runtime check? // `LOAD_LDG` makes in-place execution UB and doesn't lead to better // performance. - // static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG, - // "The memory consistency model does not apply to texture " - // "accesses"); + policy.CheckLoadModifier(); cudaError error = cudaSuccess; do diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 81c462bf7fa..1d4969714e8 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -433,10 +433,17 @@ struct ScanPolicyWrapper From c71bebe256b3778e291fc00640c9f178b6b6c3e5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 31 Jan 2025 11:24:08 -0800 Subject: [PATCH 12/27] Introduce detail functions to allocate/initialize tile state --- cub/cub/agent/single_pass_scan_operators.cuh | 111 +++++++++++++------ 1 file changed, 76 insertions(+), 35 deletions(-) diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index e605e4082f3..7c3bbc3677e 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -540,6 +540,65 @@ struct tile_state_with_memory_order return tile_state.template LoadValid(tile_idx); } }; + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(int num_tiles) +{ + return CUB_PTX_WARP_THREADS + num_tiles; +} + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t +tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles) +{ + // Specify storage allocation requirements + size_t allocation_sizes[3]; + + int num_tile_states = num_tiles_to_num_tile_states(num_tiles); + + // bytes needed for tile status descriptors + allocation_sizes[0] = num_tile_states * bytes_per_description; + + // bytes needed for partials + allocation_sizes[1] = num_tile_states * bytes_per_payload; + + // bytes needed for inclusives + allocation_sizes[2] = num_tile_states * bytes_per_payload; + + // Set the necessary size of the blob + size_t temp_storage_bytes = 0; + void* allocations[3] = {}; + AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); + + return temp_storage_bytes; +}; + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( + int bytes_per_description, + int bytes_per_payload, + int num_tiles, + void* d_temp_storage, + size_t temp_storage_bytes, + void* (&allocations)[3]) +{ + // Specify storage allocation requirements + size_t allocation_sizes[3]; + + int num_tile_states = num_tiles_to_num_tile_states(num_tiles); + + // bytes needed for tile status descriptors + allocation_sizes[0] = num_tile_states * bytes_per_description; + + // bytes needed for partials + allocation_sizes[1] = num_tile_states * bytes_per_payload; + + // bytes needed for inclusives + allocation_sizes[2] = num_tile_states * bytes_per_payload; + + // Set the necessary size of the blob + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + + return cudaSuccess; +} + } // namespace detail /** @@ -583,6 +642,9 @@ struct ScanTileState // Device storage TxnWord* d_tile_descriptors; + static constexpr size_t description_bytes_per_tile = sizeof(TxnWord); + static constexpr size_t payload_bytes_per_tile = 0; + /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_descriptors(nullptr) @@ -618,10 +680,11 @@ struct ScanTileState * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t + AllocationSize(int num_tiles, size_t& temp_storage_bytes) { - // bytes needed for tile status descriptors - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); + temp_storage_bytes = + detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); return cudaSuccess; } @@ -782,6 +845,9 @@ struct ScanTileState T* d_tile_partial; T* d_tile_inclusive; + static constexpr size_t description_bytes_per_tile = sizeof(StatusWord); + static constexpr size_t payload_bytes_per_tile = sizeof(Uninitialized); + /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_status(nullptr) @@ -810,25 +876,12 @@ struct ScanTileState do { void* allocations[3] = {}; - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // Compute allocation pointers into the single storage blob - error = CubDebug(detail::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - + error = detail::tile_state_init( + description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations); if (cudaSuccess != error) { break; } - // Alias the offsets d_tile_status = reinterpret_cast(allocations[0]); d_tile_partial = reinterpret_cast(allocations[1]); @@ -847,25 +900,13 @@ struct ScanTileState * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t + AllocationSize(int num_tiles, size_t& temp_storage_bytes) { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // Set the necessary size of the blob - void* allocations[3] = {}; - return CubDebug(detail::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes)); + temp_storage_bytes = + detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); + return cudaSuccess; } - /** * Initialize (from device) */ From 08d443eb3ccd45d75f5830a0591b66becb17cf7b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 1 Feb 2025 05:18:59 -0800 Subject: [PATCH 13/27] Update c.parallel scan_tile_state following c++ refactor --- c/parallel/src/scan.cu | 178 ++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 120 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 33e5ea9cbb9..e4e23cd0f09 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -160,9 +160,11 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu init_t); } +static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)"; + size_t find_size_t(char* ptx, std::string_view name) { - std::regex regex(std::format(R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)", name)); + std::regex regex(std::format(ptx_u64_assignment_regex, name)); std::cmatch match; if (std::regex_search(ptx, match, regex)) { @@ -175,6 +177,20 @@ size_t find_size_t(char* ptx, std::string_view name) } } +size_t find_size_t(char* ptx, std::string_view name, size_t default_value) +{ + std::regex regex(std::format(ptx_u64_assignment_regex, name)); + std::cmatch match; + if (std::regex_search(ptx, match, regex)) + { + return std::stoi(match[1].str()); + } + else + { + return default_value; + } +} + struct scan_tile_state { // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except @@ -188,95 +204,38 @@ struct scan_tile_state void* d_tile_partial; void* d_tile_inclusive; - bool is_primitive; - size_t status_size; - size_t uninitialized_size; + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; - scan_tile_state(bool is_primitive, size_t status_size, size_t uninitialized_size) + scan_tile_state(size_t description_bytes_per_tile, size_t payload_bytes_per_tile) : d_tile_status(nullptr) , d_tile_partial(nullptr) , d_tile_inclusive(nullptr) - , is_primitive(is_primitive) - , status_size(status_size) - , uninitialized_size(uninitialized_size) + , description_bytes_per_tile(description_bytes_per_tile) + , payload_bytes_per_tile(payload_bytes_per_tile) {} cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) { - return is_primitive ? InitPrimitive(num_tiles, d_temp_storage, temp_storage_bytes) - : InitStorage(num_tiles, d_temp_storage, temp_storage_bytes); - } - - cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const - { - return is_primitive ? AllocationSizePrimitive(num_tiles, temp_storage_bytes) - : AllocationSizeStorage(num_tiles, temp_storage_bytes); - } - - cudaError_t InitPrimitive(int, void* d_temp_storage, size_t) - { - d_tile_status = d_temp_storage; + void* allocations[3] = {}; + auto status = cub::detail::tile_state_init( + description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations); + if (status != cudaSuccess) + { + return status; + } + d_tile_status = allocations[0]; + d_tile_partial = allocations[1]; + d_tile_inclusive = allocations[2]; return cudaSuccess; } - cudaError_t AllocationSizePrimitive(int num_tiles, size_t& d_temp_storage_bytes) const + cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const { - d_temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * status_size; + temp_storage_bytes = + cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); return cudaSuccess; } - - cudaError_t InitStorage(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) - { - cudaError_t error = cudaSuccess; - do - { - void* allocations[3] = {}; - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size; - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; - - // Compute allocation pointers into the single storage blob - error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - - if (cudaSuccess != error) - { - break; - } - - // Alias the offsets - d_tile_status = allocations[0]; - d_tile_partial = allocations[1]; - d_tile_inclusive = allocations[2]; - } while (0); - - return error; - } - - cudaError_t AllocationSizeStorage(int num_tiles, size_t& d_temp_storage_bytes) const - { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * status_size; - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * uninitialized_size; - - // Set the necessary size of the blob - void* allocations[3] = {}; - return CubDebug(cub::AliasTemporaries(nullptr, d_temp_storage_bytes, allocations, allocation_sizes)); - } }; template @@ -444,50 +403,29 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( constexpr size_t num_ptx_lto_args = 3; const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"}; - size_t status_size{0}; - size_t uninitialized_size{0}; - if (accum_t.type == cccl_type_enum::STORAGE) - { - std::string src = std::format( - "#include \n" - "#include \n" - "struct __align__({1}) storage_t {{\n" - " char data[{0}];\n" - "}};\n" - "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{2}>::StatusWord);\n" - "__device__ size_t uninitialized_size = sizeof(cub::Uninitialized<{2}>);\n", - accum_t.size, - accum_t.alignment, - accum_cpp); - auto compile_result = - make_nvrtc_command_list() - .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"}) - .compile_program({ptx_args, num_ptx_args}) - .cleanup_program() - .finalize_program(num_ptx_lto_args, ptx_lopts); - auto ptx_code = compile_result.cubin.get(); - status_size = scan::find_size_t(ptx_code, "status_size"); - uninitialized_size = scan::find_size_t(ptx_code, "uninitialized_size"); - } - else - { - std::string src = std::format( - "#include \n" - "#include \n" - "__device__ size_t status_size = sizeof(typename cub::ScanTileState<{0}>::TxnWord);\n", - accum_cpp); - auto compile_result = - make_nvrtc_command_list() - .add_program(nvrtc_translation_unit{src.c_str(), "tile_state_info"}) - .compile_program({ptx_args, num_ptx_args}) - .cleanup_program() - .finalize_program(num_ptx_lto_args, ptx_lopts); - auto ptx_code = compile_result.cubin.get(); - status_size = scan::find_size_t(ptx_code, "status_size"); - } - - bool is_primitive = not(accum_t.type == cccl_type_enum::STORAGE); - auto tile_state = std::make_unique(is_primitive, status_size, uninitialized_size); + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; + std::string ptx_src = std::format( + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "__device__ size_t description_bytes_per_tile = cub::ScanTileState<{2}>::description_bytes_per_tile;\n" + "__device__ size_t payload_bytes_per_tile = cub::ScanTileState<{2}>::payload_bytes_per_tile;\n", + accum_t.size, + accum_t.alignment, + accum_cpp); + auto compile_result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{ptx_src.c_str(), "tile_state_info"}) + .compile_program({ptx_args, num_ptx_args}) + .cleanup_program() + .finalize_program(num_ptx_lto_args, ptx_lopts); + auto ptx_code = compile_result.cubin.get(); + description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile"); + payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile", 0); + auto tile_state = std::make_unique(description_bytes_per_tile, payload_bytes_per_tile); build->cc = cc; build->cubin = (void*) result.cubin.release(); From 4060f59581a26dd08219489f1f901dac3747c7bd Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 3 Feb 2025 06:56:27 -0500 Subject: [PATCH 14/27] Update cub/cub/thread/thread_store.cuh Co-authored-by: Bernhard Manfred Gruber --- cub/cub/thread/thread_store.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index d4859c2c174..176032df891 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -358,8 +358,7 @@ ThreadStore(T* ptr, T val, detail::constant_t /*modifier*/, ::cuda::st template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) { - ThreadStore( - itr, val, detail::constant_v, detail::bool_constant_v<::cuda::std::is_pointer_v>); + ThreadStore(itr, val, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } #endif // _CCCL_DOXYGEN_INVOKED From 0a9b9b51955ef1b2a1d23f7ed7c23a0b0ea1d17d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 3 Feb 2025 10:25:02 -0500 Subject: [PATCH 15/27] No initialize-then-modify --- cub/cub/agent/single_pass_scan_operators.cuh | 39 +++++++------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 7c3bbc3677e..a23487aed97 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -549,20 +549,14 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(i _CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles) { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - int num_tile_states = num_tiles_to_num_tile_states(num_tiles); - - // bytes needed for tile status descriptors - allocation_sizes[0] = num_tile_states * bytes_per_description; - - // bytes needed for partials - allocation_sizes[1] = num_tile_states * bytes_per_payload; - - // bytes needed for inclusives - allocation_sizes[2] = num_tile_states * bytes_per_payload; - + size_t allocation_sizes[]{ + // bytes needed for tile status descriptors + static_cast(num_tile_states * bytes_per_description), + // bytes needed for partials + static_cast(num_tile_states * bytes_per_payload), + // bytes needed for inclusives + static_cast(num_tile_states * bytes_per_payload)}; // Set the necessary size of the blob size_t temp_storage_bytes = 0; void* allocations[3] = {}; @@ -579,19 +573,14 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( size_t temp_storage_bytes, void* (&allocations)[3]) { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - int num_tile_states = num_tiles_to_num_tile_states(num_tiles); - - // bytes needed for tile status descriptors - allocation_sizes[0] = num_tile_states * bytes_per_description; - - // bytes needed for partials - allocation_sizes[1] = num_tile_states * bytes_per_payload; - - // bytes needed for inclusives - allocation_sizes[2] = num_tile_states * bytes_per_payload; + size_t allocation_sizes[]{ + // bytes needed for tile status descriptors + static_cast(num_tile_states * bytes_per_description), + // bytes needed for partials + static_cast(num_tile_states * bytes_per_payload), + // bytes needed for inclusives + static_cast(num_tile_states * bytes_per_payload)}; // Set the necessary size of the blob AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); From 39d6a1637d43b3eac0c513b886f5cb6e4e3f4bf5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 4 Feb 2025 11:28:57 -0500 Subject: [PATCH 16/27] Use enum rather than bool --- c/parallel/src/scan.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index e4e23cd0f09..1eb86eb64ab 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -471,7 +471,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan( ::cuda::std::size_t, void, scan::dynamic_scan_policy_t<&scan::get_policy>, - false, + cub::ForceInclusive::No, scan::scan_kernel_source, cub::detail::CudaDriverLauncherFactory>:: Dispatch( From 02f6512ff4a764a75c3354e52f3b3b2dbf817f69 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 4 Feb 2025 12:51:40 -0500 Subject: [PATCH 17/27] Return a std::optional from find_size_t --- c/parallel/src/scan.cu | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 1eb86eb64ab..529364d4226 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -162,7 +163,7 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)"; -size_t find_size_t(char* ptx, std::string_view name) +std::optional find_size_t(char* ptx, std::string_view name) { std::regex regex(std::format(ptx_u64_assignment_regex, name)); std::cmatch match; @@ -171,24 +172,7 @@ size_t find_size_t(char* ptx, std::string_view name) auto result = std::stoi(match[1].str()); return result; } - else - { - throw std::runtime_error(std::format("Could not find {} in PTX code", name)); - } -} - -size_t find_size_t(char* ptx, std::string_view name, size_t default_value) -{ - std::regex regex(std::format(ptx_u64_assignment_regex, name)); - std::cmatch match; - if (std::regex_search(ptx, match, regex)) - { - return std::stoi(match[1].str()); - } - else - { - return default_value; - } + return std::nullopt; } struct scan_tile_state @@ -403,8 +387,6 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( constexpr size_t num_ptx_lto_args = 3; const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"}; - size_t description_bytes_per_tile; - size_t payload_bytes_per_tile; std::string ptx_src = std::format( "#include \n" "#include \n" @@ -422,9 +404,21 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( .compile_program({ptx_args, num_ptx_args}) .cleanup_program() .finalize_program(num_ptx_lto_args, ptx_lopts); - auto ptx_code = compile_result.cubin.get(); - description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile"); - payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile", 0); + auto ptx_code = compile_result.cubin.get(); + + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; + auto maybe_description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile"); + if (maybe_description_bytes_per_tile) + { + description_bytes_per_tile = maybe_description_bytes_per_tile.value(); + } + else + { + throw std::runtime_error("Failed to find description_bytes_per_tile in PTX"); + } + payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0); + auto tile_state = std::make_unique(description_bytes_per_tile, payload_bytes_per_tile); build->cc = cc; From 2f2cea1f6d54488c102e604bc1e54df1cf34d715 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 4 Feb 2025 12:52:43 -0500 Subject: [PATCH 18/27] Annotate arguments with their positions --- c/parallel/src/scan.cu | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 529364d4226..8632aef9664 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -149,16 +149,16 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); return std::format( "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", - chained_policy_t, - input_iterator_t, - output_iterator_t, - tile_state_t, - scan_op_t, - init_t, - offset_t, - accum_cpp_t, - "false", // for now, always exclusive - init_t); + chained_policy_t, // 0 + input_iterator_t, // 1 + output_iterator_t, // 2 + tile_state_t, // 3 + scan_op_t, // 4 + init_t, // 5 + offset_t, // 6 + accum_cpp_t, // 7 + "false", // 8 - for now, always exclusive + init_t); // 9 } static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)"; From 6ab362875f473a051cf00d41196040f017b90238 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 4 Feb 2025 12:54:17 -0500 Subject: [PATCH 19/27] Minor improvements to command_list --- c/parallel/src/nvrtc/command_list.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index 719d9c5bf88..f1003dcd18c 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -156,16 +156,16 @@ struct nvrtc_command_list_visitor check(jitlink_error); - bool ptx = false; - auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size); + bool output_ptx = false; + auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size); if (result != NVJITLINK_SUCCESS) { - ptx = true; - result = nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size); + output_ptx = true; + check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size)); } cleanup.cubin_ref.cubin = std::unique_ptr(new char[cleanup.cubin_ref.size]); - if (ptx) + if (output_ptx) { check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.cubin_ref.cubin.get())); } From ff416380eafce0d565eb5a04e5ab701ef451c91e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 4 Feb 2025 12:59:15 -0500 Subject: [PATCH 20/27] Rename cubin->link_result --- c/parallel/src/for.cu | 6 +++--- c/parallel/src/nvrtc/command_list.h | 24 ++++++++++++------------ c/parallel/src/reduce.cu | 6 +++--- c/parallel/src/scan.cu | 8 ++++---- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/c/parallel/src/for.cu b/c/parallel/src/for.cu index 0abfb2a522c..2a5f412d43b 100644 --- a/c/parallel/src/for.cu +++ b/c/parallel/src/for.cu @@ -111,7 +111,7 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build( .cleanup_program() .add_link({op.ltoir, op.ltoir_size}); - nvrtc_cubin result{}; + nvrtc_link_result result{}; if (cccl_iterator_kind_t::iterator == d_data.type) { @@ -124,11 +124,11 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build( result = cl.finalize_program(num_lto_args, lopts); } - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; } catch (...) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index f1003dcd18c..a3d3724b35f 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -22,9 +22,9 @@ #include #include -struct nvrtc_cubin +struct nvrtc_link_result { - std::unique_ptr cubin{}; + std::unique_ptr data{}; size_t size; }; @@ -57,7 +57,7 @@ struct nvrtc_ltoir using nvrtc_ltoir_list = std::vector; struct nvrtc_jitlink_cleanup { - nvrtc_cubin& cubin_ref; + nvrtc_link_result& link_result_ref; }; struct nvrtc_jitlink @@ -157,21 +157,21 @@ struct nvrtc_command_list_visitor check(jitlink_error); bool output_ptx = false; - auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size); + auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.link_result_ref.size); if (result != NVJITLINK_SUCCESS) { output_ptx = true; - check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.cubin_ref.size)); + check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.link_result_ref.size)); } - cleanup.cubin_ref.cubin = std::unique_ptr(new char[cleanup.cubin_ref.size]); + cleanup.link_result_ref.data = std::unique_ptr(new char[cleanup.link_result_ref.size]); if (output_ptx) { - check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.cubin_ref.cubin.get())); + check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.link_result_ref.data.get())); } else { - check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get())); + check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.link_result_ref.data.get())); } } }; @@ -245,13 +245,13 @@ struct nvrtc_sm_top_level } // Execute steps and link unit - nvrtc_cubin finalize_program(uint32_t numLtoOpts, const char** ltoOpts) + nvrtc_link_result finalize_program(uint32_t numLtoOpts, const char** ltoOpts) { - nvrtc_cubin cubin{}; - nvrtc_jitlink_cleanup cleanup{cubin}; + nvrtc_link_result link_result{}; + nvrtc_jitlink_cleanup cleanup{link_result}; nvrtc_jitlink jl(numLtoOpts, ltoOpts); std::apply(nvrtc_command_list_visitor{jl}, nvrtc_command_list_append(std::move(cl), std::move(cleanup))); - return cubin; + return link_result; } }; diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index f39c8a9f944..bc38090bcc2 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -348,7 +348,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({single_tile_kernel_name}) @@ -362,14 +362,14 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name.c_str())); check(cuLibraryGetKernel( &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; build->accumulator_size = accum_t.size; } diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 8632aef9664..45201f4b760 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -366,7 +366,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({init_kernel_name}) @@ -378,7 +378,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str())); @@ -404,7 +404,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( .compile_program({ptx_args, num_ptx_args}) .cleanup_program() .finalize_program(num_ptx_lto_args, ptx_lopts); - auto ptx_code = compile_result.cubin.get(); + auto ptx_code = compile_result.data.get(); size_t description_bytes_per_tile; size_t payload_bytes_per_tile; @@ -422,7 +422,7 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( auto tile_state = std::make_unique(description_bytes_per_tile, payload_bytes_per_tile); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; build->accumulator_type = accum_t; build->tile_state = (void*) tile_state.release(); From c4102fdfeddb6800e6faed589ac69a41a30d9e1a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 4 Feb 2025 14:08:51 -0500 Subject: [PATCH 21/27] Add a TODO for removing extra compile step --- c/parallel/src/scan.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 45201f4b760..a81e428439a 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -161,6 +161,13 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu init_t); // 9 } +// TODO: NVRTC doesn't currently support extracting basic type +// information (e.g., type sizes and alignments) from compiled +// LTO-IR. So we separately compile a small PTX file that defines the +// necessary types and constants and grep it for the required +// information. If/when NVRTC adds these features, we can remove this +// extra compilation step and get the information directly from the +// LTO-IR. static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)"; std::optional find_size_t(char* ptx, std::string_view name) From e3a2e7596ab6e228297c68d73a9c56bef4dcfb46 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 7 Feb 2025 06:58:16 -0500 Subject: [PATCH 22/27] Bad merge --- cub/cub/thread/thread_store.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index 176032df891..d4859c2c174 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -358,7 +358,8 @@ ThreadStore(T* ptr, T val, detail::constant_t /*modifier*/, ::cuda::st template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) { - ThreadStore(itr, val, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); + ThreadStore( + itr, val, detail::constant_v, detail::bool_constant_v<::cuda::std::is_pointer_v>); } #endif // _CCCL_DOXYGEN_INVOKED From fd63f70abe633faf2c37b477e622fd6d07746306 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 7 Feb 2025 07:28:51 -0500 Subject: [PATCH 23/27] Pass thrust path to PTX compile step --- c/parallel/src/scan.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index a81e428439a..1777876182f 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -389,8 +389,9 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str())); - constexpr size_t num_ptx_args = 5; - const char* ptx_args[num_ptx_args] = {arch.c_str(), cub_path, libcudacxx_path, "-rdc=true", "-dlto"}; + constexpr size_t num_ptx_args = 7; + const char* ptx_args[num_ptx_args] = { + arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"}; constexpr size_t num_ptx_lto_args = 3; const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"}; From 9fe8dfed5fa5660f5b0426313fa6aeb10724d28b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 10 Feb 2025 10:43:45 -0500 Subject: [PATCH 24/27] Fixes following merge from main --- c/parallel/src/kernels/operators.cpp | 2 +- c/parallel/src/kernels/operators.h | 2 +- c/parallel/src/merge_sort.cu | 6 +++--- c/parallel/src/reduce.cu | 2 +- c/parallel/src/scan.cu | 8 +++++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/c/parallel/src/kernels/operators.cpp b/c/parallel/src/kernels/operators.cpp index 87dc5ce6d30..bbde349993e 100644 --- a/c/parallel/src/kernels/operators.cpp +++ b/c/parallel/src/kernels/operators.cpp @@ -71,7 +71,7 @@ make_kernel_binary_operator_full_source(std::string_view input_t, cccl_op_t oper : std::format(stateful_binary_op_template, return_type)); } -std::string make_kernel_user_arithmetic_operator(std::string_view input_t, cccl_op_t operation) +std::string make_kernel_user_binary_operator(std::string_view input_t, cccl_op_t operation) { return make_kernel_binary_operator_full_source(input_t, operation, "VALUE_T"); } diff --git a/c/parallel/src/kernels/operators.h b/c/parallel/src/kernels/operators.h index 2e8e11df39e..2e269857572 100644 --- a/c/parallel/src/kernels/operators.h +++ b/c/parallel/src/kernels/operators.h @@ -14,6 +14,6 @@ #include -std::string make_kernel_user_arithmetic_operator(std::string_view input_value_t, cccl_op_t operation); +std::string make_kernel_user_binary_operator(std::string_view input_value_t, cccl_op_t operation); std::string make_kernel_user_comparison_operator(std::string_view input_value_t, cccl_op_t operation); diff --git a/c/parallel/src/merge_sort.cu b/c/parallel/src/merge_sort.cu index 4ff17376e4f..edb03ceca17 100644 --- a/c/parallel/src/merge_sort.cu +++ b/c/parallel/src/merge_sort.cu @@ -410,7 +410,7 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build( ltoir_list_append({output_items_it.dereference.ltoir, output_items_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({block_sort_kernel_name}) @@ -424,13 +424,13 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->block_sort_kernel, build->library, block_sort_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->partition_kernel, build->library, partition_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->merge_kernel, build->library, merge_kernel_lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; } catch (const std::exception& exc) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index bc38090bcc2..09ee9268e92 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -273,7 +273,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( const std::string output_iterator_src = make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it); - const std::string op_src = make_kernel_user_arithmetic_operator(accum_cpp, op); + const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); const std::string src = std::format( "#include \n" diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 1777876182f..844097a4867 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -294,8 +294,10 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); - const std::string input_iterator_src = make_kernel_input_iterator(offset_t, input_it_value_t, input_it); - const std::string output_iterator_src = make_kernel_output_iterator(offset_t, accum_cpp, output_it); + const std::string input_iterator_src = + make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it); + const std::string output_iterator_src = + make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it); const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); @@ -472,8 +474,8 @@ extern "C" CCCL_C_API CUresult cccl_device_scan( indirect_arg_t, ::cuda::std::size_t, void, - scan::dynamic_scan_policy_t<&scan::get_policy>, cub::ForceInclusive::No, + scan::dynamic_scan_policy_t<&scan::get_policy>, scan::scan_kernel_source, cub::detail::CudaDriverLauncherFactory>:: Dispatch( From f0b1ed8c86ccc9219c301f50f364d844462d77d5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 10 Feb 2025 19:25:34 -0500 Subject: [PATCH 25/27] Return error from AliasTemporaries --- cub/cub/agent/single_pass_scan_operators.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index a23487aed97..2c41f8f6036 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -583,9 +583,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( static_cast(num_tile_states * bytes_per_payload)}; // Set the necessary size of the blob - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - - return cudaSuccess; + return AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); } } // namespace detail From f20219b479501b55066c5ecc2f7767b108fbe53c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 10 Feb 2025 19:25:42 -0500 Subject: [PATCH 26/27] Fix SFINAE --- cub/cub/device/dispatch/kernels/scan.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh index 7cb71ca49e7..b5b89d68756 100644 --- a/cub/cub/device/dispatch/kernels/scan.cuh +++ b/cub/cub/device/dispatch/kernels/scan.cuh @@ -162,8 +162,7 @@ template , InitValueT, typename InitValueT::value_type>> + typename RealInitValueT = typename InitValueT::value_type> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( InputIteratorT d_in, From 7a83fbc990af0395fde43721a72f31d931d8f0fc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 10 Feb 2025 19:25:55 -0500 Subject: [PATCH 27/27] Store description/payload bytes_per_tile directly in the build obj --- c/parallel/include/cccl/c/scan.h | 3 ++- c/parallel/src/scan.cu | 20 +++++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h index df9f09588a5..7dc923df7e0 100644 --- a/c/parallel/include/cccl/c/scan.h +++ b/c/parallel/include/cccl/c/scan.h @@ -27,7 +27,8 @@ struct cccl_device_scan_build_result_t cccl_type_info accumulator_type; CUkernel init_kernel; CUkernel scan_kernel; - void* tile_state; + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; }; extern "C" CCCL_C_API CUresult cccl_device_scan_build( diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 844097a4867..d5209a58046 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -261,8 +261,7 @@ struct scan_kernel_source } scan_tile_state TileState() { - auto result = (reinterpret_cast(build.tile_state)); - return *result; + return {build.description_bytes_per_tile, build.payload_bytes_per_tile}; } }; @@ -429,13 +428,12 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_build( } payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0); - auto tile_state = std::make_unique(description_bytes_per_tile, payload_bytes_per_tile); - - build->cc = cc; - build->cubin = (void*) result.data.release(); - build->cubin_size = result.size; - build->accumulator_type = accum_t; - build->tile_state = (void*) tile_state.release(); + build->cc = cc; + build->cubin = (void*) result.data.release(); + build->cubin_size = result.size; + build->accumulator_type = accum_t; + build->description_bytes_per_tile = description_bytes_per_tile; + build->payload_bytes_per_tile = payload_bytes_per_tile; } catch (const std::exception& exc) { @@ -519,12 +517,8 @@ extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_r { return CUDA_ERROR_INVALID_VALUE; } - std::unique_ptr cubin(reinterpret_cast(bld_ptr->cubin)); check(cuLibraryUnload(bld_ptr->library)); - - std::unique_ptr tile_state(reinterpret_cast(bld_ptr->tile_state)); - tile_state.reset(); } catch (const std::exception& exc) {