diff --git a/c/parallel/include/cccl/c/reduce.h b/c/parallel/include/cccl/c/reduce.h
index f1e875087a7..2366024a532 100644
--- a/c/parallel/include/cccl/c/reduce.h
+++ b/c/parallel/include/cccl/c/reduce.h
@@ -55,4 +55,4 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
   cccl_value_t init,
   CUstream stream) noexcept;
 
-extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept;
diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h
new file mode 100644
index 00000000000..7dc923df7e0
--- /dev/null
+++ b/c/parallel/include/cccl/c/scan.h
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA Core Compute Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
+#endif // !CCCL_C_EXPERIMENTAL
+
+#include <cuda.h>
+
+#include <cccl/c/types.h>
+
+struct cccl_device_scan_build_result_t
+{
+  int cc;
+  void* cubin;
+  size_t cubin_size;
+  CUlibrary library;
+  cccl_type_info accumulator_type;
+  CUkernel init_kernel;
+  CUkernel scan_kernel;
+  size_t description_bytes_per_tile;
+  size_t payload_bytes_per_tile;
+};
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_build(
+  cccl_device_scan_build_result_t* build,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_scan(
+  cccl_device_scan_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept;
diff --git a/c/parallel/src/for.cu b/c/parallel/src/for.cu
index 0abfb2a522c..2a5f412d43b 100644
--- a/c/parallel/src/for.cu
+++ b/c/parallel/src/for.cu
@@ -111,7 +111,7 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
         .cleanup_program()
         .add_link({op.ltoir, op.ltoir_size});
 
-    nvrtc_cubin result{};
+    nvrtc_link_result result{};
 
     if (cccl_iterator_kind_t::iterator == d_data.type)
     {
@@ -124,11 +124,11 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
       result = cl.finalize_program(num_lto_args, lopts);
     }
 
-    cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
+    cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
     check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));
 
     build->cc         = cc;
-    build->cubin      = (void*) result.cubin.release();
+    build->cubin      = (void*) result.data.release();
     build->cubin_size = result.size;
   }
   catch (...)
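For orientation, here is a minimal host-side sketch (not part of this patch) of the intended build → size-query → launch → cleanup lifecycle for the new scan entry points, mirroring the pattern test_scan.cpp uses further down. The wrapper function name and the assumption that the caller has already prepared the cccl_iterator_t / cccl_op_t / cccl_value_t descriptors and the header search paths are illustrative only.

```cpp
// Sketch only: assumes the caller has already built the iterator/op/init
// descriptors and knows the CUB/Thrust/libcu++/CTK include paths.
#define CCCL_C_EXPERIMENTAL 1
#include <cccl/c/scan.h>

#include <cuda.h>

CUresult run_scan(cccl_iterator_t d_in, cccl_iterator_t d_out, unsigned long long num_items,
                  cccl_op_t op, cccl_value_t init, int cc_major, int cc_minor,
                  const char* cub_path, const char* thrust_path,
                  const char* libcudacxx_path, const char* ctk_path, CUstream stream)
{
  // 1. JIT-compile and link the init/scan kernels once per iterator/op/init combination.
  cccl_device_scan_build_result_t build{};
  CUresult status = cccl_device_scan_build(
    &build, d_in, d_out, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path);
  if (status != CUDA_SUCCESS) { return status; }

  // 2. A call with a null temp-storage pointer only reports the required size.
  size_t temp_storage_bytes = 0;
  status = cccl_device_scan(build, nullptr, &temp_storage_bytes, d_in, d_out, num_items, op, init, stream);

  // 3. Allocate the scratch space and run the scan for real.
  CUdeviceptr d_temp_storage = 0;
  if (status == CUDA_SUCCESS) { status = cuMemAlloc(&d_temp_storage, temp_storage_bytes); }
  if (status == CUDA_SUCCESS)
  {
    status = cccl_device_scan(build, reinterpret_cast<void*>(d_temp_storage), &temp_storage_bytes,
                              d_in, d_out, num_items, op, init, stream);
  }

  // 4. Release the scratch buffer and the cubin/library owned by the build result.
  if (d_temp_storage) { cuMemFree(d_temp_storage); }
  cccl_device_scan_cleanup(&build);
  return status;
}
```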
diff --git a/c/parallel/src/kernels/operators.cpp b/c/parallel/src/kernels/operators.cpp index 87dc5ce6d30..bbde349993e 100644 --- a/c/parallel/src/kernels/operators.cpp +++ b/c/parallel/src/kernels/operators.cpp @@ -71,7 +71,7 @@ make_kernel_binary_operator_full_source(std::string_view input_t, cccl_op_t oper : std::format(stateful_binary_op_template, return_type)); } -std::string make_kernel_user_arithmetic_operator(std::string_view input_t, cccl_op_t operation) +std::string make_kernel_user_binary_operator(std::string_view input_t, cccl_op_t operation) { return make_kernel_binary_operator_full_source(input_t, operation, "VALUE_T"); } diff --git a/c/parallel/src/kernels/operators.h b/c/parallel/src/kernels/operators.h index 2e8e11df39e..2e269857572 100644 --- a/c/parallel/src/kernels/operators.h +++ b/c/parallel/src/kernels/operators.h @@ -14,6 +14,6 @@ #include -std::string make_kernel_user_arithmetic_operator(std::string_view input_value_t, cccl_op_t operation); +std::string make_kernel_user_binary_operator(std::string_view input_value_t, cccl_op_t operation); std::string make_kernel_user_comparison_operator(std::string_view input_value_t, cccl_op_t operation); diff --git a/c/parallel/src/merge_sort.cu b/c/parallel/src/merge_sort.cu index 4ff17376e4f..edb03ceca17 100644 --- a/c/parallel/src/merge_sort.cu +++ b/c/parallel/src/merge_sort.cu @@ -410,7 +410,7 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build( ltoir_list_append({output_items_it.dereference.ltoir, output_items_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({block_sort_kernel_name}) @@ -424,13 +424,13 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->block_sort_kernel, build->library, block_sort_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->partition_kernel, build->library, partition_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->merge_kernel, build->library, merge_kernel_lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; } catch (const std::exception& exc) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index 77d83e86d47..a3d3724b35f 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -22,9 +22,9 @@ #include #include -struct nvrtc_cubin +struct nvrtc_link_result { - std::unique_ptr cubin{}; + std::unique_ptr data{}; size_t size; }; @@ -57,7 +57,7 @@ struct nvrtc_ltoir using nvrtc_ltoir_list = std::vector; struct nvrtc_jitlink_cleanup { - nvrtc_cubin& cubin_ref; + nvrtc_link_result& link_result_ref; }; struct nvrtc_jitlink @@ -156,9 +156,23 @@ struct nvrtc_command_list_visitor check(jitlink_error); - check(nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size)); - cleanup.cubin_ref.cubin = std::unique_ptr(new char[cleanup.cubin_ref.size]); - check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get())); + bool output_ptx = false; + auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, 
&cleanup.link_result_ref.size); + if (result != NVJITLINK_SUCCESS) + { + output_ptx = true; + check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.link_result_ref.size)); + } + cleanup.link_result_ref.data = std::unique_ptr(new char[cleanup.link_result_ref.size]); + + if (output_ptx) + { + check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.link_result_ref.data.get())); + } + else + { + check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.link_result_ref.data.get())); + } } }; @@ -231,13 +245,13 @@ struct nvrtc_sm_top_level } // Execute steps and link unit - nvrtc_cubin finalize_program(uint32_t numLtoOpts, const char** ltoOpts) + nvrtc_link_result finalize_program(uint32_t numLtoOpts, const char** ltoOpts) { - nvrtc_cubin cubin{}; - nvrtc_jitlink_cleanup cleanup{cubin}; + nvrtc_link_result link_result{}; + nvrtc_jitlink_cleanup cleanup{link_result}; nvrtc_jitlink jl(numLtoOpts, ltoOpts); std::apply(nvrtc_command_list_visitor{jl}, nvrtc_command_list_append(std::move(cl), std::move(cleanup))); - return cubin; + return link_result; } }; diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index da866bd6a76..09ee9268e92 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -40,6 +40,12 @@ static_assert(std::is_same_v, OffsetT>, "O struct nothing_t {}; +struct input_iterator_state_t; +struct output_iterator_t; + +namespace reduce +{ + struct reduce_runtime_tuning_policy { int block_size; @@ -107,9 +113,6 @@ static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*i return init.type; } -struct input_iterator_state_t; -struct output_iterator_t; - std::string get_input_iterator_name() { std::string iterator_t; @@ -201,6 +204,44 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it transform_op_t); } +template +struct dynamic_reduce_policy_t +{ + using MaxPolicy = dynamic_reduce_policy_t; + + template + cudaError_t Invoke(int device_ptx_version, F& op) + { + return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); + } + + cccl_type_info accumulator_type; +}; + +struct reduce_kernel_source +{ + cccl_device_reduce_build_result_t& build; + + std::size_t AccumSize() const + { + return build.accumulator_size; + } + CUkernel SingleTileKernel() const + { + return build.single_tile_kernel; + } + CUkernel SingleTileSecondKernel() const + { + return build.single_tile_second_kernel; + } + CUkernel ReductionKernel() const + { + return build.reduction_kernel; + } +}; + +} // namespace reduce + extern "C" CCCL_C_API CUresult cccl_device_reduce_build( cccl_device_reduce_build_result_t* build, cccl_iterator_t input_it, @@ -220,19 +261,19 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; - const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); - const reduce_runtime_tuning_policy policy = get_policy(cc, accum_t); - const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); - const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); - const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = reduce::get_accumulator_type(op, input_it, init); + const auto policy = reduce::get_policy(cc, accum_t); + const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); + const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const auto offset_t = 
cccl_type_enum_to_string(cccl_type_enum::UINT64); const std::string input_iterator_src = make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it); const std::string output_iterator_src = make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it); - const std::string op_src = make_kernel_user_arithmetic_operator(accum_cpp, op); + const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); const std::string src = std::format( "#include \n" @@ -271,9 +312,10 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( fflush(stdout); #endif - std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false); - std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true); - std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init); + std::string single_tile_kernel_name = reduce::get_single_tile_kernel_name(input_it, output_it, op, init, false); + std::string single_tile_second_kernel_name = + reduce::get_single_tile_kernel_name(input_it, output_it, op, init, true); + std::string reduction_kernel_name = reduce::get_device_reduce_kernel_name(op, input_it, init); std::string single_tile_kernel_lowered_name; std::string single_tile_second_kernel_lowered_name; std::string reduction_kernel_lowered_name; @@ -306,7 +348,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); } - nvrtc_cubin result = + nvrtc_link_result result = make_nvrtc_command_list() .add_program(nvrtc_translation_unit{src.c_str(), name}) .add_expression({single_tile_kernel_name}) @@ -320,14 +362,14 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( .add_link_list(ltoir_list) .finalize_program(num_lto_args, lopts); - cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name.c_str())); check(cuLibraryGetKernel( &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name.c_str())); check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name.c_str())); build->cc = cc; - build->cubin = (void*) result.cubin.release(); + build->cubin = (void*) result.data.release(); build->cubin_size = result.size; build->accumulator_size = accum_t.size; } @@ -342,42 +384,6 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce_build( return error; } -template -struct dynamic_reduce_policy_t -{ - using MaxPolicy = dynamic_reduce_policy_t; - - template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); - } - - cccl_type_info accumulator_type; -}; - -struct reduce_kernel_source -{ - cccl_device_reduce_build_result_t& build; - - std::size_t AccumSize() const - { - return build.accumulator_size; - } - CUkernel SingleTileKernel() const - { - return build.single_tile_kernel; - } - CUkernel SingleTileSecondKernel() const - { - return build.single_tile_second_kernel; - } - CUkernel ReductionKernel() const - { - return build.reduction_kernel; - } -}; - extern "C" CCCL_C_API CUresult cccl_device_reduce( cccl_device_reduce_build_result_t build, void* d_temp_storage, @@ -404,9 +410,9 @@ extern "C" 
CCCL_C_API CUresult cccl_device_reduce( indirect_arg_t, indirect_arg_t, void, - dynamic_reduce_policy_t<&get_policy>, + reduce::dynamic_reduce_policy_t<&reduce::get_policy>, ::cuda::std::__identity, - reduce_kernel_source, + reduce::reduce_kernel_source, cub::detail::CudaDriverLauncherFactory>:: Dispatch( d_temp_storage, @@ -420,7 +426,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( {}, {build}, cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - {get_accumulator_type(op, d_in, init)}); + {reduce::get_accumulator_type(op, d_in, init)}); } catch (const std::exception& exc) { @@ -439,7 +445,7 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce( return error; } -extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) +extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept { try { diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu new file mode 100644 index 00000000000..d5209a58046 --- /dev/null +++ b/c/parallel/src/scan.cu @@ -0,0 +1,532 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "cub/util_device.cuh" +#include "kernels/iterators.h" +#include "kernels/operators.h" +#include "util/context.h" +#include "util/errors.h" +#include "util/indirect_arg.h" +#include "util/types.h" +#include +#include +#include + +struct op_wrapper; +struct device_scan_policy; +using OffsetT = unsigned long long; +static_assert(std::is_same_v, OffsetT>, "OffsetT must be size_t"); + +struct input_iterator_state_t; +struct output_iterator_t; + +namespace scan +{ + +struct scan_runtime_tuning_policy +{ + int block_size; + int items_per_thread; + cub::CacheLoadModifier load_modifier; + + scan_runtime_tuning_policy Scan() const + { + return *this; + } + + int ItemsPerThread() const + { + return items_per_thread; + } + + int BlockThreads() const + { + return block_size; + } + + cub::CacheLoadModifier LoadModifier() const + { + return load_modifier; + } + + void CheckLoadModifier() const + { + if (LoadModifier() == cub::CacheLoadModifier::LOAD_LDG) + { + throw std::runtime_error("The memory consistency model does not apply to texture " + "accesses"); + } + } +}; + +template +Tuning find_tuning(int cc, const Tuning (&tunings)[N]) +{ + for (const Tuning& tuning : tunings) + { + if (cc >= tuning.cc) + { + return tuning; + } + } + + return tunings[N - 1]; +} + +scan_runtime_tuning_policy get_policy(int /*cc*/, cccl_type_info /*accumulator_type*/) +{ + // TODO: we should update this once we figure out a way to reuse + // tuning logic from C++. 
Alternately, we should implement + // something better than a hardcoded default: + return {128, 4, cub::LOAD_DEFAULT}; +} + +static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init) +{ + // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet + // so switching back to the old accumulator type logic for now + return init.type; +} + +std::string get_input_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName(&iterator_t)); + return iterator_t; +} + +std::string get_output_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName(&iterator_t)); + return iterator_t; +} + +std::string +get_init_kernel_name(cccl_iterator_t input_it, cccl_iterator_t /*output_it*/, cccl_op_t op, cccl_value_t init) +{ + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + return std::format("cub::detail::scan::DeviceScanInitKernel>", accum_cpp_t); +} + +std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName(&chained_policy_t)); + + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + const std::string input_iterator_t = + (input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : scan::get_input_iterator_name()); + const std::string output_iterator_t = + output_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(output_it.value_type.type, true) // + : scan::get_output_iterator_name(); + const std::string init_t = cccl_type_enum_to_name(init.type.type); + + std::string offset_t; + check(nvrtcGetTypeName(&offset_t)); + + std::string scan_op_t; + check(nvrtcGetTypeName(&scan_op_t)); + + auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); + return std::format( + "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", + chained_policy_t, // 0 + input_iterator_t, // 1 + output_iterator_t, // 2 + tile_state_t, // 3 + scan_op_t, // 4 + init_t, // 5 + offset_t, // 6 + accum_cpp_t, // 7 + "false", // 8 - for now, always exclusive + init_t); // 9 +} + +// TODO: NVRTC doesn't currently support extracting basic type +// information (e.g., type sizes and alignments) from compiled +// LTO-IR. So we separately compile a small PTX file that defines the +// necessary types and constants and grep it for the required +// information. If/when NVRTC adds these features, we can remove this +// extra compilation step and get the information directly from the +// LTO-IR. +static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)"; + +std::optional find_size_t(char* ptx, std::string_view name) +{ + std::regex regex(std::format(ptx_u64_assignment_regex, name)); + std::cmatch match; + if (std::regex_search(ptx, match, regex)) + { + auto result = std::stoi(match[1].str()); + return result; + } + return std::nullopt; +} + +struct scan_tile_state +{ + // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except + // that it accepts the acummulator type as a runtime parameter rather than being + // templated on it. 
+ // + // Both specializations ScanTileStateT and ScanTileStateT - where the + // bool parameter indicates whether `T` is primitive - are combined into a single type. + + void* d_tile_status; // d_tile_descriptors + void* d_tile_partial; + void* d_tile_inclusive; + + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; + + scan_tile_state(size_t description_bytes_per_tile, size_t payload_bytes_per_tile) + : d_tile_status(nullptr) + , d_tile_partial(nullptr) + , d_tile_inclusive(nullptr) + , description_bytes_per_tile(description_bytes_per_tile) + , payload_bytes_per_tile(payload_bytes_per_tile) + {} + + cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) + { + void* allocations[3] = {}; + auto status = cub::detail::tile_state_init( + description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations); + if (status != cudaSuccess) + { + return status; + } + d_tile_status = allocations[0]; + d_tile_partial = allocations[1]; + d_tile_inclusive = allocations[2]; + return cudaSuccess; + } + + cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const + { + temp_storage_bytes = + cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); + return cudaSuccess; + } +}; + +template +struct dynamic_scan_policy_t +{ + using MaxPolicy = dynamic_scan_policy_t; + + template + cudaError_t Invoke(int device_ptx_version, F& op) + { + return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); + } + + cccl_type_info accumulator_type; +}; + +struct scan_kernel_source +{ + cccl_device_scan_build_result_t& build; + + std::size_t AccumSize() const + { + return build.accumulator_type.size; + } + CUkernel InitKernel() const + { + return build.init_kernel; + } + CUkernel ScanKernel() const + { + return build.scan_kernel; + } + scan_tile_state TileState() + { + return {build.description_bytes_per_tile, build.payload_bytes_per_tile}; + } +}; + +} // namespace scan + +extern "C" CCCL_C_API CUresult cccl_device_scan_build( + cccl_device_scan_build_result_t* build, + cccl_iterator_t input_it, + cccl_iterator_t output_it, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept +{ + CUresult error = CUDA_SUCCESS; + + try + { + const char* name = "test"; + + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); + const auto policy = scan::get_policy(cc, accum_t); + const auto accum_cpp = cccl_type_enum_to_string(accum_t.type); + const auto input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const auto offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + + const std::string input_iterator_src = + make_kernel_input_iterator(offset_t, "input_iterator_state_t", input_it_value_t, input_it); + const std::string output_iterator_src = + make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it); + + const std::string op_src = make_kernel_user_binary_operator(accum_cpp, op); + + const std::string src = std::format( + "#include \n" + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "{4}\n" + "{5}\n" + "struct agent_policy_t {{\n" + " static constexpr int ITEMS_PER_THREAD = {2};\n" + " static constexpr int BLOCK_THREADS = {3};\n" + " static constexpr 
cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_WARP_TRANSPOSE;\n" + " static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;\n" + " static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = cub::BLOCK_STORE_WARP_TRANSPOSE;\n" + " static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS;\n" + " struct detail {{\n" + " using delay_constructor_t = cub::detail::default_delay_constructor_t<{7}>;\n" + " }};\n" + "}};\n" + "struct device_scan_policy {{\n" + " struct ActivePolicy {{\n" + " using ScanPolicyT = agent_policy_t;\n" + " }};\n" + "}};\n" + "{6};\n", + input_it.value_type.size, // 0 + input_it.value_type.alignment, // 1 + policy.items_per_thread, // 2 + policy.block_size, // 3 + input_iterator_src, // 4 + output_iterator_src, // 5 + op_src, // 6 + accum_cpp); // 7 + +#if false // CCCL_DEBUGGING_SWITCH + fflush(stderr); + printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", src.c_str()); + fflush(stdout); +#endif + + std::string init_kernel_name = scan::get_init_kernel_name(input_it, output_it, op, init); + std::string scan_kernel_name = scan::get_scan_kernel_name(input_it, output_it, op, init); + std::string init_kernel_lowered_name; + std::string scan_kernel_lowered_name; + + const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor); + + constexpr size_t num_args = 7; + const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"}; + + constexpr size_t num_lto_args = 2; + const char* lopts[num_lto_args] = {"-lto", arch.c_str()}; + + // Collect all LTO-IRs to be linked. + nvrtc_ltoir_list ltoir_list; + auto ltoir_list_append = [<oir_list](nvrtc_ltoir lto) { + if (lto.ltsz) + { + ltoir_list.push_back(std::move(lto)); + } + }; + ltoir_list_append({op.ltoir, op.ltoir_size}); + if (cccl_iterator_kind_t::iterator == input_it.type) + { + ltoir_list_append({input_it.advance.ltoir, input_it.advance.ltoir_size}); + ltoir_list_append({input_it.dereference.ltoir, input_it.dereference.ltoir_size}); + } + if (cccl_iterator_kind_t::iterator == output_it.type) + { + ltoir_list_append({output_it.advance.ltoir, output_it.advance.ltoir_size}); + ltoir_list_append({output_it.dereference.ltoir, output_it.dereference.ltoir_size}); + } + + nvrtc_link_result result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{src.c_str(), name}) + .add_expression({init_kernel_name}) + .add_expression({scan_kernel_name}) + .compile_program({args, num_args}) + .get_name({init_kernel_name, init_kernel_lowered_name}) + .get_name({scan_kernel_name, scan_kernel_lowered_name}) + .cleanup_program() + .add_link_list(ltoir_list) + .finalize_program(num_lto_args, lopts); + + cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); + check(cuLibraryGetKernel(&build->init_kernel, build->library, init_kernel_lowered_name.c_str())); + check(cuLibraryGetKernel(&build->scan_kernel, build->library, scan_kernel_lowered_name.c_str())); + + constexpr size_t num_ptx_args = 7; + const char* ptx_args[num_ptx_args] = { + arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"}; + constexpr size_t num_ptx_lto_args = 3; + const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"}; + + std::string ptx_src = std::format( + "#include \n" + "#include \n" + "struct __align__({1}) storage_t {{\n" + " char data[{0}];\n" + "}};\n" + "__device__ size_t description_bytes_per_tile = 
cub::ScanTileState<{2}>::description_bytes_per_tile;\n" + "__device__ size_t payload_bytes_per_tile = cub::ScanTileState<{2}>::payload_bytes_per_tile;\n", + accum_t.size, + accum_t.alignment, + accum_cpp); + auto compile_result = + make_nvrtc_command_list() + .add_program(nvrtc_translation_unit{ptx_src.c_str(), "tile_state_info"}) + .compile_program({ptx_args, num_ptx_args}) + .cleanup_program() + .finalize_program(num_ptx_lto_args, ptx_lopts); + auto ptx_code = compile_result.data.get(); + + size_t description_bytes_per_tile; + size_t payload_bytes_per_tile; + auto maybe_description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile"); + if (maybe_description_bytes_per_tile) + { + description_bytes_per_tile = maybe_description_bytes_per_tile.value(); + } + else + { + throw std::runtime_error("Failed to find description_bytes_per_tile in PTX"); + } + payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0); + + build->cc = cc; + build->cubin = (void*) result.data.release(); + build->cubin_size = result.size; + build->accumulator_type = accum_t; + build->description_bytes_per_tile = description_bytes_per_tile; + build->payload_bytes_per_tile = payload_bytes_per_tile; + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan_build(): %s\n", exc.what()); + fflush(stdout); + error = CUDA_ERROR_UNKNOWN; + } + + return error; +} + +extern "C" CCCL_C_API CUresult cccl_device_scan( + cccl_device_scan_build_result_t build, + void* d_temp_storage, + size_t* temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + CUstream stream) noexcept +{ + bool pushed = false; + CUresult error = CUDA_SUCCESS; + try + { + pushed = try_push_context(); + + CUdevice cu_device; + check(cuCtxGetDevice(&cu_device)); + auto cuda_error = cub::DispatchScan< + indirect_arg_t, + indirect_arg_t, + indirect_arg_t, + indirect_arg_t, + ::cuda::std::size_t, + void, + cub::ForceInclusive::No, + scan::dynamic_scan_policy_t<&scan::get_policy>, + scan::scan_kernel_source, + cub::detail::CudaDriverLauncherFactory>:: + Dispatch( + d_temp_storage, + *temp_storage_bytes, + d_in, + d_out, + op, + init, + num_items, + stream, + {build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, + {scan::get_accumulator_type(op, d_in, init)}); + if (cuda_error != cudaSuccess) + { + const char* errorString = cudaGetErrorString(cuda_error); // Get the error string + std::cerr << "CUDA error: " << errorString << std::endl; + } + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan(): %s\n", exc.what()); + fflush(stdout); + error = CUDA_ERROR_UNKNOWN; + } + if (pushed) + { + CUcontext cu_context; + cuCtxPopCurrent(&cu_context); + } + return error; +} + +extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept +{ + try + { + if (bld_ptr == nullptr) + { + return CUDA_ERROR_INVALID_VALUE; + } + std::unique_ptr cubin(reinterpret_cast(bld_ptr->cubin)); + check(cuLibraryUnload(bld_ptr->library)); + } + catch (const std::exception& exc) + { + fflush(stderr); + printf("\nEXCEPTION in cccl_device_scan_cleanup(): %s\n", exc.what()); + fflush(stdout); + return CUDA_ERROR_UNKNOWN; + } + + return CUDA_SUCCESS; +} diff --git a/c/parallel/test/test_scan.cpp b/c/parallel/test/test_scan.cpp new file mode 100644 index 00000000000..9ce88835caf --- /dev/null +++ 
b/c/parallel/test/test_scan.cpp @@ -0,0 +1,237 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +#include "test_util.h" + +void scan(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int cc_major = deviceProp.major; + const int cc_minor = deviceProp.minor; + + const char* cub_path = TEST_CUB_PATH; + const char* thrust_path = TEST_THRUST_PATH; + const char* libcudacxx_path = TEST_LIBCUDACXX_PATH; + const char* ctk_path = TEST_CTK_PATH; + + cccl_device_scan_build_result_t build; + REQUIRE(CUDA_SUCCESS + == cccl_device_scan_build( + &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path)); + + const std::string sass = inspect_sass(build.cubin, build.cubin_size); + + REQUIRE(sass.find("LDL") == std::string::npos); + REQUIRE(sass.find("STL") == std::string::npos); + + size_t temp_storage_bytes = 0; + REQUIRE(CUDA_SUCCESS == cccl_device_scan(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + + pointer_t temp_storage(temp_storage_bytes); + + REQUIRE(CUDA_SUCCESS + == cccl_device_scan(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + REQUIRE(CUDA_SUCCESS == cccl_device_scan_cleanup(&build)); +} + +using integral_types = std::tuple; +TEMPLATE_LIST_TEST_CASE("Scan works with integral types", "[scan]", integral_types) +{ + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + const std::vector input = generate(num_items); + const std::vector output(num_items, 0); + pointer_t input_ptr(input); + pointer_t output_ptr(output); + value_t init{TestType{42}}; + + scan(input_ptr, output_ptr, num_items, op, init); + + std::vector expected(num_items, 0); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_ptr)); + } +} + +struct pair +{ + short a; + size_t b; + + bool operator==(const pair& other) const + { + return a == other.a && b == other.b; + } +}; + +TEST_CASE("Scan works with custom types", "[scan]") +{ + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + + operation_t op = make_operation( + "op", + "struct pair { short a; size_t b; };\n" + "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n" + " return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n" + "}"); + const std::vector a = generate(num_items); + const std::vector b = generate(num_items); + std::vector input(num_items); + std::vector output(num_items); + for (std::size_t i = 0; i < num_items; ++i) + { + input[i] = pair{a[i], b[i]}; + } + pointer_t input_ptr(input); + pointer_t output_ptr(output); + value_t init{pair{4, 2}}; + + scan(input_ptr, output_ptr, num_items, op, init); + + std::vector expected(num_items, {0, 0}); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value, 
[](const pair& lhs, const pair& rhs) { + return pair{short(lhs.a + rhs.a), lhs.b + rhs.b}; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_ptr)); + } +} + +struct counting_iterator_state_t +{ + int value; +}; + +TEST_CASE("Scan works with input iterators", "[scan]") +{ + const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t input_it = make_iterator( + "struct counting_iterator_state_t { int value; };\n", + {"advance", + "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n" + " state->value += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 0; + pointer_t output_it(num_items); + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + // vector storing a sequence of values 0, 1, 2, ..., num_items - 1 + std::vector input(num_items); + std::iota(input.begin(), input.end(), 0); + + std::vector expected(num_items); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + if (num_items > 0) + { + REQUIRE(expected == std::vector(output_it)); + } +} + +struct transform_output_iterator_state_t +{ + int* d_output; +}; + +TEST_CASE("Scan works with output iterators", "[scan]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"advance", + "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + const std::vector input = generate(num_items); + pointer_t input_it(input); + pointer_t inner_output_it(num_items); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + std::vector expected(num_items); + std::exclusive_scan(input.begin(), input.end(), expected.begin(), init.value); + + std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) { + return x * 2; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(inner_output_it)); + } +} + +template +struct constant_iterator_state_t +{ + T value; +}; + +TEST_CASE("Scan works with input and output iterators", "[scan]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { int value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"out_advance", + "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + 
"}"}, + {"out_dereference", + "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + pointer_t inner_output_it(num_items); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + scan(input_it, output_it, num_items, op, init); + + std::vector expected(num_items, 1); + std::exclusive_scan(expected.begin(), expected.end(), expected.begin(), init.value); + std::transform(expected.begin(), expected.end(), expected.begin(), [](int x) { + return x * 2; + }); + if (num_items > 0) + { + REQUIRE(expected == std::vector(inner_output_it)); + } +} diff --git a/c/parallel/test/test_util.h b/c/parallel/test/test_util.h index 3f9a25dc43e..cf7fb2851a3 100644 --- a/c/parallel/test/test_util.h +++ b/c/parallel/test/test_util.h @@ -27,6 +27,7 @@ #include #include #include +#include #include static std::string inspect_sass(const void* cubin, size_t cubin_size) diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index e605e4082f3..2c41f8f6036 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -540,6 +540,52 @@ struct tile_state_with_memory_order return tile_state.template LoadValid(tile_idx); } }; + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(int num_tiles) +{ + return CUB_PTX_WARP_THREADS + num_tiles; +} + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t +tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles) +{ + int num_tile_states = num_tiles_to_num_tile_states(num_tiles); + size_t allocation_sizes[]{ + // bytes needed for tile status descriptors + static_cast(num_tile_states * bytes_per_description), + // bytes needed for partials + static_cast(num_tile_states * bytes_per_payload), + // bytes needed for inclusives + static_cast(num_tile_states * bytes_per_payload)}; + // Set the necessary size of the blob + size_t temp_storage_bytes = 0; + void* allocations[3] = {}; + AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); + + return temp_storage_bytes; +}; + +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( + int bytes_per_description, + int bytes_per_payload, + int num_tiles, + void* d_temp_storage, + size_t temp_storage_bytes, + void* (&allocations)[3]) +{ + int num_tile_states = num_tiles_to_num_tile_states(num_tiles); + size_t allocation_sizes[]{ + // bytes needed for tile status descriptors + static_cast(num_tile_states * bytes_per_description), + // bytes needed for partials + static_cast(num_tile_states * bytes_per_payload), + // bytes needed for inclusives + static_cast(num_tile_states * bytes_per_payload)}; + + // Set the necessary size of the blob + return AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); +} + } // namespace detail /** @@ -583,6 +629,9 @@ struct ScanTileState // Device storage TxnWord* d_tile_descriptors; + static constexpr size_t description_bytes_per_tile = sizeof(TxnWord); + static constexpr size_t payload_bytes_per_tile = 0; + /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_descriptors(nullptr) @@ -618,10 +667,11 @@ struct ScanTileState * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr 
cudaError_t + AllocationSize(int num_tiles, size_t& temp_storage_bytes) { - // bytes needed for tile status descriptors - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); + temp_storage_bytes = + detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); return cudaSuccess; } @@ -782,6 +832,9 @@ struct ScanTileState T* d_tile_partial; T* d_tile_inclusive; + static constexpr size_t description_bytes_per_tile = sizeof(StatusWord); + static constexpr size_t payload_bytes_per_tile = sizeof(Uninitialized); + /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_status(nullptr) @@ -810,25 +863,12 @@ struct ScanTileState do { void* allocations[3] = {}; - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // Compute allocation pointers into the single storage blob - error = CubDebug(detail::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - + error = detail::tile_state_init( + description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations); if (cudaSuccess != error) { break; } - // Alias the offsets d_tile_status = reinterpret_cast(allocations[0]); d_tile_partial = reinterpret_cast(allocations[1]); @@ -847,25 +887,13 @@ struct ScanTileState * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t + AllocationSize(int num_tiles, size_t& temp_storage_bytes) { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - - // bytes needed for tile status descriptors - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); - - // bytes needed for partials - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // bytes needed for inclusives - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); - - // Set the necessary size of the blob - void* allocations[3] = {}; - return CubDebug(detail::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes)); + temp_storage_bytes = + detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles); + return cudaSuccess; } - /** * Initialize (from device) */ diff --git a/cub/cub/detail/launcher/cuda_driver.cuh b/cub/cub/detail/launcher/cuda_driver.cuh index 52c643f9707..66e2e8a87a9 100644 --- a/cub/cub/detail/launcher/cuda_driver.cuh +++ b/cub/cub/detail/launcher/cuda_driver.cuh @@ -103,6 +103,11 @@ struct CudaDriverLauncherFactory cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, kernel_fn, block_size, dynamic_smem_bytes)); } + _CCCL_HIDE_FROM_ABI cudaError_t MaxGridDimX(int& max_grid_dim_x) const + { + return static_cast(cuDeviceGetAttribute(&max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device)); + } + CUdevice device; int cc; }; diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 18b4c4f61c7..85834b10a35 100644 --- 
a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -83,7 +83,7 @@ struct DeviceScanKernelSource { using ScanTileStateT = typename cub::ScanTileState; - CUB_DEFINE_KERNEL_GETTER(ScanInitKernel, DeviceScanInitKernel) + CUB_DEFINE_KERNEL_GETTER(InitKernel, DeviceScanInitKernel) CUB_DEFINE_KERNEL_GETTER( ScanKernel, @@ -101,6 +101,11 @@ struct DeviceScanKernelSource { return sizeof(AccumT); } + + CUB_RUNTIME_FUNCTION ScanTileStateT TileState() + { + return ScanTileStateT(); + } }; } // namespace detail::scan @@ -165,9 +170,6 @@ struct DispatchScan static constexpr int INIT_KERNEL_THREADS = 128; - // The input value type - using InputT = cub::detail::value_t; - /// Device-accessible allocation of temporary storage. When nullptr, the /// required allocation size is written to \p temp_storage_bytes and no work /// is done. @@ -258,13 +260,9 @@ struct DispatchScan CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel, ActivePolicyT policy = {}) { - using ScanTileStateT = typename KernelSource::ScanTileStateT; - // `LOAD_LDG` makes in-place execution UB and doesn't lead to better // performance. - static_assert(policy.LoadModifier() != CacheLoadModifier::LOAD_LDG, - "The memory consistency model does not apply to texture " - "accesses"); + policy.CheckLoadModifier(); cudaError error = cudaSuccess; do @@ -281,9 +279,12 @@ struct DispatchScan int tile_size = policy.Scan().BlockThreads() * policy.Scan().ItemsPerThread(); int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + auto tile_state = kernel_source.TileState(); + // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + + error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors @@ -313,7 +314,6 @@ struct DispatchScan } // Construct the tile status interface - ScanTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { @@ -346,9 +346,10 @@ struct DispatchScan // Get SM occupancy for scan_kernel int scan_sm_occupancy; - error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - policy.Scan().BlockThreads())); + error = CubDebug(launcher_factory.MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + policy.Scan().BlockThreads())); if (cudaSuccess != error) { break; @@ -397,17 +398,15 @@ struct DispatchScan } } } while (0); - return error; } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) { - using ScanTileStateT = typename KernelSource::ScanTileStateT; - auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy); + auto wrapped_policy = detail::scan::MakeScanPolicyWrapper(active_policy); // Ensure kernels are instantiated. 
- return Invoke(kernel_source.ScanInitKernel(), kernel_source.ScanKernel(), wrapped_policy); + return Invoke(kernel_source.InitKernel(), kernel_source.ScanKernel(), wrapped_policy); } /** @@ -451,20 +450,20 @@ struct DispatchScan InitValueT init_value, OffsetT num_items, cudaStream_t stream, - KernelSource kernel_source = {}, - MaxPolicyT max_policy = {}) + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + MaxPolicyT max_policy = {}) { cudaError_t error; do { // Get PTX version int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); + error = CubDebug(launcher_factory.PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } - // Create dispatch functor DispatchScan dispatch( d_temp_storage, @@ -476,7 +475,8 @@ struct DispatchScan init_value, stream, ptx_version, - kernel_source); + kernel_source, + launcher_factory); // Dispatch to chained policy error = CubDebug(max_policy.Invoke(ptx_version, dispatch)); diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh index cc3034638bc..b5b89d68756 100644 --- a/cub/cub/device/dispatch/kernels/scan.cuh +++ b/cub/cub/device/dispatch/kernels/scan.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -159,7 +161,8 @@ template + bool ForceInclusive, + typename RealInitValueT = typename InitValueT::value_type> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( InputIteratorT d_in, @@ -170,8 +173,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) InitValueT init_value, OffsetT num_items) { - using RealInitValueT = typename InitValueT::value_type; - using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; + using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; // Thread block type for scanning input tiles using AgentScanT = detail::scan:: diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 81c462bf7fa..1d4969714e8 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -433,10 +433,17 @@ struct ScanPolicyWrapper diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index 0fb29526cad..d4859c2c174 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //-----------------------------------------------------------------------------
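Looping back to the single_pass_scan_operators.cuh changes above: exposing description_bytes_per_tile / payload_bytes_per_tile and the detail::tile_state_allocation_size / tile_state_init helpers means tile-state storage can now be sized and partitioned from plain byte counts, without naming cub::ScanTileState<AccumT> as a type at the call site — which is exactly what c/parallel's runtime scan_tile_state relies on. A rough sketch for CUDA host code follows; the wrapper function is illustrative, only the cub::detail helpers and the ScanTileState constants come from this patch.

```cpp
// Sketch: size the scan tile-state blob from runtime per-tile byte counts.
#include <cub/agent/single_pass_scan_operators.cuh>

#include <cstddef>

inline std::size_t scan_tile_state_bytes(int description_bytes_per_tile, int payload_bytes_per_tile, int num_tiles)
{
  // One warp of padding plus num_tiles tile states; one slot of status
  // descriptors plus two payload slots (partials and inclusives), aliased
  // into a single temp-storage blob.
  return cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
}

// For a statically known accumulator the byte counts come straight from the
// new ScanTileState constants, e.g.:
//   auto bytes = scan_tile_state_bytes(cub::ScanTileState<int>::description_bytes_per_tile,
//                                      cub::ScanTileState<int>::payload_bytes_per_tile,
//                                      num_tiles);
// and detail::tile_state_init(...) later carves a d_temp_storage allocation of
// that size into the three sub-allocations that ScanTileState::Init() aliases.
```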