diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index 4c74b73baf2..bf4984f7256 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -43,10 +43,11 @@ #include #include -#include +#include #include #include +#include CUB_NAMESPACE_BEGIN @@ -86,7 +87,7 @@ struct AgentBlockSort // Types and constants //--------------------------------------------------------------------- - static constexpr bool KEYS_ONLY = std::is_same::value; + static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v; using BlockMergeSortT = BlockMergeSort; @@ -469,7 +470,7 @@ struct AgentMerge struct TempStorage : Uninitialized<_TempStorage> {}; - static constexpr bool KEYS_ONLY = std::is_same::value; + static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh index 9b51a539303..c618e2e83a4 100644 --- a/cub/cub/device/device_for.cuh +++ b/cub/cub/device/device_for.cuh @@ -47,6 +47,7 @@ #include #include #include +#include #if __cccl_lib_mdspan # include diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index e8cc91e8420..056522e162d 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -38,6 +38,7 @@ #endif // no system header #include +#include #include #include #include @@ -53,277 +54,6 @@ CUB_NAMESPACE_BEGIN -namespace detail::merge_sort -{ - -/** - * @brief Helper class template that provides two agent template instantiations: one instantiated with the default - * policy and one with the fallback policy. This helps to avoid having to enlist all the agent's template parameters - * twice: once for the default agent and once for the fallback agent - */ -template class AgentT, typename... AgentParamsT> -struct dual_policy_agent_helper_t -{ - using default_agent_t = AgentT; - using fallback_agent_t = AgentT; - - static constexpr auto default_size = sizeof(typename default_agent_t::TempStorage); - static constexpr auto fallback_size = sizeof(typename fallback_agent_t::TempStorage); -}; - -/** - * @brief Helper class template for merge sort-specific virtual shared memory handling. The merge sort algorithm in its - * current implementation relies on the fact that both the sorting as well as the merging kernels use the same tile - * size. This circumstance needs to be respected when determining whether the fallback policy for large user types is - * applicable: we must either use the fallback for both or for none of the two agents. - */ -template -class merge_sort_vsmem_helper_t -{ -private: - // Default fallback policy with a smaller tile size - using fallback_policy_t = cub::detail::policy_wrapper_t; - - // Helper for the `AgentBlockSort` template with one member type alias for the agent template instantiated with the - // default policy and one instantiated with the fallback policy - using block_sort_helper_t = dual_policy_agent_helper_t< - DefaultPolicyT, - fallback_policy_t, - merge_sort::AgentBlockSort, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>; - using default_block_sort_agent_t = typename block_sort_helper_t::default_agent_t; - using fallback_block_sort_agent_t = typename block_sort_helper_t::fallback_agent_t; - - // Helper for the `AgentMerge` template with one member type alias for the agent template instantiated with the - // default policy and one instantiated with the fallback policy - using merge_helper_t = dual_policy_agent_helper_t< - DefaultPolicyT, - fallback_policy_t, - merge_sort::AgentMerge, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>; - using default_merge_agent_t = typename merge_helper_t::default_agent_t; - using fallback_merge_agent_t = typename merge_helper_t::fallback_agent_t; - - // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory - // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the - // available shared memory - static constexpr auto max_default_size = - (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); - static constexpr auto max_fallback_size = - (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); - static constexpr bool uses_fallback_policy = - (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); - -public: - using policy_t = ::cuda::std::_If; - using block_sort_agent_t = - ::cuda::std::_If; - using merge_agent_t = ::cuda::std::_If; -}; -template -__launch_bounds__( - merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) - CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( - bool ping, - KeyInputIteratorT keys_in, - ValueInputIteratorT items_in, - KeyIteratorT keys_out, - ValueIteratorT items_out, - OffsetT keys_count, - KeyT* tmp_keys_out, - ValueT* tmp_items_out, - CompareOpT compare_op, - vsmem_t vsmem) -{ - using MergeSortHelperT = merge_sort_vsmem_helper_t< - typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>; - - using ActivePolicyT = typename MergeSortHelperT::policy_t; - - using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t; - - using VSmemHelperT = vsmem_helper_impl; - - // Static shared memory allocation - __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; - - // Get temporary storage - typename AgentBlockSortT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem); - - AgentBlockSortT agent( - ping, - temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), - keys_count, - keys_out, - items_out, - tmp_keys_out, - tmp_items_out, - compare_op); - - agent.Process(); - - // If applicable, hints to discard modified cache lines for vsmem - VSmemHelperT::discard_temp_storage(temp_storage); -} - -// TODO(bgruber): if we put a call to cudaTriggerProgrammaticLaunchCompletion inside this kernel, the tests fail with -// cudaErrorIllegalAddress. -template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( - bool ping, - KeyIteratorT keys_ping, - KeyT* keys_pong, - OffsetT keys_count, - OffsetT num_partitions, - OffsetT* merge_partitions, - CompareOpT compare_op, - OffsetT target_merged_tiles_number, - int items_per_tile) -{ - OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x; - - if (partition_idx < num_partitions) - { - AgentPartition agent( - ping, - keys_ping, - keys_pong, - keys_count, - partition_idx, - merge_partitions, - compare_op, - target_merged_tiles_number, - items_per_tile, - num_partitions); - - agent.Process(); - } -} - -template -__launch_bounds__( - merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) - CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( - bool ping, - KeyIteratorT keys_ping, - ValueIteratorT items_ping, - OffsetT keys_count, - KeyT* keys_pong, - ValueT* items_pong, - CompareOpT compare_op, - OffsetT* merge_partitions, - OffsetT target_merged_tiles_number, - vsmem_t vsmem) -{ - using MergeSortHelperT = merge_sort_vsmem_helper_t< - typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>; - - using ActivePolicyT = typename MergeSortHelperT::policy_t; - - using AgentMergeT = typename MergeSortHelperT::merge_agent_t; - - using VSmemHelperT = vsmem_helper_impl; - - // Static shared memory allocation - __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; - - // Get temporary storage - typename AgentMergeT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem); - - AgentMergeT agent( - ping, - temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), - keys_count, - keys_pong, - items_pong, - keys_ping, - items_ping, - compare_op, - merge_partitions, - target_merged_tiles_number); - - agent.Process(); - - // If applicable, hints to discard modified cache lines for vsmem - VSmemHelperT::discard_temp_storage(temp_storage); -} - -} // namespace detail::merge_sort - /******************************************************************************* * Policy ******************************************************************************/ diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index f35e89a133f..ce8b1aaff66 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -31,6 +31,7 @@ _CCCL_NV_DIAG_SUPPRESS(186) #include #include #include +#include #include #include diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh new file mode 100644 index 00000000000..1065313c20d --- /dev/null +++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh @@ -0,0 +1,304 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +THRUST_NAMESPACE_BEGIN + +namespace cuda_cub::core +{ +// We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code +template +typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it); +} // namespace cuda_cub::core + +THRUST_NAMESPACE_END + +CUB_NAMESPACE_BEGIN + +namespace detail::merge_sort +{ + +/** + * @brief Helper class template that provides two agent template instantiations: one instantiated with the default + * policy and one with the fallback policy. This helps to avoid having to enlist all the agent's template parameters + * twice: once for the default agent and once for the fallback agent + */ +template class AgentT, typename... AgentParamsT> +struct dual_policy_agent_helper_t +{ + using default_agent_t = AgentT; + using fallback_agent_t = AgentT; + + static constexpr auto default_size = sizeof(typename default_agent_t::TempStorage); + static constexpr auto fallback_size = sizeof(typename fallback_agent_t::TempStorage); +}; + +/** + * @brief Helper class template for merge sort-specific virtual shared memory handling. The merge sort algorithm in its + * current implementation relies on the fact that both the sorting as well as the merging kernels use the same tile + * size. This circumstance needs to be respected when determining whether the fallback policy for large user types is + * applicable: we must either use the fallback for both or for none of the two agents. + */ +template +class merge_sort_vsmem_helper_t +{ +private: + // Default fallback policy with a smaller tile size + using fallback_policy_t = cub::detail::policy_wrapper_t; + + // Helper for the `AgentBlockSort` template with one member type alias for the agent template instantiated with the + // default policy and one instantiated with the fallback policy + using block_sort_helper_t = dual_policy_agent_helper_t< + DefaultPolicyT, + fallback_policy_t, + merge_sort::AgentBlockSort, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>; + using default_block_sort_agent_t = typename block_sort_helper_t::default_agent_t; + using fallback_block_sort_agent_t = typename block_sort_helper_t::fallback_agent_t; + + // Helper for the `AgentMerge` template with one member type alias for the agent template instantiated with the + // default policy and one instantiated with the fallback policy + using merge_helper_t = dual_policy_agent_helper_t< + DefaultPolicyT, + fallback_policy_t, + merge_sort::AgentMerge, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>; + using default_merge_agent_t = typename merge_helper_t::default_agent_t; + using fallback_merge_agent_t = typename merge_helper_t::fallback_agent_t; + + // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory + // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the + // available shared memory + static constexpr auto max_default_size = + (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); + static constexpr auto max_fallback_size = + (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); + static constexpr bool uses_fallback_policy = + (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); + +public: + using policy_t = ::cuda::std::_If; + using block_sort_agent_t = + ::cuda::std::_If; + using merge_agent_t = ::cuda::std::_If; +}; +template +__launch_bounds__( + merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( + bool ping, + KeyInputIteratorT keys_in, + ValueInputIteratorT items_in, + KeyIteratorT keys_out, + ValueIteratorT items_out, + OffsetT keys_count, + KeyT* tmp_keys_out, + ValueT* tmp_items_out, + CompareOpT compare_op, + vsmem_t vsmem) +{ + using MergeSortHelperT = merge_sort_vsmem_helper_t< + typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>; + + using ActivePolicyT = typename MergeSortHelperT::policy_t; + + using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t; + + using VSmemHelperT = vsmem_helper_impl; + + // Static shared memory allocation + __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; + + // Get temporary storage + typename AgentBlockSortT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem); + + AgentBlockSortT agent( + ping, + temp_storage, + THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), + THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), + keys_count, + keys_out, + items_out, + tmp_keys_out, + tmp_items_out, + compare_op); + + agent.Process(); + + // If applicable, hints to discard modified cache lines for vsmem + VSmemHelperT::discard_temp_storage(temp_storage); +} + +// TODO(bgruber): if we put a call to cudaTriggerProgrammaticLaunchCompletion inside this kernel, the tests fail with +// cudaErrorIllegalAddress. +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( + bool ping, + KeyIteratorT keys_ping, + KeyT* keys_pong, + OffsetT keys_count, + OffsetT num_partitions, + OffsetT* merge_partitions, + CompareOpT compare_op, + OffsetT target_merged_tiles_number, + int items_per_tile) +{ + OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x; + + if (partition_idx < num_partitions) + { + AgentPartition agent( + ping, + keys_ping, + keys_pong, + keys_count, + partition_idx, + merge_partitions, + compare_op, + target_merged_tiles_number, + items_per_tile, + num_partitions); + + agent.Process(); + } +} + +template +__launch_bounds__( + merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( + bool ping, + KeyIteratorT keys_ping, + ValueIteratorT items_ping, + OffsetT keys_count, + KeyT* keys_pong, + ValueT* items_pong, + CompareOpT compare_op, + OffsetT* merge_partitions, + OffsetT target_merged_tiles_number, + vsmem_t vsmem) +{ + using MergeSortHelperT = merge_sort_vsmem_helper_t< + typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>; + + using ActivePolicyT = typename MergeSortHelperT::policy_t; + + using AgentMergeT = typename MergeSortHelperT::merge_agent_t; + + using VSmemHelperT = vsmem_helper_impl; + + // Static shared memory allocation + __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; + + // Get temporary storage + typename AgentMergeT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem); + + AgentMergeT agent( + ping, + temp_storage, + THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), + keys_count, + keys_pong, + items_pong, + keys_ping, + items_ping, + compare_op, + merge_partitions, + target_merged_tiles_number); + + agent.Process(); + + // If applicable, hints to discard modified cache lines for vsmem + VSmemHelperT::discard_temp_storage(temp_storage); +} + +} // namespace detail::merge_sort + +CUB_NAMESPACE_END diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index add033fd030..498f17c1259 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -48,6 +48,7 @@ #endif // no system header #include +#include #include // for backward compatibility #include @@ -68,19 +69,6 @@ CUB_NAMESPACE_BEGIN namespace detail { -/** - * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` - * configurations of a given policy. - */ -// TODO(bgruber): this should be called something like "override_policy" -template -struct policy_wrapper_t : PolicyT -{ - static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_; - static constexpr int BLOCK_THREADS = BLOCK_THREADS_; - static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; -}; - /** * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device */ diff --git a/cub/cub/util_policy_wrapper_t.cuh b/cub/cub/util_policy_wrapper_t.cuh new file mode 100644 index 00000000000..0c74f1f4d22 --- /dev/null +++ b/cub/cub/util_policy_wrapper_t.cuh @@ -0,0 +1,55 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +CUB_NAMESPACE_BEGIN + +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document + +namespace detail +{ +/** + * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` + * configurations of a given policy. + */ +// TODO(bgruber): this should be called something like "override_policy" +template +struct policy_wrapper_t : PolicyT +{ + static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_; + static constexpr int BLOCK_THREADS = BLOCK_THREADS_; + static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; +}; +} // namespace detail + +#endif // _CCCL_DOXYGEN_INVOKED + +CUB_NAMESPACE_END diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index baba489c0ae..20228b44f69 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -43,15 +43,13 @@ #endif // no system header #include -#include +#include #include #include #include #include -#include - CUB_NAMESPACE_BEGIN #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -79,16 +77,16 @@ class vsmem_helper_impl { private: // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size` - static constexpr std::size_t line_size = 128; + static constexpr ::cuda::std::size_t line_size = 128; // The amount of shared memory or virtual shared memory required by the algorithm's agent - static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage); + static constexpr ::cuda::std::size_t required_smem = sizeof(typename AgentT::TempStorage); // Whether we need to allocate global memory-backed virtual shared memory static constexpr bool needs_vsmem = required_smem > max_smem_per_block; // Padding bytes to an integer multiple of `line_size`. Only applies to virtual shared memory - static constexpr std::size_t padding_bytes = + static constexpr ::cuda::std::size_t padding_bytes = (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size)); public: @@ -96,7 +94,7 @@ public: using static_temp_storage_t = ::cuda::std::_If; // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes - static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; + static constexpr ::cuda::std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; /** * @brief Used from within the device algorithm's kernel to get the temporary storage that can be @@ -115,7 +113,7 @@ public: * storage and taking a linear block id. */ static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& - get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t) + get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, ::cuda::std::size_t) { return static_temp_storage; } @@ -138,7 +136,7 @@ public: * virtual shared memory as temporary storage and taking a linear block id. */ static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& - get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id) + get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, ::cuda::std::size_t linear_block_id) { return *reinterpret_cast( static_cast(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id)); @@ -170,8 +168,8 @@ public: // Ensure all threads finished using temporary storage __syncthreads(); - const std::size_t linear_tid = threadIdx.x; - const std::size_t block_stride = line_size * blockDim.x; + const ::cuda::std::size_t linear_tid = threadIdx.x; + const ::cuda::std::size_t block_stride = line_size * blockDim.x; char* ptr = reinterpret_cast(&temp_storage); auto ptr_end = ptr + vsmem_per_block; @@ -186,7 +184,7 @@ public: }; template -constexpr bool use_fallback_agent() +_CCCL_HOST_DEVICE constexpr bool use_fallback_agent() { return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block) && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block); diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 91ddc155311..e16b6195c9f 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -58,6 +58,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") #include #include #include + #include extern "C" __global__ void kernel(int *ptr, int *errors) { diff --git a/thrust/testing/is_contiguous_iterator.cu b/thrust/testing/is_contiguous_iterator.cu index 4d6f0d0ea62..d0fc6a8cfc7 100644 --- a/thrust/testing/is_contiguous_iterator.cu +++ b/thrust/testing/is_contiguous_iterator.cu @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/thrust/thrust/detail/type_traits/has_nested_type.h b/thrust/thrust/detail/type_traits/has_nested_type.h index b9af8747377..3146c794ef6 100644 --- a/thrust/thrust/detail/type_traits/has_nested_type.h +++ b/thrust/thrust/detail/type_traits/has_nested_type.h @@ -35,9 +35,9 @@ using yes_type = char; \ using no_type = int; \ template \ - static yes_type test(typename S::nested_type_name*); \ + _CCCL_HOST_DEVICE static yes_type test(typename S::nested_type_name*); \ template \ - static no_type test(...); \ + _CCCL_HOST_DEVICE static no_type test(...); \ static bool const value = sizeof(test(0)) == sizeof(yes_type); \ using type = thrust::detail::integral_constant; \ }; diff --git a/thrust/thrust/detail/type_traits/is_thrust_pointer.h b/thrust/thrust/detail/type_traits/is_thrust_pointer.h new file mode 100644 index 00000000000..77dcb571259 --- /dev/null +++ b/thrust/thrust/detail/type_traits/is_thrust_pointer.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +THRUST_NAMESPACE_BEGIN +namespace detail +{ + +namespace pointer_traits_detail +{ + +template +struct pointer_raw_pointer_impl +{}; + +template +struct pointer_raw_pointer_impl +{ + using type = T*; +}; + +template +struct pointer_raw_pointer_impl> +{ + using type = typename Ptr::raw_pointer; +}; + +} // namespace pointer_traits_detail + +template +struct pointer_raw_pointer : pointer_traits_detail::pointer_raw_pointer_impl +{}; + +// this could be a lot better, but for our purposes, it's probably +// sufficient just to check if pointer_raw_pointer has meaning +template +struct is_thrust_pointer : is_metafunction_defined> +{}; + +} // namespace detail + +THRUST_NAMESPACE_END diff --git a/thrust/thrust/detail/type_traits/pointer_traits.h b/thrust/thrust/detail/type_traits/pointer_traits.h index 7570aba2242..baf4c0ca1dd 100644 --- a/thrust/thrust/detail/type_traits/pointer_traits.h +++ b/thrust/thrust/detail/type_traits/pointer_traits.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -140,31 +141,6 @@ struct rebind_pointer::t namespace pointer_traits_detail { -template -struct pointer_raw_pointer_impl -{}; - -template -struct pointer_raw_pointer_impl -{ - using type = T*; -}; - -template -struct pointer_raw_pointer_impl> -{ - using type = typename Ptr::raw_pointer; -}; - -} // namespace pointer_traits_detail - -template -struct pointer_raw_pointer : pointer_traits_detail::pointer_raw_pointer_impl -{}; - -namespace pointer_traits_detail -{ - template struct capture_address { @@ -327,12 +303,6 @@ struct is_void_pointer_system_convertible is_pointer_system_convertible> {}; -// this could be a lot better, but for our purposes, it's probably -// sufficient just to check if pointer_raw_pointer has meaning -template -struct is_thrust_pointer : is_metafunction_defined> -{}; - // avoid inspecting traits of the arguments if they aren't known to be pointers template struct lazy_is_pointer_convertible diff --git a/thrust/thrust/system/cuda/detail/adjacent_difference.h b/thrust/thrust/system/cuda/detail/adjacent_difference.h index 268c67eb03c..3372b9eaf25 100644 --- a/thrust/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/thrust/system/cuda/detail/adjacent_difference.h @@ -51,6 +51,7 @@ # include # include # include +# include # include diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h new file mode 100644 index 00000000000..07c5eba0eaa --- /dev/null +++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h @@ -0,0 +1,57 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +#include + +#include + +#include + +THRUST_NAMESPACE_BEGIN + +namespace cuda_cub::core +{ + +// LoadIterator +// ------------ +// if trivial iterator is passed, wrap loads into LDG +// +template +struct LoadIterator +{ + using value_type = typename ::cuda::std::iterator_traits::value_type; + using size_type = typename ::cuda::std::iterator_traits::difference_type; + + using type = + ::cuda::std::conditional_t, + cub::CacheModifiedInputIterator, + It>; +}; // struct Iterator +} // namespace cuda_cub::core + +THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h new file mode 100644 index 00000000000..28c65c813ea --- /dev/null +++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h @@ -0,0 +1,60 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +#include + +#include +#include + +THRUST_NAMESPACE_BEGIN + +namespace cuda_cub::core +{ +template +typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE +make_load_iterator_impl(It it, thrust::detail::true_type /* is_trivial */) +{ + return raw_pointer_cast(&*it); +} + +template +typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE +make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */) +{ + return it; +} + +template +typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it) +{ + return make_load_iterator_impl(it, typename is_contiguous_iterator::type()); +} + +} // namespace cuda_cub::core + +THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 186990f4a0b..56ee30c9947 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -43,6 +43,8 @@ #include #include +#include +#include #include #include #include @@ -503,42 +505,6 @@ THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t } } -// LoadIterator -// ------------ -// if trivial iterator is passed, wrap loads into LDG -// -template -struct LoadIterator -{ - using value_type = typename iterator_traits::value_type; - using size_type = typename iterator_traits::difference_type; - - using type = - ::cuda::std::conditional_t::value, - cub::CacheModifiedInputIterator, - It>; -}; // struct Iterator - -template -typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE -make_load_iterator_impl(It it, thrust::detail::true_type /* is_trivial */) -{ - return raw_pointer_cast(&*it); -} - -template -typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE -make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */) -{ - return it; -} - -template -typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it) -{ - return make_load_iterator_impl(it, typename is_contiguous_iterator::type()); -} - template struct get_arch; diff --git a/thrust/thrust/system/cuda/detail/scan_by_key.h b/thrust/thrust/system/cuda/detail/scan_by_key.h index 1240783736c..313f69d7758 100644 --- a/thrust/thrust/system/cuda/detail/scan_by_key.h +++ b/thrust/thrust/system/cuda/detail/scan_by_key.h @@ -53,6 +53,7 @@ # include # include # include +# include # include diff --git a/thrust/thrust/type_traits/is_contiguous_iterator.h b/thrust/thrust/type_traits/is_contiguous_iterator.h index 95f6c503b15..e254e572a3f 100644 --- a/thrust/thrust/type_traits/is_contiguous_iterator.h +++ b/thrust/thrust/type_traits/is_contiguous_iterator.h @@ -31,13 +31,9 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header -#include -#include -#include +#include -#include -#include -#include +#include // Needed for __gnu_cxx::__normal_iterator #if _CCCL_COMPILER(MSVC, <, 19, 16) // MSVC 2017 version 15.9 # include @@ -212,71 +208,8 @@ struct is_contiguous_iterator_impl || is_msvc_contiguous_iterator::value || proclaim_contiguous_iterator::value> {}; -// Type traits for contiguous iterators: -template -struct contiguous_iterator_traits -{ - static_assert(thrust::is_contiguous_iterator::value, - "contiguous_iterator_traits requires a contiguous iterator."); - - using raw_pointer = typename thrust::detail::pointer_traits())>::raw_pointer; -}; } // namespace detail -//! Converts a contiguous iterator type to its underlying raw pointer type. -template -using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits::raw_pointer; - -//! Converts a contiguous iterator to its underlying raw pointer. -template -_CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it) - -> unwrap_contiguous_iterator_t -{ - static_assert(thrust::is_contiguous_iterator::value, - "unwrap_contiguous_iterator called with non-contiguous iterator."); - return thrust::raw_pointer_cast(&*it); -} - -namespace detail -{ -// Implementation for non-contiguous iterators -- passthrough. -template ::value> -struct try_unwrap_contiguous_iterator_impl -{ - using type = Iterator; - - static _CCCL_HOST_DEVICE type get(Iterator it) - { - return it; - } -}; - -// Implementation for contiguous iterators -- unwraps to raw pointer. -template -struct try_unwrap_contiguous_iterator_impl -{ - using type = unwrap_contiguous_iterator_t; - - static _CCCL_HOST_DEVICE type get(Iterator it) - { - return unwrap_contiguous_iterator(it); - } -}; -} // namespace detail - -//! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the -//! iterator type unmodified. -template -using try_unwrap_contiguous_iterator_t = typename detail::try_unwrap_contiguous_iterator_impl::type; - -//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the -//! iterator unmodified. -template -_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t -{ - return detail::try_unwrap_contiguous_iterator_impl::get(it); -} - /*! \endcond */ diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h index 8566a510578..a53b297c438 100644 --- a/thrust/thrust/type_traits/is_trivially_relocatable.h +++ b/thrust/thrust/type_traits/is_trivially_relocatable.h @@ -34,6 +34,7 @@ #endif // no system header #include #include +#include #include #include diff --git a/thrust/thrust/type_traits/unwrap_contiguous_iterator.h b/thrust/thrust/type_traits/unwrap_contiguous_iterator.h new file mode 100644 index 00000000000..880dc970759 --- /dev/null +++ b/thrust/thrust/type_traits/unwrap_contiguous_iterator.h @@ -0,0 +1,93 @@ +/* + * Copyright 2008-2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +THRUST_NAMESPACE_BEGIN + +namespace detail +{ +// Type traits for contiguous iterators: +template +struct contiguous_iterator_traits +{ + static_assert(thrust::is_contiguous_iterator_v, + "contiguous_iterator_traits requires a contiguous iterator."); + + using raw_pointer = typename thrust::detail::pointer_traits())>::raw_pointer; +}; +} // namespace detail + +//! Converts a contiguous iterator type to its underlying raw pointer type. +template +using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits::raw_pointer; + +//! Converts a contiguous iterator to its underlying raw pointer. +template +_CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it) + -> unwrap_contiguous_iterator_t +{ + static_assert(thrust::is_contiguous_iterator_v, + "unwrap_contiguous_iterator called with non-contiguous iterator."); + return thrust::raw_pointer_cast(&*it); +} + +namespace detail +{ +// Implementation for non-contiguous iterators -- passthrough. +template > +struct try_unwrap_contiguous_iterator_impl +{ + using type = Iterator; + + static _CCCL_HOST_DEVICE type get(Iterator it) + { + return it; + } +}; + +// Implementation for contiguous iterators -- unwraps to raw pointer. +template +struct try_unwrap_contiguous_iterator_impl +{ + using type = unwrap_contiguous_iterator_t; + + static _CCCL_HOST_DEVICE type get(Iterator it) + { + return unwrap_contiguous_iterator(it); + } +}; +} // namespace detail + +//! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the +//! iterator type unmodified. +template +using try_unwrap_contiguous_iterator_t = typename detail::try_unwrap_contiguous_iterator_impl::type; + +//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the +//! iterator unmodified. +template +_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t +{ + return detail::try_unwrap_contiguous_iterator_impl::get(it); +} + +THRUST_NAMESPACE_END