diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 621bf2b9070..020456fd81d 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -436,7 +436,7 @@ private: typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, - typename SelectedPolicy> + typename PolicyHub> friend class DispatchSegmentedSort; // Internal version without NVTX range diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index ae0c74d45fc..d4ca673ff93 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -108,8 +108,8 @@ template > -struct DispatchAdjacentDifference : public SelectedPolicy + typename PolicyHub = detail::adjacent_difference::policy_hub> +struct DispatchAdjacentDifference { using InputT = typename std::iterator_traits::value_type; @@ -167,8 +167,6 @@ struct DispatchAdjacentDifference : public SelectedPolicy { using AdjacentDifferencePolicyT = typename ActivePolicyT::AdjacentDifferencePolicy; - using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; - cudaError error = cudaSuccess; do @@ -256,7 +254,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) .doit(DeviceAdjacentDifferenceDifferenceKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, DifferenceOpT, @@ -297,8 +295,6 @@ struct DispatchAdjacentDifference : public SelectedPolicy DifferenceOpT difference_op, cudaStream_t stream) { - using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; - cudaError error = cudaSuccess; do { @@ -315,7 +311,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index f8415386158..43eccd08a26 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -285,9 +285,9 @@ template , - bool IsMemcpy = true> -struct DispatchBatchMemcpy : SelectedPolicy + typename PolicyHub = batch_memcpy::policy_hub, + bool IsMemcpy = true> +struct DispatchBatchMemcpy { //------------------------------------------------------------------------------ // TYPE ALIASES @@ -345,8 +345,6 @@ struct DispatchBatchMemcpy : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy; - // Single-pass prefix scan tile states for the prefix-sum over the number of block-level buffers using BLevBufferOffsetTileState = cub::ScanTileState; @@ -466,7 +464,7 @@ struct DispatchBatchMemcpy : SelectedPolicy auto init_scan_states_kernel = InitTileStateKernel; auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, InputBufferIt, OutputBufferIt, BufferSizeIteratorT, @@ -481,7 +479,7 @@ struct DispatchBatchMemcpy : SelectedPolicy IsMemcpy>; auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, BufferOffsetT, BlevBufferSrcsOutItT, BlevBufferDstsOutItT, @@ -651,8 +649,6 @@ struct DispatchBatchMemcpy : SelectedPolicy BufferOffsetT num_buffers, cudaStream_t stream) { - using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy; - cudaError_t error = cudaSuccess; // Get PTX version @@ -668,7 +664,7 @@ struct DispatchBatchMemcpy : SelectedPolicy d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { return error; diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index de0189490fb..7ba478e3c00 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -59,7 +59,7 @@ namespace for_each // The dispatch layer is in the detail namespace until we figure out tuning API template -struct dispatch_t : PolicyHubT +struct dispatch_t { OffsetT num_items; OpT op; @@ -75,7 +75,7 @@ struct dispatch_t : PolicyHubT CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(::cuda::std::false_type /* block size is not known at compile time */) { - using max_policy_t = typename dispatch_t::MaxPolicy; + using max_policy_t = typename PolicyHubT::MaxPolicy; if (num_items == 0) { @@ -132,8 +132,6 @@ struct dispatch_t : PolicyHubT CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(::cuda::std::true_type /* block size is known at compile time */) { - using max_policy_t = typename dispatch_t::MaxPolicy; - if (num_items == 0) { return cudaSuccess; @@ -157,7 +155,7 @@ struct dispatch_t : PolicyHubT error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(block_threads), 0, stream) - .doit(detail::for_each::static_kernel, num_items, op); + .doit(detail::for_each::static_kernel, num_items, op); error = CubDebug(error); if (cudaSuccess != error) { @@ -182,8 +180,6 @@ struct dispatch_t : PolicyHubT CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream) { - using max_policy_t = typename dispatch_t::MaxPolicy; - int ptx_version = 0; cudaError_t error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) @@ -193,7 +189,7 @@ struct dispatch_t : PolicyHubT dispatch_t dispatch(num_items, op, stream); - error = CubDebug(max_policy_t::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHubT::MaxPolicy::Invoke(ptx_version, dispatch)); return error; } diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh index db43ceb1006..6e346316d48 100644 --- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh +++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh @@ -80,11 +80,11 @@ namespace for_each_in_extents // The dispatch layer is in the detail namespace until we figure out the tuning API template -class dispatch_t : PolicyHubT +class dispatch_t { using index_type = typename ExtentsType::index_type; using unsigned_index_type = ::cuda::std::make_unsigned_t; - using max_policy_t = typename dispatch_t::MaxPolicy; + using max_policy_t = typename PolicyHubT::MaxPolicy; // workaround for nvcc 11.1 bug related to deduction guides, vvv using array_type = ::cuda::std::array, ExtentsType::rank()>; @@ -190,8 +190,7 @@ public: _CCCL_NODISCARD CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(const ExtentsType& ext, const OpType& op, cudaStream_t stream) { - using max_policy_t = typename dispatch_t::MaxPolicy; - int ptx_version = 0; + int ptx_version = 0; _CUB_RETURN_IF_ERROR(CubDebug(PtxVersion(ptx_version))) dispatch_t dispatch(ext, op, stream); return CubDebug(max_policy_t::Invoke(ptx_version, dispatch)); diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 5536c545ac4..42a256b8e20 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -545,7 +545,7 @@ struct dispatch_histogram * @tparam OffsetT * Signed integer type for global offsets * - * @tparam SelectedPolicy + * @tparam PolicyHub * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ @@ -555,9 +555,9 @@ template , CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>> -struct DispatchHistogram : SelectedPolicy +struct DispatchHistogram { static_assert(NUM_CHANNELS <= 4, "Histograms only support up to 4 channels"); static_assert(NUM_ACTIVE_CHANNELS <= NUM_CHANNELS, @@ -921,7 +921,7 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; + using MaxPolicyT = typename PolicyHub::MaxPolicy; cudaError error = cudaSuccess; do @@ -1125,7 +1125,7 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; + using MaxPolicyT = typename PolicyHub::MaxPolicy; cudaError error = cudaSuccess; do @@ -1293,7 +1293,7 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; + using MaxPolicyT = typename PolicyHub::MaxPolicy; cudaError error = cudaSuccess; do @@ -1514,7 +1514,7 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; + using MaxPolicyT = typename PolicyHub::MaxPolicy; cudaError error = cudaSuccess; do diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 04f5c77e997..6d8c74c368a 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -334,8 +334,8 @@ template > -struct DispatchMergeSort : SelectedPolicy + typename PolicyHub = detail::merge_sort::policy_hub> +struct DispatchMergeSort { using KeyT = cub::detail::value_t; using ValueT = cub::detail::value_t; @@ -447,8 +447,6 @@ struct DispatchMergeSort : SelectedPolicy using BlockSortVSmemHelperT = cub::detail::vsmem_helper_impl; using MergeAgentVSmemHelperT = cub::detail::vsmem_helper_impl; - using MaxPolicyT = typename DispatchMergeSort::MaxPolicy; - cudaError error = cudaSuccess; if (num_items == 0) @@ -517,7 +515,7 @@ struct DispatchMergeSort : SelectedPolicy static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream) .doit( DeviceMergeSortBlockSortKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, @@ -602,7 +600,7 @@ struct DispatchMergeSort : SelectedPolicy THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true) .doit( - DeviceMergeSortMergeKernel, - typename DecomposerT = detail::identity_decomposer_t> -struct DispatchRadixSort : SelectedPolicy + typename PolicyHub = detail::radix::policy_hub, + typename DecomposerT = detail::identity_decomposer_t> +struct DispatchRadixSort { //------------------------------------------------------------------------------ // Constants @@ -863,6 +863,8 @@ struct DispatchRadixSort : SelectedPolicy // Whether this is a keys-only (or key-value) sort static constexpr bool KEYS_ONLY = std::is_same::value; + using max_policy_t = typename PolicyHub::MaxPolicy; + //------------------------------------------------------------------------------ // Problem state //------------------------------------------------------------------------------ @@ -1241,7 +1243,6 @@ struct DispatchRadixSort : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeOnesweep() { - using MaxPolicyT = typename DispatchRadixSort::MaxPolicy; // PortionOffsetT is used for offsets within a portion, and must be signed. using PortionOffsetT = int; using AtomicOffsetT = PortionOffsetT; @@ -1322,7 +1323,7 @@ struct DispatchRadixSort : SelectedPolicy constexpr int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; - auto histogram_kernel = DeviceRadixSortHistogramKernel; + auto histogram_kernel = DeviceRadixSortHistogramKernel; error = CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0)); @@ -1371,7 +1372,7 @@ struct DispatchRadixSort : SelectedPolicy #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream) - .doit(DeviceRadixSortExclusiveSumKernel, d_bins); + .doit(DeviceRadixSortExclusiveSumKernel, d_bins); error = CubDebug(error); if (cudaSuccess != error) { @@ -1424,7 +1425,7 @@ struct DispatchRadixSort : SelectedPolicy #endif auto onesweep_kernel = DeviceRadixSortOnesweepKernel< - MaxPolicyT, + max_policy_t, IS_DESCENDING, KeyT, ValueT, @@ -1673,13 +1674,12 @@ struct DispatchRadixSort : SelectedPolicy CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeManyTiles(Int2Type) { // Invoke upsweep-downsweep - using MaxPolicyT = typename DispatchRadixSort::MaxPolicy; return InvokePasses( - DeviceRadixSortUpsweepKernel, - DeviceRadixSortUpsweepKernel, - RadixSortScanBinsKernel, - DeviceRadixSortDownsweepKernel, - DeviceRadixSortDownsweepKernel); + DeviceRadixSortUpsweepKernel, + DeviceRadixSortUpsweepKernel, + RadixSortScanBinsKernel, + DeviceRadixSortDownsweepKernel, + DeviceRadixSortDownsweepKernel); } template @@ -1747,7 +1747,6 @@ struct DispatchRadixSort : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchRadixSort::MaxPolicy; using SingleTilePolicyT = typename ActivePolicyT::SingleTilePolicy; // Return if empty problem, or if no bits to sort and double-buffering is used @@ -1780,7 +1779,7 @@ struct DispatchRadixSort : SelectedPolicy { // Small, single tile size return InvokeSingleTile( - DeviceRadixSortSingleTileKernel); + DeviceRadixSortSingleTileKernel); } else { @@ -1838,8 +1837,6 @@ struct DispatchRadixSort : SelectedPolicy cudaStream_t stream, DecomposerT decomposer = {}) { - using MaxPolicyT = typename DispatchRadixSort::MaxPolicy; - cudaError_t error; do { @@ -1867,7 +1864,7 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(max_policy_t::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; @@ -1929,9 +1926,9 @@ template , - typename DecomposerT = detail::identity_decomposer_t> -struct DispatchSegmentedRadixSort : SelectedPolicy + typename PolicyHub = detail::radix::policy_hub, + typename DecomposerT = detail::identity_decomposer_t> +struct DispatchSegmentedRadixSort { //------------------------------------------------------------------------------ // Constants @@ -1940,6 +1937,8 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Whether this is a keys-only (or key-value) sort static constexpr bool KEYS_ONLY = std::is_same::value; + using max_policy_t = typename PolicyHub::MaxPolicy; + //------------------------------------------------------------------------------ // Parameter members //------------------------------------------------------------------------------ @@ -2292,8 +2291,6 @@ struct DispatchSegmentedRadixSort : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchSegmentedRadixSort::MaxPolicy; - // Return if empty problem, or if no bits to sort and double-buffering is used if (num_items == 0 || num_segments == 0 || (begin_bit == end_bit && is_overwrite_okay)) { @@ -2307,7 +2304,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Force kernel code-generation in all compiler passes return InvokePasses( DeviceSegmentedRadixSortKernel< - MaxPolicyT, + max_policy_t, false, IS_DESCENDING, KeyT, @@ -2317,7 +2314,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy OffsetT, DecomposerT>, DeviceSegmentedRadixSortKernel< - MaxPolicyT, + max_policy_t, true, IS_DESCENDING, KeyT, @@ -2394,8 +2391,6 @@ struct DispatchSegmentedRadixSort : SelectedPolicy bool is_overwrite_okay, cudaStream_t stream) { - using MaxPolicyT = typename DispatchSegmentedRadixSort::MaxPolicy; - cudaError_t error; do { @@ -2425,7 +2420,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy ptx_version); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(max_policy_t::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index a3c41d1e530..a30d489612c 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -260,21 +260,21 @@ template >, - typename AccumT = ::cuda::std::__accumulator_t, InitT>, - typename SelectedPolicy = detail::reduce::policy_hub, - typename TransformOpT = ::cuda::std::__identity, - typename KernelSource = DeviceReduceKernelSource< - typename SelectedPolicy::MaxPolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT, - InitT, - AccumT, - TransformOpT>, + typename InitT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, + typename PolicyHub = detail::reduce::policy_hub, + typename TransformOpT = ::cuda::std::__identity, + typename KernelSource = DeviceReduceKernelSource< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitT, + AccumT, + TransformOpT>, typename KernelLauncherFactory = TripleChevronFactory> -struct DispatchReduce : SelectedPolicy +struct DispatchReduce { //--------------------------------------------------------------------------- // Problem state @@ -639,7 +639,7 @@ struct DispatchReduce : SelectedPolicy * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, @@ -744,16 +744,16 @@ template < typename InitT, typename AccumT = ::cuda::std:: __accumulator_t>, InitT>, - typename SelectedPolicyT = detail::reduce::policy_hub, - typename KernelSource = DeviceReduceKernelSource< - typename SelectedPolicyT::MaxPolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT, - InitT, - AccumT, - TransformOpT>, + typename PolicyHub = detail::reduce::policy_hub, + typename KernelSource = DeviceReduceKernelSource< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitT, + AccumT, + TransformOpT>, typename KernelLauncherFactory = TripleChevronFactory> using DispatchTransformReduce = DispatchReduce; @@ -805,10 +805,10 @@ template >, - typename AccumT = ::cuda::std::__accumulator_t, InitT>, - typename SelectedPolicy = detail::reduce::policy_hub> -struct DispatchSegmentedReduce : SelectedPolicy + typename InitT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, + typename PolicyHub = detail::reduce::policy_hub> +struct DispatchSegmentedReduce { //--------------------------------------------------------------------------- // Problem state @@ -996,12 +996,10 @@ struct DispatchSegmentedReduce : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchSegmentedReduce::MaxPolicy; - // Force kernel code-generation in all compiler passes return InvokePasses( DeviceSegmentedReduceKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, @@ -1071,8 +1069,6 @@ struct DispatchSegmentedReduce : SelectedPolicy InitT init, cudaStream_t stream) { - using MaxPolicyT = typename DispatchSegmentedReduce::MaxPolicy; - if (num_segments <= 0) { return cudaSuccess; @@ -1105,7 +1101,7 @@ struct DispatchSegmentedReduce : SelectedPolicy ptx_version); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 22191a9e16d..28d60973862 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -209,7 +209,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * @tparam OffsetT * Signed integer type for global offsets * - * @tparam SelectedPolicy + * @tparam PolicyHub * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ @@ -221,17 +221,13 @@ template , - cub::detail::value_t>, - typename SelectedPolicy = // - detail::reduce_by_key::policy_hub< // - ReductionOpT, // - AccumT, // - cub::detail::non_void_value_t< // - UniqueOutputIteratorT, // - cub::detail::value_t>>> + typename AccumT = ::cuda::std::__accumulator_t, + cub::detail::value_t>, + typename PolicyHub = detail::reduce_by_key::policy_hub< + ReductionOpT, + AccumT, + cub::detail::non_void_value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- @@ -443,11 +439,10 @@ struct DispatchReduceByKey template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke( DeviceCompactInitKernel, DeviceReduceByKeyKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, @@ -512,8 +507,6 @@ struct DispatchReduceByKey OffsetT num_items, cudaStream_t stream) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; - cudaError error = cudaSuccess; do @@ -540,7 +533,7 @@ struct DispatchReduceByKey stream); // Dispatch - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 2ed6d6cc18f..5c8f1e01d0f 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -178,7 +178,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * @tparam OffsetT * Signed integer type for global offsets * - * @tparam SelectedPolicy + * @tparam PolicyHub * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ @@ -188,7 +188,7 @@ template , cub::detail::value_t>> struct DeviceRleDispatch @@ -450,10 +450,9 @@ struct DeviceRleDispatch template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke( DeviceCompactInitKernel, - DeviceRleSweepKernel, - ::cuda::std::_If::value, - cub::detail::value_t, - typename InitValueT::value_type>>, - typename SelectedPolicy = detail::scan::policy_hub, - bool ForceInclusive = false> -struct DispatchScan : SelectedPolicy + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If::value, + cub::detail::value_t, + typename InitValueT::value_type>>, + typename PolicyHub = detail::scan::policy_hub, + bool ForceInclusive = false> +struct DispatchScan { //--------------------------------------------------------------------- // Constants and Types @@ -508,12 +508,11 @@ struct DispatchScan : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchScan::MaxPolicy; using ScanTileStateT = typename cub::ScanTileState; // Ensure kernels are instantiated. return Invoke( DeviceScanInitKernel, - DeviceScanKernel, ::cuda::std::_If::value, cub::detail::value_t, InitValueT>>, - typename SelectedPolicy = + typename PolicyHub = detail::scan_by_key::policy_hub, ScanOpT>> -struct DispatchScanByKey : SelectedPolicy +struct DispatchScanByKey { //--------------------------------------------------------------------- // Constants and Types @@ -519,12 +519,10 @@ struct DispatchScanByKey : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchScanByKey::MaxPolicy; - // Ensure kernels are instantiated. return Invoke( DeviceScanByKeyInitKernel, - DeviceScanByKeyKernel> -struct DispatchSegmentedSort : SelectedPolicy + typename PolicyHub = detail::segmented_sort::policy_hub> +struct DispatchSegmentedSort { static constexpr int KEYS_ONLY = std::is_same::value; @@ -853,7 +853,6 @@ struct DispatchSegmentedSort : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; @@ -1026,6 +1025,8 @@ struct DispatchSegmentedSort : SelectedPolicy : (is_num_passes_odd) ? values_allocation.get() : d_values.Alternate()); + using MaxPolicyT = typename PolicyHub::MaxPolicy; + if (partition_segments) { // Partition input segments into size groups and assign specialized @@ -1092,8 +1093,6 @@ struct DispatchSegmentedSort : SelectedPolicy bool is_overwrite_okay, cudaStream_t stream) { - using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; - cudaError error = cudaSuccess; do @@ -1120,7 +1119,7 @@ struct DispatchSegmentedSort : SelectedPolicy stream); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; @@ -1234,12 +1233,11 @@ private: #else // CUB_RDC_ENABLED # define CUB_TEMP_DEVICE_CODE \ - using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; \ error = \ THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ .doit( \ DeviceSegmentedSortContinuationKernel< \ - MaxPolicyT, \ + typename PolicyHub::MaxPolicy, \ LargeKernelT, \ SmallKernelT, \ KeyT, \ diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 67f33647d29..e9db3db1787 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -424,13 +424,13 @@ template , - cub::detail::value_t, - detail::select::per_partition_offset_t, - MayAlias, - KeepRejects>> -struct DispatchSelectIf : SelectedPolicy + bool MayAlias = false, + typename PolicyHub = detail::select::policy_hub, + cub::detail::value_t, + detail::select::per_partition_offset_t, + MayAlias, + KeepRejects>> +struct DispatchSelectIf { /****************************************************************************** * Types and constants @@ -755,12 +755,10 @@ struct DispatchSelectIf : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; - return Invoke( DeviceCompactInitKernel, DeviceSelectSweepKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -821,8 +819,6 @@ struct DispatchSelectIf : SelectedPolicy OffsetT num_items, cudaStream_t stream) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; - int ptx_version = 0; if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) { @@ -842,7 +838,7 @@ struct DispatchSelectIf : SelectedPolicy stream, ptx_version); - return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + return CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index a8468ac4484..9db1ee1c661 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -153,16 +153,15 @@ DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumS * Dispatch ******************************************************************************/ -template < - typename InputIteratorT, - typename FirstOutputIteratorT, - typename SecondOutputIteratorT, - typename UnselectedOutputIteratorT, - typename NumSelectedIteratorT, - typename SelectFirstPartOp, - typename SelectSecondPartOp, - typename OffsetT, - typename SelectedPolicy = detail::three_way_partition::policy_hub, OffsetT>> +template , OffsetT>> struct DispatchThreeWayPartitionIf { /***************************************************************************** @@ -382,7 +381,7 @@ struct DispatchThreeWayPartitionIf template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; + using MaxPolicyT = typename PolicyHub::MaxPolicy; return Invoke( DeviceThreeWayPartitionInitKernel, DeviceThreeWayPartitionKernel< @@ -414,7 +413,7 @@ struct DispatchThreeWayPartitionIf OffsetT num_items, cudaStream_t stream) { - using MaxPolicyT = typename SelectedPolicy::MaxPolicy; + using MaxPolicyT = typename PolicyHub::MaxPolicy; cudaError error = cudaSuccess; diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 3fb26e258b7..8a310b35ebb 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -213,9 +213,9 @@ template , detail::value_t>> -struct DispatchUniqueByKey : SelectedPolicy +struct DispatchUniqueByKey { /****************************************************************************** * Types and constants @@ -529,13 +529,11 @@ struct DispatchUniqueByKey : SelectedPolicy template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke() { - using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; - // Ensure kernels are instantiated. return Invoke( DeviceCompactInitKernel, DeviceUniqueByKeySweepKernel< - MaxPolicyT, + typename PolicyHub::MaxPolicy, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, @@ -595,8 +593,6 @@ struct DispatchUniqueByKey : SelectedPolicy OffsetT num_items, cudaStream_t stream) { - using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; - cudaError_t error; do { @@ -622,7 +618,7 @@ struct DispatchUniqueByKey : SelectedPolicy stream); // Dispatch to chained policy - error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index 5e43686beed..355332fa6a9 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -218,8 +218,8 @@ struct device_dummy_algorithm_policy_t template > -struct dispatch_dummy_algorithm_t : SelectedPolicy + typename PolicyHub = device_dummy_algorithm_policy_t> +struct dispatch_dummy_algorithm_t { using item_t = cub::detail::value_t; @@ -274,8 +274,6 @@ struct dispatch_dummy_algorithm_t : SelectedPolicy template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { - using max_policy_t = typename dispatch_dummy_algorithm_t::max_policy_t; - using vsmem_helper_t = cub::detail::vsmem_helper_fallback_policy_t< typename ActivePolicyT::DummyAlgorithmPolicy, typename ActivePolicyT::FallbackDummyAlgorithmPolicy, @@ -327,7 +325,7 @@ struct dispatch_dummy_algorithm_t : SelectedPolicy launch_config_info->config_vsmem_per_block = vsmem_helper_t::vsmem_per_block; THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_tiles, block_threads, 0, stream) - .doit(dummy_algorithm_kernel, + .doit(dummy_algorithm_kernel, d_in, d_out, num_items, @@ -349,8 +347,6 @@ struct dispatch_dummy_algorithm_t : SelectedPolicy launch_config_test_info_t* launch_config_info, cudaStream_t stream = 0) { - using max_policy_t = typename dispatch_dummy_algorithm_t::max_policy_t; - // Get PTX version int ptx_version = 0; cudaError error = cub::PtxVersion(ptx_version); @@ -372,7 +368,7 @@ struct dispatch_dummy_algorithm_t : SelectedPolicy ptx_version); // Dispatch to chained policy - error = max_policy_t::Invoke(ptx_version, dispatch); + error = PolicyHub::max_policy_t::Invoke(ptx_version, dispatch); if (cudaSuccess != error) { return error; diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index 1946bdfaf39..a0a78ed0d71 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -501,8 +501,8 @@ and passes it to the ``ChainedPolicy::Invoke`` function: .. code-block:: c++ template <..., // algorithm specific compile-time parameters - typename SelectedPolicy> // also called: PolicyHub - struct DispatchAlgorithm : SelectedPolicy { // TODO(bgruber): I see no need for inheritance, can we remove it? + typename PolicyHub> + struct DispatchAlgorithm { CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, ..., cudaStream stream) { if (/* no items to process */) { @@ -518,9 +518,8 @@ and passes it to the ``ChainedPolicy::Invoke`` function: { return error; } - using MaxPolicy = typename SelectedPolicy::MaxPolicy; DispatchAlgorithm dispatch(..., stream); - return CubDebug(MaxPolicy::Invoke(ptx_version, dispatch)); + return CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); } }; @@ -555,7 +554,7 @@ The dispatch object's ``Invoke`` function is then called with the best policy fo .. code-block:: c++ - template <..., typename SelectedPolicy = DefaultTuning> + template <..., typename PolicyHub = detail::algorithm::policy_hub> struct DispatchAlgorithm { template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE @@ -618,13 +617,12 @@ An agent policy could look like this: It's typically a collection of configuration values for the kernel launch configuration, work distribution setting, load and store algorithms to use, as well as load instruction cache modifiers. -Finally, the tuning looks like: +Finally, the tuning policy hub looks like: .. code-block:: c++ template - struct DeviceAlgorithmPolicy // also called tuning hub - { + struct policy_hub { // TuningRelevantParams... could be used for decision making, like element types used, iterator category, etc. // for SM35