diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu index 89e4bc485e9..778e8b44bc0 100644 --- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu +++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu @@ -52,20 +52,28 @@ struct policy_hub_t template void left(nvbench::state& state, nvbench::type_list) { - constexpr bool may_alias = false; - constexpr bool read_left = true; - using input_it_t = const T*; using output_it_t = T*; using difference_op_t = ::cuda::std::minus<>; using offset_t = cub::detail::choose_offset_t; #if !TUNE_BASE - using dispatch_t = cub:: - DispatchAdjacentDifference; + using dispatch_t = cub::DispatchAdjacentDifference< + input_it_t, + output_it_t, + difference_op_t, + offset_t, + cub::MayAlias::No, + cub::ReadOption::Left, + policy_hub_t>; #else - using dispatch_t = - cub::DispatchAdjacentDifference; + using dispatch_t = cub::DispatchAdjacentDifference< + input_it_t, + output_it_t, + difference_op_t, + offset_t, + cub::MayAlias::No, + cub::ReadOption::Left>; #endif // TUNE_BASE const auto elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu index 07162bf602a..9a2d6c09afb 100644 --- a/cub/benchmarks/bench/copy/memcpy.cu +++ b/cub/benchmarks/bench/copy/memcpy.cu @@ -25,7 +25,7 @@ * ******************************************************************************/ -#include +#include // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_BUFFERS_PER_THREAD bpt 1:18:1 @@ -184,8 +184,6 @@ void copy(nvbench::state& state, using buffer_offset_t = std::uint32_t; using block_offset_t = std::uint32_t; - constexpr bool is_memcpy = true; - #if !TUNE_BASE using policy_t = policy_hub_t; #else @@ -199,7 +197,7 @@ void copy(nvbench::state& state, buffer_offset_t, block_offset_t, policy_t, - is_memcpy>; + cub::CopyAlg::Memcpy>; thrust::device_vector input_buffer = generate(elements); thrust::device_vector output_buffer(elements); diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index b2ecd4f561c..7217ee32e6e 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -42,9 +42,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = true; -constexpr bool may_alias = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -119,8 +116,7 @@ void flagged(nvbench::state& state, nvbench::type_list diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index 7d7d686b030..f4a15839668 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -42,9 +42,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = true; -constexpr bool may_alias = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -145,8 +142,7 @@ void partition(nvbench::state& state, nvbench::type_list diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index bd04bcf3d43..27ce9f3b834 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -41,8 +41,8 @@ using value_t = cub::NullType; -constexpr bool is_descending = false; -constexpr bool 
is_overwrite_ok = false; +constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending; +constexpr bool is_overwrite_ok = false; #if !TUNE_BASE template @@ -105,10 +105,10 @@ constexpr std::size_t max_onesweep_temp_storage_size() using portion_offset = int; using onesweep_policy = typename policy_hub_t::policy_t::OnesweepPolicy; using agent_radix_sort_onesweep_t = - cub::AgentRadixSortOnesweep; + cub::AgentRadixSortOnesweep; using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; - using hist_agent = cub::AgentRadixSortHistogram; + using hist_agent = cub::AgentRadixSortHistogram; return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); @@ -144,9 +144,9 @@ void radix_sort_keys(std::integral_constant, nvbench::state& state, using key_t = T; #if !TUNE_BASE using policy_t = policy_hub_t; - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #else // TUNE_BASE - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #endif // TUNE_BASE constexpr int begin_bit = 0; @@ -207,7 +207,7 @@ void radix_sort_keys(std::integral_constant, nvbench::state& state, template void radix_sort_keys(std::integral_constant, nvbench::state&, nvbench::type_list) { - (void) is_descending; + (void) sort_order; (void) is_overwrite_ok; } diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 35d589f453e..160a515466e 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -39,8 +39,8 @@ // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -constexpr bool is_descending = false; -constexpr bool is_overwrite_ok = false; +constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending; +constexpr bool is_overwrite_ok = false; #if !TUNE_BASE template @@ -103,10 +103,10 @@ constexpr std::size_t max_onesweep_temp_storage_size() using portion_offset = int; using onesweep_policy = typename policy_hub_t::policy_t::OnesweepPolicy; using agent_radix_sort_onesweep_t = - cub::AgentRadixSortOnesweep; + cub::AgentRadixSortOnesweep; using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; - using hist_agent = cub::AgentRadixSortHistogram; + using hist_agent = cub::AgentRadixSortHistogram; return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); @@ -144,9 +144,9 @@ void radix_sort_values( using value_t = ValueT; #if !TUNE_BASE using policy_t = policy_hub_t; - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #else // TUNE_BASE - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #endif // TUNE_BASE constexpr int begin_bit = 0; @@ -212,7 +212,7 @@ void radix_sort_values( template void radix_sort_values(std::integral_constant, nvbench::state&, nvbench::type_list) { - (void) is_descending; + (void) sort_order; (void) is_overwrite_ok; } diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu index 8d793c67e44..7a5f05c00e8 100644 --- a/cub/benchmarks/bench/segmented_sort/keys.cu +++ b/cub/benchmarks/bench/segmented_sort/keys.cu @@ -153,8 +153,8 @@ void seg_sort(nvbench::state& state, const thrust::device_vector& offsets, bit_entropy entropy) { - constexpr bool is_descending = false; - constexpr bool is_overwrite_ok = false; + constexpr cub::SortOrder 
sort_order = cub::SortOrder::Ascending; + constexpr bool is_overwrite_ok = false; using offset_t = OffsetT; using begin_offset_it_t = const offset_t*; @@ -165,10 +165,10 @@ void seg_sort(nvbench::state& state, #if !TUNE_BASE using policy_t = device_seg_sort_policy_hub; using dispatch_t = // - cub::DispatchSegmentedSort; + cub::DispatchSegmentedSort; #else using dispatch_t = // - cub::DispatchSegmentedSort; + cub::DispatchSegmentedSort; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu index 3a180a65adc..2562cec6c76 100644 --- a/cub/benchmarks/bench/select/flagged.cu +++ b/cub/benchmarks/bench/select/flagged.cu @@ -40,8 +40,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -78,17 +76,18 @@ struct policy_hub_t }; #endif // !TUNE_BASE -template -void select(nvbench::state& state, nvbench::type_list) +template +void select(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = const bool*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = cub::NullType; - using equality_op_t = cub::NullType; - using offset_t = OffsetT; - constexpr bool may_alias = MayAlias::value; + using input_it_t = const T*; + using flag_it_t = const bool*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + constexpr cub::SelectImpl selection_option = + InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -100,20 +99,18 @@ void select(nvbench::state& state, nvbench::type_list) select_op_t, equality_op_t, offset_t, - keep_rejects, - may_alias, + selection_option, policy_t>; #else // TUNE_BASE - using dispatch_t = cub::DispatchSelectIf< - input_it_t, - flag_it_t, - output_it_t, - num_selected_it_t, - select_op_t, - equality_op_t, - offset_t, - keep_rejects, - may_alias>; + using dispatch_t = + cub::DispatchSelectIf; #endif // !TUNE_BASE // Retrieve axis parameters @@ -165,18 +162,18 @@ void select(nvbench::state& state, nvbench::type_list) using ::cuda::std::false_type; using ::cuda::std::true_type; -#ifdef TUNE_MayAlias -using may_alias = nvbench::type_list; // expands to "false_type" or "true_type" -#else // !defined(TUNE_MayAlias) -using may_alias = nvbench::type_list; -#endif // TUNE_MayAlias +#ifdef TUNE_InPlace +using is_in_place = nvbench::type_list; // expands to "false_type" or "true_type" +#else // !defined(TUNE_InPlace) +using is_in_place = nvbench::type_list; +#endif // TUNE_InPlace // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI). 
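For orientation, the mapping from the old (KeepRejects, MayAlias) boolean pair to the single cub::SelectImpl enumerator applied throughout this change can be sketched as below; the alias names reuse the ones defined in this benchmark, and the snippet is illustrative rather than part of the patch.

// (KeepRejects, MayAlias)                       cub::SelectImpl
// (false, false) -> selection, disjoint I/O     cub::SelectImpl::Select
// (false, true)  -> selection, in-place I/O     cub::SelectImpl::SelectPotentiallyInPlace
// (true,  false) -> partition (keep rejects)    cub::SelectImpl::Partition

// e.g. the in-place flavour of this benchmark instantiates:
using dispatch_in_place_t = cub::DispatchSelectIf<
  input_it_t, flag_it_t, output_it_t, num_selected_it_t,
  select_op_t, equality_op_t, offset_t,
  cub::SelectImpl::SelectPotentiallyInPlace>;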
using select_offset_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias)) +NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu index 03d6d57a1ad..c391c76eb09 100644 --- a/cub/benchmarks/bench/select/if.cu +++ b/cub/benchmarks/bench/select/if.cu @@ -42,8 +42,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -105,17 +103,18 @@ T value_from_entropy(double percentage) return static_cast(result); } -template -void select(nvbench::state& state, nvbench::type_list) +template +void select(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = cub::NullType*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = less_then_t; - using equality_op_t = cub::NullType; - using offset_t = OffsetT; - constexpr bool may_alias = MayAlias::value; + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = less_then_t; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + constexpr cub::SelectImpl selection_option = + InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -127,20 +126,18 @@ void select(nvbench::state& state, nvbench::type_list) select_op_t, equality_op_t, offset_t, - keep_rejects, - may_alias, + selection_option, policy_t>; #else // TUNE_BASE - using dispatch_t = cub::DispatchSelectIf< - input_it_t, - flag_it_t, - output_it_t, - num_selected_it_t, - select_op_t, - equality_op_t, - offset_t, - keep_rejects, - may_alias>; + using dispatch_t = + cub::DispatchSelectIf; #endif // TUNE_BASE // Retrieve axis parameters @@ -191,18 +188,18 @@ void select(nvbench::state& state, nvbench::type_list) using ::cuda::std::false_type; using ::cuda::std::true_type; -#ifdef TUNE_MayAlias -using may_alias = nvbench::type_list; // expands to "false_type" or "true_type" -#else // !defined(TUNE_MayAlias) -using may_alias = nvbench::type_list; -#endif // TUNE_MayAlias +#ifdef TUNE_InPlace +using is_in_place = nvbench::type_list; // expands to "false_type" or "true_type" +#else // !defined(TUNE_InPlace) +using is_in_place = nvbench::type_list; +#endif // TUNE_InPlace // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI). 
using select_offset_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias)) +NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu index 110fe4d8fd2..88f469ae953 100644 --- a/cub/benchmarks/bench/select/unique.cu +++ b/cub/benchmarks/bench/select/unique.cu @@ -16,8 +16,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -54,17 +52,18 @@ struct policy_hub_t }; #endif // !TUNE_BASE -template -static void unique(nvbench::state& state, nvbench::type_list) +template +static void unique(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = cub::NullType*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = cub::NullType; - using equality_op_t = ::cuda::std::equal_to<>; - using offset_t = OffsetT; - constexpr bool may_alias = MayAlias::value; + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = ::cuda::std::equal_to<>; + using offset_t = OffsetT; + constexpr cub::SelectImpl selection_option = + InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -76,20 +75,18 @@ static void unique(nvbench::state& state, nvbench::type_list; #else // TUNE_BASE - using dispatch_t = cub::DispatchSelectIf< - input_it_t, - flag_it_t, - output_it_t, - num_selected_it_t, - select_op_t, - equality_op_t, - offset_t, - keep_rejects, - may_alias>; + using dispatch_t = + cub::DispatchSelectIf; #endif // TUNE_BASE // Retrieve axis parameters @@ -143,18 +140,18 @@ static void unique(nvbench::state& state, nvbench::type_list; // expands to "false_type" or "true_type" -#else // !defined(TUNE_MayAlias) -using may_alias = nvbench::type_list; -#endif // TUNE_MayAlias +#ifdef TUNE_InPlace +using is_in_place = nvbench::type_list; // expands to "false_type" or "true_type" +#else // !defined(TUNE_InPlace) +using is_in_place = nvbench::type_list; +#endif // TUNE_InPlace // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI). 
using select_offset_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(unique, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias)) +NVBENCH_BENCH_TYPES(unique, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index 3f8ad71f590..e0da426ccfd 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -80,7 +80,7 @@ void bench_transform( { state.exec(exec_tag, [&](const nvbench::launch& launch) { cub::detail::transform::dispatch_t< - false, + cub::detail::transform::requires_stable_address::no, OffsetT, ::cuda::std::tuple, RandomAccessIteratorOut, diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index b1785651f12..d7036313495 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -177,8 +178,9 @@ struct partition_distinct_output_t * num_total_items() -> total number of items across all partitions (partition only) * update_num_selected(d_num_sel_out, num_selected) -> invoked by last CTA with number of selected * - * @tparam KeepRejects - * Whether or not we push rejected items to the back of the output + * @tparam SelectImpl SelectionOpt + * SelectImpl indicating whether to partition, just selection or selection where the memory for the input and + * output may alias each other. */ template + SelectImpl SelectionOpt> struct AgentSelectIf { //--------------------------------------------------------------------- @@ -208,7 +209,9 @@ struct AgentSelectIf // updating a tile state. Similarly, we need to make sure that the load of previous tile states precede writing of // the stream-compacted items and, hence, we need a load acquire when reading those tile states. static constexpr MemoryOrder memory_order = - ((!KeepRejects) && MayAlias && (!loads_via_smem)) ? MemoryOrder::acquire_release : MemoryOrder::relaxed; + ((SelectionOpt == SelectImpl::SelectPotentiallyInPlace) && (!loads_via_smem)) + ? 
MemoryOrder::acquire_release + : MemoryOrder::relaxed; // If we need to enforce memory order for in-place stream compaction, wrap the default decoupled look-back tile // state in a helper class that enforces memory order on reads and writes @@ -847,7 +850,7 @@ struct AgentSelectIf 0, 0, num_tile_selections, - cub::Int2Type{}); + cub::Int2Type < SelectionOpt == SelectImpl::Partition > {}); return num_tile_selections; } @@ -930,7 +933,7 @@ struct AgentSelectIf num_selections_prefix, num_rejected_prefix, num_selections, - cub::Int2Type{}); + cub::Int2Type < SelectionOpt == SelectImpl::Partition > {}); return num_selections; } @@ -1019,33 +1022,4 @@ struct AgentSelectIf } // namespace select } // namespace detail -template -using partition_distinct_output_t CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the " - "public interface will be removed.") = - detail::select::partition_distinct_output_t; - -template -using AgentSelectIf CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " - "interface will be removed.") = - detail::select::AgentSelectIf< - AgentSelectIfPolicyT, - InputIteratorT, - FlagsInputIteratorT, - OutputIteratorWrapperT, - SelectOpT, - EqualityOpT, - OffsetT, - StreamingContextT, - KeepRejects, - MayAlias>; - CUB_NAMESPACE_END diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index b910bb91a2b..9548739885f 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -110,8 +110,8 @@ CUB_NAMESPACE_BEGIN struct DeviceAdjacentDifference { private: - template ; using DispatchT = - DispatchAdjacentDifference; + DispatchAdjacentDifference; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast(num_items), difference_op, stream); @@ -257,10 +257,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy"); - constexpr bool may_alias = false; - constexpr bool read_left = true; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } @@ -370,10 +367,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft"); - constexpr bool may_alias = true; - constexpr bool read_left = true; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } @@ -500,10 +494,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRightCopy"); - constexpr bool may_alias = false; - constexpr bool read_left = false; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } @@ -602,10 +593,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRight"); - constexpr bool may_alias = true; - constexpr bool read_left = false; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } }; diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index bc8cd343b18..a13f38591a6 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -190,7 +190,7 @@ struct DeviceCopy RangeOffsetT, BlockOffsetT, 
detail::batch_memcpy::policy_hub, - false>::Dispatch(d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream); + CopyAlg::Copy>::Dispatch(d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream); } }; diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh index dae104ce51f..70f96d092dd 100644 --- a/cub/cub/device/device_memcpy.cuh +++ b/cub/cub/device/device_memcpy.cuh @@ -197,13 +197,13 @@ struct DeviceMemcpy BufferOffsetT, BlockOffsetT, detail::batch_memcpy::policy_hub, - true>::Dispatch(d_temp_storage, - temp_storage_bytes, - input_buffer_it, - output_buffer_it, - buffer_sizes, - num_buffers, - stream); + CopyAlg::Memcpy>::Dispatch(d_temp_storage, + temp_storage_bytes, + input_buffer_it, + output_buffer_it, + buffer_sizes, + num_buffers, + stream); } }; diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 768d8413e6f..b381800ff65 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -200,7 +200,7 @@ struct DevicePartition SelectOp, EqualityOp, OffsetT, - true>; + SelectImpl::Partition>; // Check if the number of items exceeds the range covered by the selected signed offset type cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); @@ -365,7 +365,7 @@ struct DevicePartition SelectOp, EqualityOp, OffsetT, - true>; + SelectImpl::Partition>; return DispatchSelectIfT::Dispatch( d_temp_storage, @@ -381,7 +381,7 @@ struct DevicePartition } private: - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::false_type, void* d_temp_storage, @@ -137,7 +137,7 @@ private: int end_bit, cudaStream_t stream); - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::true_type, void* d_temp_storage, @@ -151,25 +151,21 @@ private: int end_bit, cudaStream_t stream) { - return DispatchRadixSort< - IsDescending, - KeyT, - ValueT, - OffsetT, - detail::radix::policy_hub, - DecomposerT>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - static_cast(num_items), - begin_bit, - end_bit, - is_overwrite_okay, - stream, - decomposer); + return DispatchRadixSort, DecomposerT>:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + static_cast(num_items), + begin_bit, + end_bit, + is_overwrite_okay, + stream, + decomposer); } - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::false_type, void* d_temp_storage, @@ -181,7 +177,7 @@ private: DecomposerT decomposer, cudaStream_t stream); - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::true_type, void* d_temp_storage, @@ -196,7 +192,7 @@ private: constexpr int begin_bit = 0; const int end_bit = detail::radix::traits_t::default_end_bit(decomposer); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( ::cuda::std::true_type{}, d_temp_storage, temp_storage_bytes, @@ -354,7 +350,7 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -500,11 +496,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
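For comparison, the call-site change applied to the radix-sort entry points in this file can be sketched as follows; the argument names are the ones used by the surrounding overloads, and the snippet is illustrative only.

// Before: the leading template argument was a bare bool (false meaning ascending).
// cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(...);

// After: the sort direction is named explicitly.
cub::DispatchRadixSort<cub::SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
  d_temp_storage, temp_storage_bytes, d_keys, d_values,
  num_items, begin_bit, end_bit, is_overwrite_okay, stream);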
constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -639,11 +634,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -786,7 +780,7 @@ public: constexpr bool is_overwrite_okay = true; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -909,9 +903,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1055,9 +1048,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1200,7 +1192,7 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -1340,11 +1332,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1481,11 +1472,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1623,7 +1613,7 @@ public: constexpr bool is_overwrite_okay = true; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -1747,9 +1737,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1894,9 +1883,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2035,7 +2023,7 @@ public: // Null value type DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -2168,11 +2156,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2297,11 +2284,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2429,7 +2415,7 @@ public: // Null value type DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -2539,10 +2525,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2673,10 +2658,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2804,7 +2788,7 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -2930,11 +2914,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -3057,11 +3040,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -3184,7 +3166,7 @@ public: // Null value type DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -3295,10 +3277,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -3430,10 +3411,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 0c1638bd955..63129dec5a6 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -1157,7 +1157,6 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; using AccumT = ::cuda::std::__accumulator_t, InitValueT>; - constexpr bool ForceInclusive = true; return DispatchScan< InputIteratorT, @@ -1167,14 +1166,14 @@ struct DeviceScan OffsetT, AccumT, detail::scan::policy_hub, - ForceInclusive>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream); + ForceInclusive::Yes>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue(init_value), + num_items, + stream); } //! @rst diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index ae47119bfa3..ef6d2369f22 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -249,19 +249,20 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! 
@rst @@ -423,19 +424,20 @@ public: // Signed integer type for global offsets using OffsetT = int; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! @rst @@ -597,19 +599,24 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! @rst @@ -775,19 +782,24 @@ public: // Signed integer type for global offsets using OffsetT = int; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! @} end member group @@ -936,19 +948,24 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Ascending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! @rst @@ -1102,19 +1119,24 @@ public: // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Ascending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! 
@rst @@ -1259,19 +1281,24 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! @rst @@ -1423,19 +1450,24 @@ public: // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! @} end member group diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 2347666289e..2816cf562ff 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -149,13 +149,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -325,13 +324,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -495,12 +493,11 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -666,12 +663,11 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -1388,13 +1384,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1591,13 +1586,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - 
DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1788,13 +1782,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, @@ -1987,13 +1980,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 7d5099ca7e1..3cb0c2aeb73 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -190,8 +190,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, @@ -298,25 +297,25 @@ struct DeviceSelect using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) - return DispatchSelectIf< - IteratorT, - FlagIterator, - IteratorT, - NumSelectedIteratorT, - SelectOp, - EqualityOp, - OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ true>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_data, // in - d_flags, - d_data, // out - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream); + return DispatchSelectIf:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_data, // in + d_flags, + d_data, // out + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream); } //! @rst @@ -445,8 +444,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, nullptr, @@ -565,27 +563,25 @@ struct DeviceSelect using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) - constexpr bool may_alias = true; - - return DispatchSelectIf< - IteratorT, - FlagIterator, - IteratorT, - NumSelectedIteratorT, - SelectOp, - EqualityOp, - OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ may_alias>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_data, // in - nullptr, - d_data, // out - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream); + return DispatchSelectIf:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_data, // in + nullptr, + d_data, // out + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream); } //! 
@rst @@ -694,8 +690,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, @@ -792,25 +787,25 @@ struct DeviceSelect using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) - return DispatchSelectIf< - IteratorT, - FlagIterator, - IteratorT, - NumSelectedIteratorT, - SelectOp, - EqualityOp, - OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ true>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_data, // in - d_flags, - d_data, // out - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream); + return DispatchSelectIf:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_data, // in + d_flags, + d_data, // out + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream); } //! @rst @@ -919,8 +914,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, nullptr, diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 7c19fce3f52..89cd804ea79 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -74,10 +74,16 @@ struct DeviceTransform return error; } - return detail::transform:: - dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch( - ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + return detail::transform::dispatch_t< + detail::transform::requires_stable_address::no, + offset_t, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp>::dispatch(::cuda::std::move(inputs), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -212,10 +218,16 @@ struct DeviceTransform return error; } - return detail::transform:: - dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch( - ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + return detail::transform::dispatch_t< + detail::transform::requires_stable_address::yes, + offset_t, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp>::dispatch(::cuda::std::move(inputs), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 11ea8be82b8..8138f7c087d 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -106,13 +107,19 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( } // namespace detail::adjacent_difference +enum class ReadOption +{ + Left, + Right +}; + template > + MayAlias AliasOpt, + ReadOption ReadOpt, + typename PolicyHub = detail::adjacent_difference::policy_hub> struct DispatchAdjacentDifference { using InputT = typename std::iterator_traits::value_type; @@ -155,10 +162,10 @@ struct DispatchAdjacentDifference constexpr int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE; const int num_tiles = 
static_cast(::cuda::ceil_div(num_items, tile_size)); - std::size_t first_tile_previous_size = MayAlias * num_tiles * sizeof(InputT); + std::size_t first_tile_previous_size = (AliasOpt == MayAlias::Yes) * num_tiles * sizeof(InputT); void* allocations[1] = {nullptr}; - std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size}; + std::size_t allocation_sizes[1] = {(AliasOpt == MayAlias::Yes) * first_tile_previous_size}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); @@ -187,10 +194,10 @@ struct DispatchAdjacentDifference auto first_tile_previous = reinterpret_cast(allocations[0]); - if (MayAlias) + _CCCL_IF_CONSTEXPR (AliasOpt == MayAlias::Yes) { using AgentDifferenceInitT = - detail::adjacent_difference::AgentDifferenceInit; + detail::adjacent_difference::AgentDifferenceInit; constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size); @@ -236,15 +243,14 @@ struct DispatchAdjacentDifference THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) - .doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel< - typename PolicyHub::MaxPolicy, - InputIteratorT, - OutputIteratorT, - DifferenceOpT, - OffsetT, - InputT, - MayAlias, - ReadLeft>, + .doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel < typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + DifferenceOpT, + OffsetT, + InputT, + AliasOpt == MayAlias::Yes, + ReadOpt == ReadOption::Left >, d_input, first_tile_previous, d_output, diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 16650f41eef..073105c473c 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -62,6 +62,12 @@ CUB_NAMESPACE_BEGIN +enum class CopyAlg +{ + Memcpy, + Copy +}; + namespace detail { namespace batch_memcpy @@ -92,7 +98,7 @@ template + CopyAlg MemcpyOpt> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel( InputBufferIt input_buffer_it, @@ -106,8 +112,9 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; using BufferSizeT = value_t; /// Internal load/store type. 
For byte-wise memcpy, a single-byte type - using AliasT = typename ::cuda::std:: - conditional, std::iterator_traits>>::type::value_type; + using AliasT = typename ::cuda::std::conditional, + std::iterator_traits>>::type::value_type; /// Types of the input and output buffers using InputBufferT = value_t; using OutputBufferT = value_t; @@ -164,15 +171,17 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO { if (thread_offset < buffer_sizes[buffer_id]) { - const auto value = read_item(input_buffer_it[buffer_id], thread_offset); - write_item(output_buffer_it[buffer_id], thread_offset, value); + const auto value = read_item < MemcpyOpt == CopyAlg::Memcpy, AliasT, + InputBufferT > (input_buffer_it[buffer_id], thread_offset); + write_item( + output_buffer_it[buffer_id], thread_offset, value); } thread_offset += BLOCK_THREADS; } } else { - copy_items( + copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], (::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), @@ -216,7 +225,7 @@ template + CopyAlg MemcpyOpt> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel( InputBufferIt input_buffer_it, @@ -250,7 +259,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO BlockOffsetT, BLevBufferOffsetTileState, BLevBlockOffsetTileState, - IsMemcpy>; + MemcpyOpt == CopyAlg::Memcpy>; // Shared memory for AgentBatchMemcpy __shared__ typename AgentBatchMemcpyT::TempStorage temp_storage; @@ -289,7 +298,7 @@ template , - bool IsMemcpy = true> + CopyAlg MemcpyOpt = CopyAlg::Memcpy> struct DispatchBatchMemcpy { //------------------------------------------------------------------------------ @@ -387,8 +396,10 @@ struct DispatchBatchMemcpy // The number of thread blocks (or tiles) required to process all of the given buffers BlockOffsetT num_tiles = ::cuda::ceil_div(num_buffers, TILE_SIZE); - using BlevBufferSrcsOutT = ::cuda::std::_If>; - using BlevBufferDstOutT = ::cuda::std::_If>; + using BlevBufferSrcsOutT = + ::cuda::std::_If>; + using BlevBufferDstOutT = + ::cuda::std::_If>; using BlevBufferSrcsOutItT = BlevBufferSrcsOutT*; using BlevBufferDstsOutItT = BlevBufferDstOutT*; using BlevBufferSizesOutItT = BufferSizeT*; @@ -479,7 +490,7 @@ struct DispatchBatchMemcpy BlockOffsetT, BLevBufferOffsetTileState, BLevBlockOffsetTileState, - IsMemcpy>; + MemcpyOpt>; auto multi_block_memcpy_kernel = detail::batch_memcpy::MultiBlockBatchMemcpyKernel< typename PolicyHub::MaxPolicy, @@ -490,7 +501,7 @@ struct DispatchBatchMemcpy BlevBufferTileOffsetsOutItT, BLevBufferOffsetTileState, BlockOffsetT, - IsMemcpy>; + MemcpyOpt>; constexpr uint32_t BLEV_BLOCK_THREADS = ActivePolicyT::AgentLargeBufferPolicyT::BLOCK_THREADS; diff --git a/cub/cub/device/dispatch/dispatch_common.cuh b/cub/cub/device/dispatch/dispatch_common.cuh new file mode 100644 index 00000000000..a9ad5ad39ff --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_common.cuh @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +CUB_NAMESPACE_BEGIN + +// Options for specifying memory aliasing +enum class MayAlias +{ + Yes, + No +}; + +// Options for specifying sorting order. +enum class SortOrder +{ + Ascending, + Descending +}; + +// Options for specifying the behavior of the stream compaction algorithm. +enum class SelectImpl +{ + // Stream compaction, discarding rejected items. It's guaranteed that memory of input and output are disjoint. + Select, + // Stream compaction, discarding rejected items. Memory of the input may be identical to the memory of the output. + SelectPotentiallyInPlace, + // Partition, keeping rejected items. It's guaranteed that memory of input and output are disjoint. + Partition +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index d0a08110618..e6081e56528 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -90,8 +91,8 @@ namespace detail::radix_sort * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -120,7 +121,7 @@ namespace detail::radix_sort */ template @@ -168,7 +169,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp __syncthreads(); // Write out digit counts (striped) - upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); + upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } /** @@ -234,8 +235,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -276,7 +277,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), */ template ; + AgentRadixSortDownsweep; // Shared memory storage __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; @@ -334,8 +335,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo * @tparam ChainedPolicyT * Chained tuning policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether or not to use the alternate (lower-bits) policy * * @tparam KeyT * Key type @@ -368,7 +369,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
int(ChainedPolicyT::ActivePolicy::AltDo * The past-the-end (most-significant) bit index needed for key comparison */ template (default_key_bits); @@ -453,7 +454,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE // Sort tile BlockRadixSortT(temp_storage.sort) .SortBlockedToStriped( - keys, values, current_bit, end_bit, Int2Type(), Int2Type(), decomposer); + keys, values, current_bit, end_bit, Int2Type(), Int2Type(), decomposer); // Store keys and values #pragma unroll @@ -480,8 +481,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -533,7 +534,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE */ template ; // Downsweep type - using BlockDownsweepT = - detail::radix_sort::AgentRadixSortDownsweep; + using BlockDownsweepT = detail::radix_sort:: + AgentRadixSortDownsweep; enum { @@ -629,7 +630,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen __syncthreads(); - if (IS_DESCENDING) + if (Order == SortOrder::Descending) { // Reverse bin counts #pragma unroll @@ -668,7 +669,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen bin_offset[track] += segment_begin; } - if (IS_DESCENDING) + if (Order == SortOrder::Descending) { // Reverse bin offsets #pragma unroll @@ -725,7 +726,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen * Histogram kernel */ template @@ -734,15 +735,15 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {}) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; - using AgentT = - detail::radix_sort::AgentRadixSortHistogram; + using AgentT = detail::radix_sort:: + AgentRadixSortHistogram; __shared__ typename AgentT::TempStorage temp_storage; AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit, decomposer); agent.Process(); } template ; + using AgentT = detail::radix_sort::AgentRadixSortOnesweep< + OnesweepPolicyT, + Order == SortOrder::Descending, + KeyT, + ValueT, + OffsetT, + PortionOffsetT, + DecomposerT>; __shared__ typename AgentT::TempStorage s; AgentT agent( @@ -839,8 +846,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_b /** * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -855,7 +862,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_b * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. 
*/ -template ; + detail::radix_sort::DeviceRadixSortHistogramKernel; error = CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0)); @@ -1406,7 +1413,7 @@ struct DispatchRadixSort auto onesweep_kernel = detail::radix_sort::DeviceRadixSortOnesweepKernel< max_policy_t, - IS_DESCENDING, + Order, KeyT, ValueT, OffsetT, @@ -1655,13 +1662,11 @@ struct DispatchRadixSort { // Invoke upsweep-downsweep return InvokePasses( - detail::radix_sort::DeviceRadixSortUpsweepKernel, - detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::DeviceRadixSortUpsweepKernel, detail::radix_sort::RadixSortScanBinsKernel, - detail::radix_sort:: - DeviceRadixSortDownsweepKernel, - detail::radix_sort:: - DeviceRadixSortDownsweepKernel); + detail::radix_sort::DeviceRadixSortDownsweepKernel, + detail::radix_sort::DeviceRadixSortDownsweepKernel); } template @@ -1761,8 +1766,7 @@ struct DispatchRadixSort { // Small, single tile size return InvokeSingleTile( - detail::radix_sort:: - DeviceRadixSortSingleTileKernel); + detail::radix_sort::DeviceRadixSortSingleTileKernel); } else { @@ -1866,8 +1870,8 @@ struct DispatchRadixSort * @brief Utility class for dispatching the appropriately-tuned kernels for segmented device-wide * radix sort * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -1884,7 +1888,7 @@ struct DispatchRadixSort * @tparam OffsetT * Signed integer type for global offsets */ -template + ForceInclusive EnforceInclusive> struct DeviceScanKernelSource { using ScanTileStateT = typename cub::ScanTileState; @@ -90,7 +95,7 @@ struct DeviceScanKernelSource InitValueT, OffsetT, AccumT, - ForceInclusive>) + EnforceInclusive == ForceInclusive::Yes>) CUB_RUNTIME_FUNCTION static constexpr std::size_t AccumSize() { @@ -124,8 +129,8 @@ struct DeviceScanKernelSource * @tparam OffsetT * Unsigned integer type for global offsets * - * @tparam ForceInclusive - * Boolean flag to force InclusiveScan invocation when true. + * @tparam EnforceInclusive + * Enum flag to specify whether to enforce inclusive scan. 
* */ template , - ::cuda::std::_If<::cuda::std::is_same_v, - cub::detail::value_t, - typename InitValueT::value_type>>, - typename PolicyHub = detail::scan::policy_hub, - bool ForceInclusive = false, - typename KernelSource = detail::scan::DeviceScanKernelSource< - typename PolicyHub::MaxPolicy, - InputIteratorT, - OutputIteratorT, - ScanOpT, - InitValueT, - OffsetT, - AccumT, - ForceInclusive>, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If<::cuda::std::is_same_v, + cub::detail::value_t, + typename InitValueT::value_type>>, + typename PolicyHub = detail::scan::policy_hub, + ForceInclusive EnforceInclusive = ForceInclusive::No, + typename KernelSource = detail::scan::DeviceScanKernelSource< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, typename KernelLauncherFactory = detail::TripleChevronFactory> struct DispatchScan diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index e445ebcb5ad..f742914e5b4 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -148,7 +148,7 @@ _CCCL_HOST_DEVICE OffsetIteratorT make_offset_iterator(cons * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. */ -template ; + radix_sort::AgentSegmentedRadixSort; using WarpReduceT = cub::WarpReduce; using AgentWarpMergeSortT = - sub_warp_merge_sort::AgentSubWarpSort; + sub_warp_merge_sort::AgentSubWarpSort; __shared__ union { @@ -325,7 +325,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. */ -template (SmallPolicyT::WARP_THREADS); using MediumAgentWarpMergeSortT = - sub_warp_merge_sort::AgentSubWarpSort; + sub_warp_merge_sort::AgentSubWarpSort; using SmallAgentWarpMergeSortT = - sub_warp_merge_sort::AgentSubWarpSort; + sub_warp_merge_sort::AgentSubWarpSort; constexpr auto segments_per_medium_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); @@ -449,7 +449,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. 
*/ -template ; + radix_sort::AgentSegmentedRadixSort; __shared__ typename AgentSegmentedRadixSortT::TempStorage storage; @@ -748,7 +748,7 @@ __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContin } // namespace detail::segmented_sort -template ( detail::segmented_sort::DeviceSegmentedSortKernelLarge< - IS_DESCENDING, + Order, MaxPolicyT, KeyT, ValueT, @@ -1086,7 +1086,7 @@ struct DispatchSegmentedSort StreamingEndOffsetIteratorT, OffsetT>, detail::segmented_sort::DeviceSegmentedSortKernelSmall< - IS_DESCENDING, + Order, MaxPolicyT, KeyT, ValueT, @@ -1110,7 +1110,7 @@ struct DispatchSegmentedSort error = SortWithoutPartitioning( detail::segmented_sort::DeviceSegmentedSortFallbackKernel< - IS_DESCENDING, + Order, MaxPolicyT, KeyT, ValueT, diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index c6f2acc993a..a0f37e796b1 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -45,6 +45,7 @@ #endif // no system header #include +#include #include #include #include @@ -191,7 +192,7 @@ public: /** * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`. */ -template +template struct agent_select_if_wrapper_t { // Using an explicit list of template parameters forwarded to AgentSelectIf, since MSVC complains about a template @@ -213,8 +214,7 @@ struct agent_select_if_wrapper_t EqualityOpT, OffsetT, StreamingContextT, - KeepRejects, - MayAlias> + SelectionOpt> { using AgentSelectIf::AgentSelectIf; + SelectionOpt>::AgentSelectIf; }; }; @@ -321,12 +320,11 @@ template + SelectImpl SelectionOpt> __launch_bounds__(int( vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -349,7 +347,7 @@ __launch_bounds__(int( { using VsmemHelperT = vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -409,8 +407,9 @@ __launch_bounds__(int( * @tparam OffsetT * Signed integer type for global offsets * - * @tparam KeepRejects - * Whether or not we push rejected items to the back of the output + * @tparam SelectImpl SelectionOpt + * SelectImpl indicating whether to partition, just selection or selection where the memory for the input and + * output may alias each other. 
*/ template , cub::detail::value_t, detail::select::per_partition_offset_t, - MayAlias, - KeepRejects>> + (SelectionOpt == SelectImpl::SelectPotentiallyInPlace), + (SelectionOpt == SelectImpl::Partition)>> struct DispatchSelectIf { /****************************************************************************** @@ -561,7 +559,7 @@ struct DispatchSelectIf using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, - detail::select::agent_select_if_wrapper_t::template agent_t, + detail::select::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -764,8 +762,7 @@ struct DispatchSelectIf EqualityOpT, per_partition_offset_t, streaming_context_t, - KeepRejects, - MayAlias>); + SelectionOpt>); } /** diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 5e796e14e2d..b81eb164c37 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -49,6 +49,13 @@ CUB_NAMESPACE_BEGIN namespace detail::transform { + +enum class requires_stable_address +{ + no, + yes +}; + template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) { @@ -608,21 +615,21 @@ struct prefetch_config int sm_count; }; -template > + typename PolicyHub = policy_hub> struct dispatch_t; -template -struct dispatch_t, RandomAccessIteratorOut, diff --git a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu index 224caf9a43b..ba2538a4976 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu @@ -50,7 +50,7 @@ // TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is // addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset // types -template +template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper( void* d_temp_storage, size_t& temp_storage_bytes, @@ -69,7 +69,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_rad cub::DoubleBuffer d_values; cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); auto status = cub::DispatchSegmentedRadixSort< - IS_DESCENDING, + Order, KeyT, cub::NullType, BeginOffsetIteratorT, @@ -103,8 +103,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_rad DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, + dispatch_segmented_radix_sort_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort); // TODO: // - int128 diff --git a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu index a5b7529f3d1..1c88eb69452 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu @@ -46,7 +46,7 @@ // TODO replace with DeviceSegmentedRadixSort::SortPairs 
interface once https://github.com/NVIDIA/cccl/issues/50 is // addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset // types -template d_keys(const_cast(d_keys_in), d_keys_out); cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); auto status = cub::DispatchSegmentedRadixSort< - IS_DESCENDING, + Order, KeyT, ValueT, BeginOffsetIteratorT, @@ -105,9 +105,10 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_rad DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairs, sort_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairsDescending, sort_pairs_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, dispatch_segmented_radix_sort_pairs_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, dispatch_segmented_radix_sort_pairs); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, + dispatch_segmented_radix_sort_pairs); using custom_value_t = c2h::custom_type_t; using value_types = c2h::type_list; diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 95c4794b8cf..1bb00efd1ad 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -63,8 +63,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( return cudaSuccess; } - constexpr bool RequiresStableAddress = false; - return cub::detail::transform::dispatch_t, RandomAccessIteratorOut, diff --git a/thrust/thrust/system/cuda/detail/adjacent_difference.h b/thrust/thrust/system/cuda/detail/adjacent_difference.h index 3372b9eaf25..c62aadfc14d 100644 --- a/thrust/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/thrust/system/cuda/detail/adjacent_difference.h @@ -73,7 +73,7 @@ namespace cuda_cub namespace __adjacent_difference { -template +template cudaError_t THRUST_RUNTIME_FUNCTION doit_step( void* d_temp_storage, size_t& temp_storage_bytes, @@ -88,11 +88,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( return cudaSuccess; } - constexpr bool may_alias = MayAlias; - constexpr bool read_left = true; + constexpr cub::ReadOption read_left = cub::ReadOption::Left; - using Dispatch32 = cub::DispatchAdjacentDifference; - using Dispatch64 = cub::DispatchAdjacentDifference; + using Dispatch32 = cub::DispatchAdjacentDifference; + using Dispatch64 = cub::DispatchAdjacentDifference; cudaError_t status; THRUST_INDEX_TYPE_DISPATCH2( @@ -115,8 +114,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( cudaStream_t stream, thrust::detail::integral_constant /* comparable */) { - constexpr bool may_alias = true; - return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); + return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); } template @@ -135,12 +133,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // `num_items`. In the latter case, we use an optimized version. 
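// [Editorial sketch, not part of the patch] The branch below replaces the local
// `constexpr bool may_alias` flags with the cub::MayAlias enum introduced in
// dispatch_common.cuh. Assuming the doit_step overload defined earlier in this
// file, the intended call shape is roughly:
//
//   if (first != result)
//   {
//     // input and output are known to be disjoint
//     return doit_step<cub::MayAlias::No>(
//       d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream);
//   }
//   // in-place update: the input may alias the output
//   return doit_step<cub::MayAlias::Yes>(
//     d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream);
//
// The exact template-argument spelling is inferred from the rest of this patch;
// only the enum values themselves are taken verbatim from dispatch_common.cuh.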
if (first != result) { - constexpr bool may_alias = false; - return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); + return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); } - constexpr bool may_alias = true; - return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); + return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); } template diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index a8fa4dd9faf..f723a1e31c0 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -136,7 +136,6 @@ unique_eager_event async_inclusive_scan_n( using InputValueT = cub::detail::InputValue; using AccumT = typename ::cuda::std:: __accumulator_t::value_type, InitialValueType>; - constexpr bool ForceInclusive = true; using Dispatch32 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; using Dispatch64 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; InputValueT init_value(init); diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h index a0e5f6e254d..4b1577f6bb6 100644 --- a/thrust/thrust/system/cuda/detail/copy_if.h +++ b/thrust/thrust/system/cuda/detail/copy_if.h @@ -87,16 +87,7 @@ namespace cuda_cub namespace detail { -/** - * Enum class to indicate whether the memory of input and output iterators potentially alias one another. - */ -enum class InputMayAliasOutput -{ - no, - yes -}; - -template ::Dispatch(nullptr, - allocation_sizes[0], - first, - stencil, - output, - static_cast(nullptr), - predicate, - equality_op_t{}, - num_items, - stream); + status = cub:: + DispatchSelectIf:: + Dispatch( + nullptr, + allocation_sizes[0], + first, + stencil, + output, + static_cast(nullptr), + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); @@ -169,25 +150,19 @@ struct DispatchCopyIf OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[1]); // Run algorithm - status = cub::DispatchSelectIf< - InputIt, - StencilIt, - OutputIt, - num_selected_out_it_t, - Predicate, - equality_op_t, - OffsetT, - keep_rejects, - may_alias>::Dispatch(allocations[0], - allocation_sizes[0], - first, - stencil, - output, - d_num_selected_out, - predicate, - equality_op_t{}, - num_items, - stream); + status = cub:: + DispatchSelectIf:: + Dispatch( + allocations[0], + allocation_sizes[0], + first, + stencil, + output, + d_num_selected_out, + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); // Get number of selected items @@ -199,7 +174,7 @@ struct DispatchCopyIf } }; -template ; + using dispatch64_t = DispatchCopyIf; // Query temporary storage requirements status = dispatch64_t::dispatch( @@ -253,7 +228,7 @@ template & policy, InputIterator first, InputIterator last, OutputIterator result, Predicate pred) { - THRUST_CDP_DISPATCH((return detail::copy_if( + THRUST_CDP_DISPATCH((return detail::copy_if( policy, first, last, static_cast(nullptr), result, pred);), (return thrust::copy_if(cvt_to_seq(derived_cast(policy)), first, last, result, pred);)); } @@ -268,9 +243,8 @@ OutputIterator _CCCL_HOST_DEVICE copy_if( OutputIterator result, Predicate 
pred) { - THRUST_CDP_DISPATCH( - (return detail::copy_if(policy, first, last, stencil, result, pred);), - (return thrust::copy_if(cvt_to_seq(derived_cast(policy)), first, last, stencil, result, pred);)); + THRUST_CDP_DISPATCH((return detail::copy_if(policy, first, last, stencil, result, pred);), + (return thrust::copy_if(cvt_to_seq(derived_cast(policy)), first, last, stencil, result, pred);)); } } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h index 152381af039..36f69ad7d6e 100644 --- a/thrust/thrust/system/cuda/detail/partition.h +++ b/thrust/thrust/system/cuda/detail/partition.h @@ -87,10 +87,6 @@ struct DispatchPartitionIf std::size_t allocation_sizes[2] = {0, sizeof(OffsetT)}; void* allocations[2] = {nullptr, nullptr}; - // Partitioning algorithm keeps "rejected" items - constexpr bool keep_rejects = true; - constexpr bool may_alias = false; - // Query algorithm memory requirements status = cub::DispatchSelectIf< InputIt, @@ -100,17 +96,16 @@ struct DispatchPartitionIf Predicate, equality_op_t, OffsetT, - keep_rejects, - may_alias>::Dispatch(nullptr, - allocation_sizes[0], - first, - stencil, - output, - static_cast(nullptr), - predicate, - equality_op_t{}, - num_items, - stream); + cub::SelectImpl::Partition>::Dispatch(nullptr, + allocation_sizes[0], + first, + stencil, + output, + static_cast(nullptr), + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); @@ -141,17 +136,16 @@ struct DispatchPartitionIf Predicate, equality_op_t, OffsetT, - keep_rejects, - may_alias>::Dispatch(allocations[0], - allocation_sizes[0], - first, - stencil, - output, - d_num_selected_out, - predicate, - equality_op_t{}, - num_items, - stream); + cub::SelectImpl::Partition>::Dispatch(allocations[0], + allocation_sizes[0], + first, + stencil, + output, + d_num_selected_out, + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); // Get number of selected items diff --git a/thrust/thrust/system/cuda/detail/remove.h b/thrust/thrust/system/cuda/detail/remove.h index 99340801941..76d23eeb81d 100644 --- a/thrust/thrust/system/cuda/detail/remove.h +++ b/thrust/thrust/system/cuda/detail/remove.h @@ -50,7 +50,7 @@ template InputIt _CCCL_HOST_DEVICE remove_if(execution_policy& policy, InputIt first, InputIt last, StencilIt stencil, Predicate predicate) { - THRUST_CDP_DISPATCH((return cuda_cub::detail::copy_if( + THRUST_CDP_DISPATCH((return cuda_cub::detail::copy_if( policy, first, last, stencil, first, thrust::not_fn(predicate));), (return thrust::remove_if(cvt_to_seq(derived_cast(policy)), first, last, stencil, predicate);)); } @@ -60,7 +60,7 @@ template InputIt _CCCL_HOST_DEVICE remove_if(execution_policy& policy, InputIt first, InputIt last, Predicate predicate) { THRUST_CDP_DISPATCH( - (return cuda_cub::detail::copy_if( + (return cuda_cub::detail::copy_if( policy, first, last, static_cast(nullptr), first, thrust::not_fn(predicate));), (return thrust::remove_if(cvt_to_seq(derived_cast(policy)), first, last, predicate);)); } diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index 890e02ba198..ff7c0111343 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -121,7 +121,6 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( { using InputValueT = cub::detail::InputValue; using AccumT = 
typename ::cuda::std::__accumulator_t, InitValueT>; - constexpr bool ForceInclusive = true; using Dispatch32 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; using Dispatch64 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 64154a8f6da..2baf4810550 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -255,16 +255,17 @@ OutputIt THRUST_FUNCTION cub_transform_many( return result; } - constexpr auto requires_stable_address = !::cuda::proclaims_copyable_arguments::value; + constexpr auto stable_address = + (::cuda::proclaims_copyable_arguments::value) + ? cub::detail::transform::requires_stable_address::no + : cub::detail::transform::requires_stable_address::yes; cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (cub::detail::transform::dispatch_t, - OutputIt, - TransformOp>::dispatch), + (cub::detail::transform:: + dispatch_t, OutputIt, TransformOp>:: + dispatch), num_items, (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy))); throw_on_error(status, "transform: failed inside CUB");
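[Editorial note, not part of the patch] To make the new requires_stable_address knob concrete, the standalone sketch below mirrors the selection logic from cub_transform_many above. It assumes the CCCL headers named in this diff; square_t and the surrounding scaffolding are hypothetical and exist only to illustrate how the trait and the enum interact.

#include <cuda/functional> // ::cuda::proclaims_copyable_arguments (assumed header)
#include <cub/device/dispatch/dispatch_transform.cuh>

// Hypothetical transform op; an ordinary functor does not proclaim copyable arguments.
struct square_t
{
  __host__ __device__ int operator()(int x) const
  {
    return x * x;
  }
};

// Same compile-time mapping as in cub_transform_many: ops whose arguments are
// proclaimed copyable may be materialized in registers or shared memory, while
// everything else must read its inputs through a stable address.
constexpr auto stable_address =
  ::cuda::proclaims_copyable_arguments<square_t>::value
    ? cub::detail::transform::requires_stable_address::no
    : cub::detail::transform::requires_stable_address::yes;

static_assert(stable_address == cub::detail::transform::requires_stable_address::yes,
              "plain functors are conservatively treated as needing a stable address");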