diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 5690371d3fb..af085f65274 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -60,8 +60,6 @@ #include #include -#include -#include #include #include @@ -70,7 +68,10 @@ CUB_NAMESPACE_BEGIN -namespace detail::segmented_sort +namespace detail +{ + +namespace segmented_sort { // Type used to index within segments within a single invocation using local_segment_index_t = ::cuda::std::uint32_t; @@ -107,6 +108,8 @@ _CCCL_HOST_DEVICE OffsetIteratorT make_offset_iterator(cons { return OffsetIteratorT{it, offset_it}; } +} // namespace segmented_sort +} // namespace detail /** * @brief Fallback kernel, in case there's not enough segments to @@ -170,7 +173,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT; - const auto segment_id = static_cast(blockIdx.x); + const auto segment_id = static_cast(blockIdx.x); OffsetT segment_begin = d_begin_offsets[segment_id]; OffsetT segment_end = d_end_offsets[segment_id]; OffsetT num_items = segment_end - segment_begin; @@ -334,11 +337,11 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall( - local_segment_index_t small_segments, - local_segment_index_t medium_segments, - local_segment_index_t medium_blocks, - const local_segment_index_t* d_small_segments_indices, - const local_segment_index_t* d_medium_segments_indices, + detail::segmented_sort::local_segment_index_t small_segments, + detail::segmented_sort::local_segment_index_t medium_segments, + detail::segmented_sort::local_segment_index_t medium_blocks, + const detail::segmented_sort::local_segment_index_t* d_small_segments_indices, + const detail::segmented_sort::local_segment_index_t* d_medium_segments_indices, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, @@ -346,7 +349,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { - using local_segment_index_t = local_segment_index_t; + using local_segment_index_t = detail::segmented_sort::local_segment_index_t; const local_segment_index_t tid = threadIdx.x; const local_segment_index_t bid = blockIdx.x; @@ -458,7 +461,7 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelLarge( - const local_segment_index_t* d_segments_indices, + const detail::segmented_sort::local_segment_index_t* d_segments_indices, const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, device_double_buffer d_keys_double_buffer, @@ -470,7 +473,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; - using local_segment_index_t = local_segment_index_t; + using local_segment_index_t = detail::segmented_sort::local_segment_index_t; constexpr int small_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; @@ -577,12 +580,12 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - local_segment_index_t* group_sizes, - local_segment_index_t* large_and_medium_segments_indices, - local_segment_index_t* small_segments_indices, + detail::segmented_sort::local_segment_index_t* group_sizes, + detail::segmented_sort::local_segment_index_t* large_and_medium_segments_indices, + detail::segmented_sort::local_segment_index_t* small_segments_indices, cudaStream_t stream) { - using local_segment_index_t = local_segment_index_t; + using local_segment_index_t = detail::segmented_sort::local_segment_index_t; cudaError error = cudaSuccess; @@ -699,7 +702,7 @@ template d_keys_double_buffer, @@ -708,9 +711,9 @@ __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContin device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - local_segment_index_t* group_sizes, - local_segment_index_t* large_and_medium_segments_indices, - local_segment_index_t* small_segments_indices) + detail::segmented_sort::local_segment_index_t* group_sizes, + detail::segmented_sort::local_segment_index_t* large_and_medium_segments_indices, + detail::segmented_sort::local_segment_index_t* small_segments_indices) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; @@ -961,7 +964,7 @@ struct DispatchSegmentedSort constexpr auto num_segments_per_invocation_limit = static_cast(::cuda::std::numeric_limits::max()); auto const max_num_segments_per_invocation = static_cast( - (::cuda::std::min)(static_cast(num_segments), num_segments_per_invocation_limit)); + ::cuda::std::min(static_cast(num_segments), num_segments_per_invocation_limit)); large_and_medium_segments_indices.grow(max_num_segments_per_invocation); small_segments_indices.grow(max_num_segments_per_invocation); @@ -1078,22 +1081,20 @@ struct DispatchSegmentedSort // Partition input segments into size groups and assign specialized // kernels for each of them. error = SortWithPartitioning( - detail::segmented_sort::DeviceSegmentedSortKernelLarge< - IS_DESCENDING, - MaxPolicyT, - KeyT, - ValueT, - StreamingBeginOffsetIteratorT, - StreamingEndOffsetIteratorT, - OffsetT>, - detail::segmented_sort::DeviceSegmentedSortKernelSmall< - IS_DESCENDING, - MaxPolicyT, - KeyT, - ValueT, - StreamingBeginOffsetIteratorT, - StreamingEndOffsetIteratorT, - OffsetT>, + DeviceSegmentedSortKernelLarge, + DeviceSegmentedSortKernelSmall, three_way_partition_temp_storage_bytes, d_keys_double_buffer, d_values_double_buffer, diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 0297f8f80e3..eccb4cbcad4 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -80,7 +80,7 @@ struct segment_index_to_offset_op OffsetT segment_size; OffsetT num_items; - _CCCL_HOST_DEVICE __forceinline__ OffsetT operator()(SegmentIndexT i) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT operator()(SegmentIndexT i) { if (i < num_empty_segments) { @@ -103,16 +103,15 @@ struct mod_n std::size_t mod; template - _CCCL_HOST_DEVICE __forceinline__ T operator()(IndexT x) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T operator()(IndexT x) { return static_cast(x % mod); } }; template -class short_key_verification_helper +struct short_key_verification_helper { -private: using key_t = KeyT; // The histogram size of the keys being sorted for later verification const std::int64_t max_histo_size = std::int64_t{1} << ::cuda::std::numeric_limits::digits; diff --git a/cub/test/catch2_test_device_segmented_sort_keys.cu b/cub/test/catch2_test_device_segmented_sort_keys.cu index e1141129e2e..3d392e8e8f6 100644 --- a/cub/test/catch2_test_device_segmented_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_sort_keys.cu @@ -178,8 +178,6 @@ C2H_TEST("DeviceSegmentedSortKeys: Unspecified segments, random keys", "[keys][s test_unspecified_segments_random(C2H_SEED(4)); } -#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) - C2H_TEST("DeviceSegmentedSortKeys: very large number of segments", "[keys][segmented][sort][device]", all_offset_types) try { diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index 18ec30b7a7b..f24d30dbed1 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -199,8 +199,6 @@ C2H_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values", test_unspecified_segments_random(C2H_SEED(4)); } -#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) - C2H_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments", "[pairs][segmented][sort][device]", all_offset_types)