Skip to content

Commit

Permalink
Adds support for large number of items and large number of segments t…
Browse files Browse the repository at this point in the history
…o `DeviceSegmentedSort` (#3308)

* fixes segment offset generation

* switches to analytical verification

* switches to analytical verification for pairs

* addresses review comments

* introduces segment offset type

* adds tests for large number of segments

* adds support for large number of segments

* drops segment offset type

* fixes thrust namespace

* removes about-to-be-deprecated cub iterators

* no exec specifier on defaulted ctor

* fixes gcc7 linker error

* uses local_segment_index_t throughout

* determine offset type based on type returned by segment iterator begin/end iterators

* minor style improvements
  • Loading branch information
elstehle authored Jan 14, 2025
1 parent d2da787 commit 08420d4
Show file tree
Hide file tree
Showing 5 changed files with 285 additions and 174 deletions.
143 changes: 85 additions & 58 deletions cub/cub/device/device_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,13 @@
# pragma system_header
#endif // no system header

#include <cub/detail/choose_offset.cuh>
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_segmented_sort.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/std/cstdint>

CUB_NAMESPACE_BEGIN

//! @rst
Expand Down Expand Up @@ -140,16 +143,19 @@ private:
std::size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = false;
constexpr bool is_overwrite_okay = false;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
Expand Down Expand Up @@ -286,8 +292,8 @@ public:
std::size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand All @@ -313,16 +319,19 @@ private:
std::size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = true;
constexpr bool is_overwrite_okay = false;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
Expand Down Expand Up @@ -454,8 +463,8 @@ public:
std::size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand All @@ -480,17 +489,18 @@ private:
void* d_temp_storage,
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = false;
constexpr bool is_overwrite_okay = true;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

DoubleBuffer<NullType> d_values;

Expand Down Expand Up @@ -632,8 +642,8 @@ public:
void* d_temp_storage,
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand All @@ -650,17 +660,18 @@ private:
void* d_temp_storage,
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = true;
constexpr bool is_overwrite_okay = true;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

DoubleBuffer<NullType> d_values;

Expand Down Expand Up @@ -803,8 +814,8 @@ public:
void* d_temp_storage,
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -931,8 +942,8 @@ public:
std::size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -1067,8 +1078,8 @@ public:
std::size_t& temp_storage_bytes,
const KeyT* d_keys_in,
KeyT* d_keys_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -1213,8 +1224,8 @@ public:
void* d_temp_storage,
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -1350,8 +1361,8 @@ public:
void* d_temp_storage,
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand All @@ -1371,15 +1382,19 @@ private:
KeyT* d_keys_out,
const ValueT* d_values_in,
ValueT* d_values_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = false;
constexpr bool is_overwrite_okay = false;
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
Expand Down Expand Up @@ -1539,8 +1554,8 @@ public:
KeyT* d_keys_out,
const ValueT* d_values_in,
ValueT* d_values_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -1570,15 +1585,19 @@ private:
KeyT* d_keys_out,
const ValueT* d_values_in,
ValueT* d_values_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = true;
constexpr bool is_overwrite_okay = false;
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
Expand Down Expand Up @@ -1734,8 +1753,8 @@ public:
KeyT* d_keys_out,
const ValueT* d_values_in,
ValueT* d_values_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -1763,15 +1782,19 @@ private:
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = false;
constexpr bool is_overwrite_okay = true;
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

return DispatchT::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -1931,8 +1954,8 @@ public:
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand All @@ -1958,15 +1981,19 @@ private:
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
constexpr bool is_descending = true;
constexpr bool is_overwrite_okay = true;
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;

using OffsetT =
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
using DispatchT =
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

return DispatchT::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -2125,8 +2152,8 @@ public:
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -2281,8 +2308,8 @@ public:
KeyT* d_keys_out,
const ValueT* d_values_in,
ValueT* d_values_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -2439,8 +2466,8 @@ public:
KeyT* d_keys_out,
const ValueT* d_values_in,
ValueT* d_values_out,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -2605,8 +2632,8 @@ public:
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down Expand Up @@ -2768,8 +2795,8 @@ public:
std::size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
::cuda::std::int64_t num_items,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
Expand Down
Loading

0 comments on commit 08420d4

Please sign in to comment.