Skip to content

Commit

Permalink
Improves DeviceSegmentedSort test run time for large number of item…
Browse files Browse the repository at this point in the history
…s and segments (#3246)

* fixes segment offset generation

* switches to analytical verification

* switches to analytical verification for pairs

* fixes spelling

* adds tests for large number of segments

* fixes narrowing conversion in tests

* addresses review comments

* fixes includes
  • Loading branch information
elstehle authored Jan 14, 2025
1 parent d5d3aa6 commit 64a419a
Show file tree
Hide file tree
Showing 4 changed files with 312 additions and 360 deletions.
31 changes: 5 additions & 26 deletions cub/test/catch2_radix_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include <thrust/sequence.h>

#include <cuda/std/bit>
#include <cuda/std/functional>

#include <array>
#include <climits>
Expand All @@ -54,43 +55,21 @@
// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;
using all_offset_types =
c2h::type_list<cuda::std::int32_t, cuda::std::uint32_t, cuda::std::int64_t, cuda::std::uint64_t>;
c2h::type_list<cuda::std::int64_t, cuda::std::uint64_t, cuda::std::int32_t, cuda::std::uint32_t>;

// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory
template <typename OffsetT, OffsetT Step>
struct segment_iterator
{
OffsetT last = 0;

segment_iterator(OffsetT last1)
segment_iterator(std::int64_t last1)
: last{last1}
{}

__host__ __device__ OffsetT operator()(OffsetT x) const
__host__ __device__ OffsetT operator()(std::int64_t x) const
{
switch (x)
{
case Step * 100:
return Step * 100 + Step / 2;
case Step * 200:
return Step * 200 + Step / 2;
case Step * 300:
return Step * 300 + Step / 2;
case Step * 400:
return Step * 400 + Step / 2;
case Step * 500:
return Step * 500 + Step / 2;
case Step * 600:
return Step * 600 + Step / 2;
case Step * 700:
return Step * 700 + Step / 2;
case Step * 800:
return Step * 800 + Step / 2;
case Step * 900:
return Step * 900 + Step / 2;
default:
return (x >= last) ? last : x * Step;
}
return ::cuda::std::min(last, x * Step);
}
};

Expand Down
192 changes: 191 additions & 1 deletion cub/test/catch2_segmented_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

#include <cuda/std/limits>
#include <cuda/std/tuple>
Expand All @@ -46,11 +47,11 @@

#include <cstdio>

#include "catch2_test_launch_helper.h"
#include <c2h/catch2_test_helper.h>
#include <c2h/cpu_timer.h>
#include <c2h/extended_types.h>
#include <c2h/utility.h>
#include <catch2_test_launch_helper.h>
#include <nv/target>

#define MAKE_SEED_MOD_FUNCTION(name, xor_mask) \
Expand All @@ -71,6 +72,195 @@ MAKE_SEED_MOD_FUNCTION(offset_eraser, 0x3333333333333333)

#undef MAKE_SEED_MOD_FUNCTION

// Helper to generate a certain number of empty segments followed by equi-sized segments.
template <typename OffsetT, typename SegmentIndexT>
struct segment_index_to_offset_op
{
SegmentIndexT num_empty_segments;
SegmentIndexT num_segments;
OffsetT segment_size;
OffsetT num_items;

_CCCL_HOST_DEVICE __forceinline__ OffsetT operator()(SegmentIndexT i)
{
if (i < num_empty_segments)
{
return 0;
}
else if (i < num_segments)
{
return segment_size * static_cast<OffsetT>(i - num_empty_segments);
}
else
{
return num_items;
}
}
};

template <typename T>
struct mod_n
{
std::size_t mod;

template <typename IndexT>
_CCCL_HOST_DEVICE __forceinline__ T operator()(IndexT x)
{
return static_cast<T>(x % mod);
}
};

template <typename KeyT>
class short_key_verification_helper
{
private:
using key_t = KeyT;
// The histogram size of the keys being sorted for later verification
const std::int64_t max_histo_size = std::int64_t{1} << ::cuda::std::numeric_limits<key_t>::digits;

// Holding the histogram of the keys being sorted for verification
c2h::host_vector<std::size_t> keys_histogram{};

public:
void prepare_verification_data(const c2h::device_vector<key_t>& in_keys)
{
c2h::host_vector<key_t> h_in{in_keys};
keys_histogram = c2h::host_vector<std::size_t>(max_histo_size, 0);
for (const auto& key : h_in)
{
keys_histogram[key]++;
}
}

void verify_sorted(const c2h::device_vector<key_t>& out_keys) const
{
// Verify keys are sorted next to each other
auto count = thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to<int>());
REQUIRE(count <= max_histo_size);

// Verify keys are sorted using prior histogram computation
auto index_it = thrust::make_counting_iterator(std::size_t{0});
c2h::device_vector<key_t> unique_keys_out(count);
c2h::device_vector<std::size_t> unique_indexes_out(count);
thrust::unique_by_key_copy(
c2h::device_policy,
out_keys.cbegin(),
out_keys.cend(),
index_it,
unique_keys_out.begin(),
unique_indexes_out.begin());

for (int i = 0; i < count; i++)
{
auto const next_end = (i == count - 1) ? out_keys.size() : unique_indexes_out[i + 1];
REQUIRE(keys_histogram[unique_keys_out[i]] == next_end - unique_indexes_out[i]);
}
}
};

template <typename KeyT>
class segmented_verification_helper
{
private:
using key_t = KeyT;
const std::size_t sequence_length{};

// Analytically computes the histogram for a segment of a series of keys: [0, 1, 2, ..., mod_n - 1, 0, 1, 2, ...].
// `segment_end` is one-past-the-end of the segment to compute the histogram for.
c2h::host_vector<int> compute_histogram_of_series(std::size_t segment_offset, std::size_t segment_end) const
{
// The i-th full cycle begins after segment_offset
const auto start_cycle = cuda::ceil_div(segment_offset, sequence_length);

// The last full cycle ending before segment_end
const auto end_cycle = segment_end / sequence_length;

// Number of full cycles repeating the sequence
const int full_cycles = (end_cycle > start_cycle) ? static_cast<int>(end_cycle - start_cycle) : 0;

// Add contributions from full cycles
c2h::host_vector<int> histogram(sequence_length, full_cycles);

// Partial cycles preceding the first full cycle
for (std::size_t j = segment_offset; j < start_cycle * sequence_length; ++j)
{
const auto value = j % sequence_length;
histogram[value]++;
}

// Partial cycles following the last full cycle
for (std::size_t j = end_cycle * sequence_length; j < segment_end; ++j)
{
const auto value = j % sequence_length;
histogram[value]++;
}
return histogram;
}

public:
segmented_verification_helper(int sequence_length)
: sequence_length(sequence_length)
{}

void prepare_input_data(c2h::device_vector<key_t>& in_keys) const
{
auto data_gen_it =
thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), mod_n<key_t>{sequence_length});
thrust::copy_n(data_gen_it, in_keys.size(), in_keys.begin());
}

template <typename SegmentOffsetItT>
void verify_sorted(c2h::device_vector<key_t>& out_keys, SegmentOffsetItT offsets, std::size_t num_segments) const
{
// The segments' end-offsets are provided by the segments' begin-offset iterator
auto offsets_plus_1 = offsets + 1;

// Verify keys are sorted next to each other
const auto count = static_cast<std::size_t>(
thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to<int>()));
REQUIRE(count <= sequence_length * num_segments);

// // Verify keys are sorted using prior histogram computation
auto index_it = thrust::make_counting_iterator(std::size_t{0});
c2h::device_vector<key_t> unique_keys_out(count);
c2h::device_vector<std::size_t> unique_indexes_out(count);
thrust::unique_by_key_copy(
c2h::device_policy,
out_keys.cbegin(),
out_keys.cend(),
index_it,
unique_keys_out.begin(),
unique_indexes_out.begin());

// Copy the unique keys and indexes to host memory
c2h::host_vector<key_t> h_unique_keys_out{unique_keys_out};
c2h::host_vector<std::size_t> h_unique_indexes_out{unique_indexes_out};

// Verify keys are sorted using prior histogram computation
std::size_t uniques_index = 0;
std::size_t current_offset = 0;
for (std::size_t seg_index = 0; seg_index < num_segments; ++seg_index)
{
const auto segment_offset = offsets[seg_index];
const auto segment_end = offsets_plus_1[seg_index];
const auto segment_histogram = compute_histogram_of_series(segment_offset, segment_end);
for (std::size_t i = 0; i < sequence_length; i++)
{
if (segment_histogram[i] != 0)
{
CAPTURE(seg_index, i, uniques_index, current_offset, count);
auto const next_end =
(uniques_index == count - 1) ? out_keys.size() : h_unique_indexes_out[uniques_index + 1];
REQUIRE(h_unique_keys_out[uniques_index] == i);
REQUIRE(next_end - h_unique_indexes_out[uniques_index] == segment_histogram[i]);
current_offset += segment_histogram[i];
uniques_index++;
}
}
}
}
};

template <typename T>
struct unwrap_value_t_impl
{
Expand Down
Loading

0 comments on commit 64a419a

Please sign in to comment.