diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 67f2dd176ac..65d5eff956b 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -185,7 +185,7 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_include_directories(${config_c2h_target} PUBLIC "${CUB_SOURCE_DIR}/test") cub_clone_target_properties(${config_c2h_target} ${cub_target}) cub_configure_cuda_target(${config_c2h_target} RDC ${cdp_val}) - target_link_libraries(${config_c2h_target} PRIVATE ${cub_target} + target_link_libraries(${config_c2h_target} PRIVATE ${cub_target} PUBLIC CUDA::nvrtc CUDA::cuda_driver) if (CUB_C2H_ENABLE_CURAND) target_link_libraries(${config_c2h_target} PRIVATE CUDA::curand) @@ -283,8 +283,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) set_target_properties(${test_target} PROPERTIES EXCLUDE_FROM_ALL true EXCLUDE_FROM_DEFAULT_BUILD true) add_test(NAME ${test_target} - COMMAND ${CMAKE_COMMAND} --build "${CMAKE_BINARY_DIR}" - --target ${test_target} + COMMAND ${CMAKE_COMMAND} --build "${CMAKE_BINARY_DIR}" + --target ${test_target} --config $) string(REGEX MATCH "err_([0-9]+)" MATCH_RESULT "${test_name}") file(READ ${test_src} test_content) diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu index 67bf81e558c..5c06ac1bd42 100644 --- a/cub/test/c2h/generators.cu +++ b/cub/test/c2h/generators.cu @@ -28,6 +28,7 @@ #define C2H_EXPORTS #include +#include #include #include @@ -127,12 +128,13 @@ struct random_to_item_t template struct random_to_item_t { - float m_min; - float m_max; + using storage_t = cub::detail::conditional_t<(sizeof(T) > 4), double, float>; + storage_t m_min; + storage_t m_max; __host__ __device__ random_to_item_t(T min, T max) - : m_min(static_cast(min)) - , m_max(static_cast(max)) + : m_min(static_cast(min)) + , m_max(static_cast(max)) {} __device__ T operator()(float random_value) @@ -526,6 +528,9 @@ INSTANTIATE(__nv_fp8_e4m3); INSTANTIATE(float); INSTANTIATE(double); +INSTANTIATE(bool); +INSTANTIATE(char); + #ifdef TEST_HALF_T INSTANTIATE(half_t); #endif diff --git a/cub/test/c2h/utility.cuh b/cub/test/c2h/utility.cuh new file mode 100644 index 00000000000..51eae19e7db --- /dev/null +++ b/cub/test/c2h/utility.cuh @@ -0,0 +1,49 @@ +/****************************************************************************** + * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +namespace c2h +{ + +/** + * Return a value of type `T0` with the same bitwise representation of `in`. + * Types `To` and `From` must be the same size. + */ +template +__host__ __device__ +To bit_cast(const From& in) +{ + static_assert(sizeof(To) == sizeof(From), "Types must be same size."); + To out; + memcpy(&out, &in, sizeof(To)); + return out; +} + +} diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh new file mode 100644 index 00000000000..7fa775c6ba5 --- /dev/null +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -0,0 +1,252 @@ +/****************************************************************************** + * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "c2h/utility.cuh" +#include "catch2_test_helper.h" + +// The launchers defined in catch2_test_launch_helper.h do not support +// passing objects by reference since the device-launch tests cannot +// pass references to a __global__ function. The DoubleBuffer object +// must be passed by reference to the radix sort APIs so that the selector +// can be updated appropriately for the caller. This wrapper allows the +// selector to be updated in a way that's compatible with the launch helpers. +// Call initialize() before using to allocate temporary memory, and finalize() +// when finished to release. +struct double_buffer_sort_t +{ +private: + bool m_is_descending; + int* m_selector; + +public: + explicit double_buffer_sort_t(bool is_descending) + : m_is_descending(is_descending), + m_selector(nullptr) + { + } + + void initialize() + { + REQUIRE(cudaSuccess == cudaMallocHost(&m_selector, sizeof(int))); + } + + void finalize() + { + REQUIRE(cudaSuccess == cudaFreeHost(m_selector)); + m_selector = nullptr; + } + + int selector() const { return *m_selector;} + + template + CUB_RUNTIME_FUNCTION cudaError_t + operator()(std::uint8_t* d_temp_storage, std::size_t& temp_storage_bytes, cub::DoubleBuffer keys, As... as) + { + const cudaError_t status = + m_is_descending ? cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, keys, as...) + : cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keys, as...); + + *m_selector = keys.selector; + return status; + } + + template + CUB_RUNTIME_FUNCTION cudaError_t operator()( + std::uint8_t* d_temp_storage, + std::size_t& temp_storage_bytes, + cub::DoubleBuffer keys, + cub::DoubleBuffer values, + As... as) + { + const cudaError_t status = + m_is_descending ? cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, keys, values, as...) + : cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys, values, as...); + + *m_selector = keys.selector; + return status; + } +}; + +// Helpers to assist with specifying default args to DeviceRadixSort API: +template +constexpr int begin_bit() +{ + return 0; +} + +template +constexpr int end_bit() +{ + return static_cast(sizeof(T) * CHAR_BIT); +} + +template +thrust::host_vector +get_striped_keys(const thrust::host_vector &h_keys, + int begin_bit, + int end_bit) +{ + thrust::host_vector h_striped_keys(h_keys); + KeyT *h_striped_keys_data = thrust::raw_pointer_cast(h_striped_keys.data()); + + using traits_t = cub::Traits; + using bit_ordered_t = typename traits_t::UnsignedBits; + + const int num_bits = end_bit - begin_bit; + + for (std::size_t i = 0; i < h_keys.size(); i++) + { + bit_ordered_t key = c2h::bit_cast(h_keys[i]); + + CUB_IF_CONSTEXPR(traits_t::CATEGORY == cub::FLOATING_POINT) + { + const bit_ordered_t negative_zero = bit_ordered_t(1) << bit_ordered_t(sizeof(bit_ordered_t) * 8 - 1); + + if (key == negative_zero) + { + key = 0; + } + } + + key = traits_t::TwiddleIn(key); + + if ((begin_bit > 0) || (end_bit < static_cast(sizeof(KeyT) * 8))) + { + key &= ((bit_ordered_t{1} << num_bits) - 1) << begin_bit; + } + + // striped keys are used to compare bit ordered representation of keys, + // so we do not twiddle-out the key here: + // key = traits_t::TwiddleOut(key); + + memcpy(h_striped_keys_data + i, &key, sizeof(KeyT)); +} + + return h_striped_keys; +} + +template +struct indirect_binary_comparator_t +{ + const T* h_ptr{}; + bool is_descending{}; + + indirect_binary_comparator_t(const T* h_ptr, bool is_descending) + : h_ptr(h_ptr) + , is_descending(is_descending) + {} + + bool operator()(std::size_t a, std::size_t b) + { + if (is_descending) + { + return h_ptr[a] > h_ptr[b]; + } + + return h_ptr[a] < h_ptr[b]; + } +}; + +template +thrust::host_vector +get_permutation(const thrust::host_vector &h_keys, + bool is_descending, + int begin_bit, + int end_bit) +{ + thrust::host_vector h_striped_keys = + get_striped_keys(h_keys, begin_bit, end_bit); + + thrust::host_vector h_permutation(h_keys.size()); + thrust::sequence(h_permutation.begin(), h_permutation.end()); + + using traits_t = cub::Traits; + using bit_ordered_t = typename traits_t::UnsignedBits; + + auto bit_ordered_striped_keys = + reinterpret_cast(thrust::raw_pointer_cast(h_striped_keys.data())); + + std::stable_sort(h_permutation.begin(), + h_permutation.end(), + indirect_binary_comparator_t{bit_ordered_striped_keys, is_descending}); + + return h_permutation; +} + +template +thrust::host_vector +radix_sort_reference(const thrust::device_vector &d_keys, + bool is_descending, + int begin_bit = 0, + int end_bit = static_cast(sizeof(KeyT) * CHAR_BIT)) +{ + thrust::host_vector h_keys(d_keys); + thrust::host_vector h_permutation = + get_permutation(h_keys, is_descending, begin_bit, end_bit); + thrust::host_vector result(d_keys.size()); + thrust::gather(h_permutation.cbegin(), h_permutation.cend(), h_keys.cbegin(), result.begin()); + + return result; +} + +template +std::pair, thrust::host_vector> +radix_sort_reference(const thrust::device_vector &d_keys, + const thrust::device_vector &d_values, + bool is_descending, + int begin_bit = 0, + int end_bit = static_cast(sizeof(KeyT) * CHAR_BIT)) +{ + std::pair, thrust::host_vector> result; + result.first.resize(d_keys.size()); + result.second.resize(d_keys.size()); + + thrust::host_vector h_keys(d_keys); + thrust::host_vector h_permutation = + get_permutation(h_keys, is_descending, begin_bit, end_bit); + + thrust::host_vector h_values(d_values); + thrust::gather(h_permutation.cbegin(), + h_permutation.cend(), + thrust::make_zip_iterator(h_keys.cbegin(), h_values.cbegin()), + thrust::make_zip_iterator(result.first.begin(), result.second.begin())); + + return result; +} diff --git a/cub/test/catch2_test_block_radix_sort.cuh b/cub/test/catch2_test_block_radix_sort.cuh index 6b79457a434..8866e43a527 100644 --- a/cub/test/catch2_test_block_radix_sort.cuh +++ b/cub/test/catch2_test_block_radix_sort.cuh @@ -28,13 +28,9 @@ #pragma once #include -#include - -#include -#include -#include #include "catch2_test_helper.h" +#include "catch2_radix_sort_helper.cuh" template __global__ void kernel( - ActionT action, - InputIteratorT input, + ActionT action, + InputIteratorT input, OutputIteratorT output, int begin_bit, int end_bit, @@ -149,10 +145,10 @@ template __global__ void kernel( - ActionT action, - InputKeyIteratorT input_keys, + ActionT action, + InputKeyIteratorT input_keys, InputValueIteratorT input_values, - OutputKeyIteratorT output_keys, + OutputKeyIteratorT output_keys, OutputValueIteratorT output_values, int begin_bit, int end_bit, @@ -198,7 +194,7 @@ __global__ void kernel( output_values[threadIdx.x + ThreadsInBlock * i] = values[i]; } } - else + else { action(block_radix_sort, keys, @@ -357,140 +353,3 @@ struct descending_sort_pairs_op_t end_bit); } }; - -template -thrust::host_vector -get_striped_keys(const thrust::host_vector &h_keys, - int begin_bit, - int end_bit) -{ - thrust::host_vector h_striped_keys(h_keys); - KeyT *h_striped_keys_data = thrust::raw_pointer_cast(h_striped_keys.data()); - - using traits_t = cub::Traits; - using bit_ordered_t = typename traits_t::UnsignedBits; - - const int num_bits = end_bit - begin_bit; - - for (std::size_t i = 0; i < h_keys.size(); i++) - { - bit_ordered_t key = reinterpret_cast(h_keys[i]); - - CUB_IF_CONSTEXPR(traits_t::CATEGORY == cub::FLOATING_POINT) - { - const bit_ordered_t negative_zero = bit_ordered_t(1) << bit_ordered_t(sizeof(bit_ordered_t) * 8 - 1); - - if (key == negative_zero) - { - key = 0; - } - } - - key = traits_t::TwiddleIn(key); - - if ((begin_bit > 0) || (end_bit < static_cast(sizeof(KeyT) * 8))) - { - unsigned long long base = 0; - memcpy(&base, &key, sizeof(bit_ordered_t)); - base &= ((1ULL << num_bits) - 1) << begin_bit; - memcpy(&key, &base, sizeof(bit_ordered_t)); - } - - // striped keys are used to compare bit ordered representation of keys, - // so we do not twiddle-out the key here: - // key = traits_t::TwiddleOut(key); - - memcpy(h_striped_keys_data + i, &key, sizeof(KeyT)); -} - - return h_striped_keys; -} - -template -struct indirect_binary_comparator_t -{ - const T* h_ptr{}; - bool is_descending{}; - - indirect_binary_comparator_t(const T* h_ptr, bool is_descending) - : h_ptr(h_ptr) - , is_descending(is_descending) - {} - - bool operator()(std::size_t a, std::size_t b) - { - if (is_descending) - { - return h_ptr[a] > h_ptr[b]; - } - - return h_ptr[a] < h_ptr[b]; - } -}; - -template -thrust::host_vector -get_permutation(const thrust::host_vector &h_keys, - bool is_descending, - int begin_bit, - int end_bit) -{ - thrust::host_vector h_striped_keys = - get_striped_keys(h_keys, begin_bit, end_bit); - - thrust::host_vector h_permutation(h_keys.size()); - thrust::sequence(h_permutation.begin(), h_permutation.end()); - - using traits_t = cub::Traits; - using bit_ordered_t = typename traits_t::UnsignedBits; - - auto bit_ordered_striped_keys = - reinterpret_cast(thrust::raw_pointer_cast(h_striped_keys.data())); - - std::stable_sort(h_permutation.begin(), - h_permutation.end(), - indirect_binary_comparator_t{bit_ordered_striped_keys, is_descending}); - - return h_permutation; -} - -template -thrust::host_vector -radix_sort_reference(const thrust::device_vector &d_keys, - bool is_descending, - int begin_bit, - int end_bit) -{ - thrust::host_vector h_keys(d_keys); - thrust::host_vector h_permutation = - get_permutation(h_keys, is_descending, begin_bit, end_bit); - thrust::host_vector result(d_keys.size()); - thrust::gather(h_permutation.cbegin(), h_permutation.cend(), h_keys.cbegin(), result.begin()); - - return result; -} - -template -std::pair, thrust::host_vector> -radix_sort_reference(const thrust::device_vector &d_keys, - const thrust::device_vector &d_values, - bool is_descending, - int begin_bit, - int end_bit) -{ - std::pair, thrust::host_vector> result; - result.first.resize(d_keys.size()); - result.second.resize(d_keys.size()); - - thrust::host_vector h_keys(d_keys); - thrust::host_vector h_permutation = - get_permutation(h_keys, is_descending, begin_bit, end_bit); - - thrust::host_vector h_values(d_values); - thrust::gather(h_permutation.cbegin(), - h_permutation.cend(), - thrust::make_zip_iterator(h_keys.cbegin(), h_values.cbegin()), - thrust::make_zip_iterator(result.first.begin(), result.second.begin())); - - return result; -} diff --git a/cub/test/catch2_test_device_radix_sort_custom.cu b/cub/test/catch2_test_device_radix_sort_custom.cu index 44b657adc58..b951a8b9c42 100644 --- a/cub/test/catch2_test_device_radix_sort_custom.cu +++ b/cub/test/catch2_test_device_radix_sort_custom.cu @@ -39,6 +39,7 @@ #include #include +#include "catch2_radix_sort_helper.cuh" #include "catch2_test_launch_helper.h" #include "catch2_test_helper.h" #include "cub/util_type.cuh" @@ -266,55 +267,11 @@ CUB_TEST("Device radix sort can sort pairs with custom i128_t keys", "[radix][so REQUIRE(reference.second == out_values); } -struct double_buffer_sort_t -{ - bool is_descending; - int *selector; - - template - CUB_RUNTIME_FUNCTION cudaError_t operator()(std::uint8_t *d_temp_storage, - std::size_t &temp_storage_bytes, - cub::DoubleBuffer keys, - As... as) - { - const cudaError_t status = - is_descending - ? cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, keys, as...) - : cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keys, as...); - - *selector = keys.selector; - return status; - } - - template - CUB_RUNTIME_FUNCTION cudaError_t operator()(std::uint8_t *d_temp_storage, - std::size_t &temp_storage_bytes, - cub::DoubleBuffer keys, - cub::DoubleBuffer values, - As... as) - { - const cudaError_t status = - is_descending - ? cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys, - values, - as...) - : cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys, values, as...); - - *selector = keys.selector; - return status; - } -}; - CUB_TEST("Device radix sort works with custom i128_t (db)", "[radix][sort][device]") { constexpr int max_items = 1 << 18; const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items))); - int *selector = nullptr; - cudaMallocHost(&selector, sizeof(int)); - thrust::device_vector keys_1(num_items); thrust::device_vector keys_2(num_items); c2h::gen(CUB_SEED(2), keys_1); @@ -326,10 +283,13 @@ CUB_TEST("Device radix sort works with custom i128_t (db)", "[radix][sort][devic const bool is_descending = GENERATE(false, true); auto reference_keys = reference_sort_keys(keys_1, is_descending, 0, 128); - launch(double_buffer_sort_t{is_descending, selector}, keys, num_items, pair_decomposer_t{}); - keys.selector = *selector; - cudaFreeHost(selector); + double_buffer_sort_t action(is_descending); + action.initialize(); + launch(action, keys, num_items, pair_decomposer_t{}); + + keys.selector = action.selector(); + action.finalize(); thrust::device_vector &out_keys = keys.Current() == d_keys_1 ? keys_1 : keys_2; @@ -341,9 +301,6 @@ CUB_TEST("Device radix sort works with custom i128_t keys (db)", "[radix][sort][ constexpr int max_items = 1 << 18; const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items))); - int *selector = nullptr; - cudaMallocHost(&selector, sizeof(int)); - thrust::device_vector keys_1(num_items); thrust::device_vector keys_2(num_items); c2h::gen(CUB_SEED(2), keys_1); @@ -364,15 +321,18 @@ CUB_TEST("Device radix sort works with custom i128_t keys (db)", "[radix][sort][ const bool is_descending = GENERATE(false, true); auto reference_keys = reference_sort_pairs(keys_1, values_1, is_descending, 0, 128); - launch(double_buffer_sort_t{is_descending, selector}, + + double_buffer_sort_t action(is_descending); + action.initialize(); + launch(action, keys, values, num_items, pair_decomposer_t{}); - keys.selector = *selector; - values.selector = *selector; - cudaFreeHost(selector); + keys.selector = action.selector(); + values.selector = action.selector(); + action.finalize(); thrust::device_vector &out_keys = keys.Current() == d_keys_1 ? keys_1 : keys_2; thrust::device_vector &out_values = values.Current() == d_values_1 ? values_1 : values_2; @@ -470,9 +430,6 @@ CUB_TEST("Device radix sort works with bits of custom i128_t (db)", "[radix][sor constexpr int max_items = 1 << 18; const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items))); - int *selector = nullptr; - cudaMallocHost(&selector, sizeof(int)); - thrust::device_vector keys_1(num_items); thrust::device_vector keys_2(num_items); c2h::gen(CUB_SEED(2), keys_1); @@ -487,15 +444,18 @@ CUB_TEST("Device radix sort works with bits of custom i128_t (db)", "[radix][sor const bool is_descending = GENERATE(false, true); auto reference_keys = reference_sort_keys(keys_1, is_descending, begin_bit, end_bit); - launch(double_buffer_sort_t{is_descending, selector}, + + double_buffer_sort_t action(is_descending); + action.initialize(); + launch(action, keys, num_items, pair_decomposer_t{}, begin_bit, end_bit); - keys.selector = *selector; - cudaFreeHost(selector); + keys.selector = action.selector(); + action.finalize(); thrust::device_vector &out_keys = keys.Current() == d_keys_1 ? keys_1 : keys_2; @@ -532,7 +492,10 @@ CUB_TEST("Device radix sort works with bits of custom i128_t keys (db)", "[radix const bool is_descending = GENERATE(false, true); auto reference_keys = reference_sort_pairs(keys_1, values_1, is_descending, begin_bit, end_bit); - launch(double_buffer_sort_t{is_descending, selector}, + + double_buffer_sort_t action(is_descending); + action.initialize(); + launch(action, keys, values, num_items, @@ -540,9 +503,9 @@ CUB_TEST("Device radix sort works with bits of custom i128_t keys (db)", "[radix begin_bit, end_bit); - keys.selector = *selector; - values.selector = *selector; - cudaFreeHost(selector); + keys.selector = action.selector(); + values.selector = action.selector(); + action.finalize(); thrust::device_vector &out_keys = keys.Current() == d_keys_1 ? keys_1 : keys_2; thrust::device_vector &out_values = values.Current() == d_values_1 ? values_1 : values_2; @@ -578,8 +541,8 @@ struct decomposer_t }; // example-end custom-type -static __host__ std::ostream &operator<<(std::ostream &os, const custom_t &self) -{ +static __host__ std::ostream &operator<<(std::ostream &os, const custom_t &self) +{ return os << "{ " << self.f << ", " << self.lli << " }"; } @@ -1100,14 +1063,14 @@ CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][so // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = 01000001110000011001100110011010 00100000000000...0000 // decompose(in[1]) = 01000010001010011001100110011010 00010000000000...0000 - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> // // The bit subrange `[60, 68)` specifies differentiating key bits: // // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> thrust::device_vector out(num_items); @@ -1156,7 +1119,7 @@ CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][so constexpr int num_items = 2; thrust::device_vector in = { {42.4f, 1ll << 60}, - {24.2f, 1ll << 61} + {24.2f, 1ll << 61} }; constexpr int begin_bit = sizeof(long long int) * 8 - 4; // 60 @@ -1167,14 +1130,14 @@ CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][so // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = 01000010001010011001100110011010 00010000000000...0000 // decompose(in[1]) = 01000001110000011001100110011010 00100000000000...0000 - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> // // The bit subrange `[60, 68)` specifies differentiating key bits: // // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> thrust::device_vector out(num_items); @@ -1236,7 +1199,7 @@ CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][so // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = 01000001110000011001100110011010 00100000000000...0000 // decompose(in[1]) = 01000010001010011001100110011010 00010000000000...0000 - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> // // The bit subrange `[60, 68)` specifies differentiating key bits: // @@ -1302,7 +1265,7 @@ CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][so constexpr int num_items = 2; thrust::device_vector keys_in = { {42.4f, 1ll << 60}, - {24.2f, 1ll << 61} + {24.2f, 1ll << 61} }; thrust::device_vector vals_in = { 1, 0 }; @@ -1315,14 +1278,14 @@ CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][so // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = 01000010001010011001100110011010 00010000000000...0000 // decompose(in[1]) = 01000001110000011001100110011010 00100000000000...0000 - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> // // The bit subrange `[60, 68)` specifies differentiating key bits: // // <------------- fp32 -----------> <------ int64 ------> // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx - // <----------- higher bits / lower bits -----------> + // <----------- higher bits / lower bits -----------> thrust::device_vector keys_out(num_items); thrust::device_vector vals_out(num_items); diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu new file mode 100644 index 00000000000..2c129ab4daf --- /dev/null +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -0,0 +1,464 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "catch2_radix_sort_helper.cuh" +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceRadixSort::SortKeys, sort_keys); +DECLARE_LAUNCH_WRAPPER(cub::DeviceRadixSort::SortKeysDescending, sort_keys_descending); + +// %PARAM% TEST_KEY_BITS key_bits 8:16:32:64 + +// TODO: +// - int128 +// - uint128 + +// The unsigned integer for the given byte count should be first: +#if TEST_KEY_BITS == 8 +using key_types = c2h::type_list; +using bit_window_key_types = c2h::type_list; +#define NO_FP_KEY_TYPES +#elif TEST_KEY_BITS == 16 +// clang-format off +using key_types = c2h::type_list< + cuda::std::uint16_t + , cuda::std::int16_t +#ifdef TEST_HALF_T + , half_t +#endif +#ifdef TEST_BF_T + , bfloat16_t +#endif + >; +// clang-format on +using bit_window_key_types = c2h::type_list; +#define NO_FP_KEY_TYPES +#elif TEST_KEY_BITS == 32 +using key_types = c2h::type_list; +using bit_window_key_types = c2h::type_list; +using fp_key_types = c2h::type_list; +#elif TEST_KEY_BITS == 64 +using key_types = c2h::type_list; +using bit_window_key_types = c2h::type_list; +using fp_key_types = c2h::type_list; +#endif + +// Used for tests that just need a single type for testing: +using single_key_type = c2h::type_list>; + +// Index types used for NumItemsT testing +using num_items_types = + c2h::type_list; + +CUB_TEST("DeviceRadixSort::SortKeys: basic testing", "[keys][radix][sort][device]", key_types) +{ + using key_t = c2h::get<0, TestType>; + + constexpr std::size_t min_num_items = 1 << 5; + constexpr std::size_t max_num_items = 1 << 20; + const std::size_t num_items = GENERATE_COPY(std::size_t{0}, std::size_t{1}, take(8, random(min_num_items, max_num_items))); + + thrust::device_vector in_keys(num_items); + thrust::device_vector out_keys(num_items); + + const int num_key_seeds = 3; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + + REQUIRE(ref_keys == out_keys); +} + +CUB_TEST("DeviceRadixSort::SortKeys: bit windows", "[keys][radix][sort][device]", bit_window_key_types) +{ + using key_t = c2h::get<0, TestType>; + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + + constexpr int num_bits = sizeof(key_t) * CHAR_BIT; + // Explicitly use values<>({}) to workaround bug catchorg/Catch2#2040: + const int begin_bit = GENERATE_COPY(values({0, num_bits / 3, 3 * num_bits / 4, num_bits})); + const int end_bit = GENERATE_COPY(values({0, num_bits / 3, 3 * num_bits / 4, num_bits})); + if (end_bit < begin_bit || (begin_bit == 0 && end_bit == num_bits)) + { + // SKIP(); Not available until Catch2 3.3.0 + return; + } + + thrust::device_vector in_keys(num_items); + thrust::device_vector out_keys(num_items); + + const int num_key_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending, begin_bit, end_bit); + + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit, + end_bit); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit, + end_bit); + } + + REQUIRE(ref_keys == out_keys); +} + +#ifndef NO_FP_KEY_TYPES + +CUB_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sort][device]", fp_key_types) +{ + using key_t = c2h::get<0, TestType>; + using bits_t = typename cub::Traits::UnsignedBits; + + constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT; + const key_t positive_zero = c2h::bit_cast(bits_t(0)); + const key_t negative_zero = c2h::bit_cast(bits_t(1) << (num_bits - 1)); + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + thrust::device_vector in_keys(num_items); + thrust::device_vector out_keys(num_items); + + const int num_key_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + // Sprinkle some positive and negative zeros randomly throughout the keys: + { + const size_t num_indices = num_items / 128; + thrust::device_vector indices(num_indices); + for (int i = 0; i < 2; ++i) + { + c2h::gen(CUB_SEED(1), indices, std::size_t(0), num_items); + auto begin = thrust::make_constant_iterator(i == 0 ? positive_zero : negative_zero); + auto end = begin + num_indices; + thrust::scatter(begin, end, indices.cbegin(), in_keys.begin()); + } + } + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + + // Perform a bitwise comparison to ensure that 0 != -0: + REQUIRE_BITWISE_EQ(ref_keys, out_keys); +} + +CUB_TEST("DeviceRadixSort::SortKeys: NaN handling", "[keys][radix][sort][device]", fp_key_types) +{ + using key_t = c2h::get<0, TestType>; + using limits_t = cuda::std::numeric_limits; + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + thrust::device_vector in_keys(num_items); + thrust::device_vector out_keys(num_items); + + const int num_key_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + // Sprinkle some NaNs randomly throughout the keys: + { + const size_t num_indices = num_items / 128; + thrust::device_vector indices(num_indices); + bool has_nans = false; + for (int i = 0; i < 2; ++i) + { + const bool supported = i == 0 ? limits_t::has_signaling_NaN : limits_t::has_quiet_NaN; + const key_t nan_val = i == 0 ? limits_t::signaling_NaN() : limits_t::quiet_NaN(); + + if (supported) + { + has_nans = true; + c2h::gen(CUB_SEED(1), indices, std::size_t(0), num_items); + auto begin = thrust::make_constant_iterator(nan_val); + auto end = begin + num_indices; + thrust::scatter(begin, end, indices.cbegin(), in_keys.begin()); + } + } + if (!has_nans) + { + // SKIP(); Not available until Catch2 3.3.0 + return; + } + } + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + + REQUIRE_EQ_WITH_NAN_MATCHING(ref_keys, out_keys); +} + +#endif // !NO_FP_KEY_TYPES + +CUB_TEST("DeviceRadixSort::SortKeys: entropy reduction", "[keys][radix][sort][device]", single_key_type) +{ + using key_t = c2h::get<0, TestType>; + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + thrust::device_vector in_keys(num_items); + + const int num_key_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + // Repeatedly bitwise-and random keys together. This increases the likelyhood + // of duplicate keys. + const int entropy_reduction = GENERATE(1, 3, 9, 15); + { + thrust::device_vector tmp(num_items); + for (int i = 0; i < entropy_reduction; ++i) + { + c2h::gen(CUB_SEED(1), tmp); + thrust::transform(in_keys.cbegin(), in_keys.cend(), tmp.cbegin(), in_keys.begin(), thrust::bit_and{}); + } + } + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + thrust::device_vector out_keys(num_items); + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + + REQUIRE(ref_keys == out_keys); +} + +CUB_TEST("DeviceRadixSort::SortKeys: uniform values", "[keys][radix][sort][device]", key_types) +{ + using key_t = c2h::get<0, TestType>; + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + thrust::device_vector in_keys(num_items, key_t(4)); + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + thrust::device_vector out_keys(num_items); + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + + REQUIRE(ref_keys == out_keys); +} + +CUB_TEST("DeviceRadixSort::SortKeys: NumItemsT", "[keys][radix][sort][device]", single_key_type, num_items_types) +{ + using key_t = c2h::get<0, TestType>; + using num_items_t = c2h::get<1, TestType>; + + constexpr num_items_t min_num_items = 1 << 5; + constexpr num_items_t max_num_items = 1 << 20; + const num_items_t num_items = + GENERATE_COPY(num_items_t{0}, num_items_t{1}, take(8, random(min_num_items, max_num_items))); + + thrust::device_vector in_keys(num_items); + + const int num_key_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + thrust::device_vector out_keys(num_items); + if (is_descending) + { + sort_keys_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_keys(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + num_items, + begin_bit(), + end_bit()); + } + + REQUIRE(ref_keys == out_keys); +} + +CUB_TEST("DeviceRadixSort::SortKeys: DoubleBuffer API", "[keys][radix][sort][device]", single_key_type) +{ + using key_t = c2h::get<0, TestType>; + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + thrust::device_vector in_keys(num_items); + + const int num_key_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + + const bool is_descending = GENERATE(false, true); + + auto ref_keys = radix_sort_reference(in_keys, is_descending); + + thrust::device_vector out_keys(num_items); + cub::DoubleBuffer key_buffer( + thrust::raw_pointer_cast(in_keys.data()), thrust::raw_pointer_cast(out_keys.data())); + + double_buffer_sort_t action(is_descending); + action.initialize(); + launch(action, + key_buffer, + num_items, + begin_bit(), + end_bit()); + + key_buffer.selector = action.selector(); + action.finalize(); + + auto& keys = key_buffer.selector == 0 ? in_keys : out_keys; + + REQUIRE(ref_keys == keys); +} diff --git a/cub/test/catch2_test_device_radix_sort_pairs.cu b/cub/test/catch2_test_device_radix_sort_pairs.cu new file mode 100644 index 00000000000..fad693dd3c2 --- /dev/null +++ b/cub/test/catch2_test_device_radix_sort_pairs.cu @@ -0,0 +1,157 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "catch2_radix_sort_helper.cuh" +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceRadixSort::SortPairs, sort_pairs); +DECLARE_LAUNCH_WRAPPER(cub::DeviceRadixSort::SortPairsDescending, sort_pairs_descending); + +using custom_value_t = c2h::custom_type_t; +using value_types = c2h::type_list; + +using num_items_types = + c2h::type_list; + +CUB_TEST("DeviceRadixSort::SortPairs: Basic testing", "[pairs][radix][sort][device]", value_types, num_items_types) +{ + using key_t = cuda::std::uint32_t; + using value_t = c2h::get<0, TestType>; + using num_items_t = c2h::get<1, TestType>; + + constexpr num_items_t min_num_items = 1 << 5; + constexpr num_items_t max_num_items = 1 << 20; + const num_items_t num_items = + GENERATE_COPY(num_items_t{0}, num_items_t{1}, take(5, random(min_num_items, max_num_items))); + + thrust::device_vector in_keys(num_items); + thrust::device_vector out_keys(num_items); + + thrust::device_vector in_values(num_items); + thrust::device_vector out_values(num_items); + + const int num_key_seeds = 1; + const int num_value_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + + const bool is_descending = GENERATE(false, true); + + if (is_descending) + { + sort_pairs_descending( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + thrust::raw_pointer_cast(in_values.data()), + thrust::raw_pointer_cast(out_values.data()), + num_items, + begin_bit(), + end_bit()); + } + else + { + sort_pairs(thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + thrust::raw_pointer_cast(in_values.data()), + thrust::raw_pointer_cast(out_values.data()), + num_items, + begin_bit(), + end_bit()); + } + + auto refs = radix_sort_reference(in_keys, in_values, is_descending); + auto &ref_keys = refs.first; + auto &ref_values = refs.second; + + REQUIRE(ref_keys == out_keys); + REQUIRE(ref_values == out_values); +} + +CUB_TEST("DeviceRadixSort::SortPairs: DoubleBuffer API", "[pairs][radix][sort][device]", value_types) +{ + using key_t = cuda::std::uint32_t; + using value_t = c2h::get<0, TestType>; + + constexpr std::size_t max_num_items = 1 << 18; + const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); + + thrust::device_vector in_keys(num_items); + thrust::device_vector out_keys(num_items); + + thrust::device_vector in_values(num_items); + thrust::device_vector out_values(num_items); + + const int num_key_seeds = 1; + const int num_value_seeds = 1; + c2h::gen(CUB_SEED(num_key_seeds), in_keys); + c2h::gen(CUB_SEED(num_value_seeds), in_values); + + const bool is_descending = GENERATE(false, true); + + cub::DoubleBuffer key_buffer( + thrust::raw_pointer_cast(in_keys.data()), thrust::raw_pointer_cast(out_keys.data())); + cub::DoubleBuffer value_buffer( + thrust::raw_pointer_cast(in_values.data()), thrust::raw_pointer_cast(out_values.data())); + + double_buffer_sort_t action(is_descending); + action.initialize(); + launch(action, + key_buffer, + value_buffer, + num_items, + begin_bit(), + end_bit()); + + key_buffer.selector = action.selector(); + value_buffer.selector = action.selector(); + action.finalize(); + + auto refs = radix_sort_reference(in_keys, in_values, is_descending); + auto &ref_keys = refs.first; + auto &ref_values = refs.second; + + auto& keys = key_buffer.selector == 0 ? in_keys : out_keys; + auto& values = value_buffer.selector == 0 ? in_values : out_values; + + REQUIRE(ref_keys == keys); + REQUIRE(ref_values == values); +} diff --git a/cub/test/catch2_test_helper.h b/cub/test/catch2_test_helper.h index fee77af58ab..b52a038f97d 100644 --- a/cub/test/catch2_test_helper.h +++ b/cub/test/catch2_test_helper.h @@ -38,9 +38,13 @@ #include #include "test_util_vec.h" +#include "c2h/utility.cuh" + #include "catch2_main.cuh" #include "test_warning_suppression.cuh" +#include + #ifndef VAR_IDX #define VAR_IDX 0 #endif @@ -75,7 +79,7 @@ using second = metal::second

; template using iota = metal::iota, metal::number, metal::number>; -} // namespace c2h +} // namespace c2h namespace detail { @@ -105,6 +109,80 @@ namespace detail REQUIRE_THAT(vec_ref, Catch::Approx(vec_out)); \ } +namespace detail +{ + // Returns true if values are equal, or both NaN: + struct equal_or_nans + { + template + bool operator()(const T& a, const T& b) const + { + return (cuda::std::isnan(a) && cuda::std::isnan(b)) || a == b; + } + }; + + struct bitwise_equal + { + template + bool operator()(const T&a, const T&b) const + { + using bits_t = typename cub::Traits::UnsignedBits; + bits_t a_bits = c2h::bit_cast(a); + bits_t b_bits = c2h::bit_cast(b); + return a_bits == b_bits; + } + }; + + // Catch2 Matcher that calls `std::equal` with a default-constructable custom predicate + template + struct CustomEqualsRangeMatcher : Catch::MatcherBase + { + CustomEqualsRangeMatcher(Range const& range) + : range{range} + {} + + bool match(Range const& other) const override + { + using std::begin; + using std::end; + + return std::equal(begin(range), end(range), begin(other), Pred{}); + } + + std::string describe() const override + { + return "Equals: " + Catch::rangeToString(range); + } + + private: + Range const& range; + }; + + template + auto NaNEqualsRange(const Range& range) -> CustomEqualsRangeMatcher + { + return CustomEqualsRangeMatcher(range); + } + + template + auto BitwiseEqualsRange(const Range& range) -> CustomEqualsRangeMatcher + { + return CustomEqualsRangeMatcher(range); + } +} // namespace detail + +#define REQUIRE_EQ_WITH_NAN_MATCHING(ref, out) { \ + auto vec_ref = detail::to_vec(ref); \ + auto vec_out = detail::to_vec(out); \ + REQUIRE_THAT(vec_ref, detail::NaNEqualsRange(vec_out)); \ +} + +#define REQUIRE_BITWISE_EQ(ref, out) { \ + auto vec_ref = detail::to_vec(ref); \ + auto vec_out = detail::to_vec(out); \ + REQUIRE_THAT(vec_ref, detail::NaNEqualsRange(vec_out)); \ +} + #include #include diff --git a/cub/test/test_device_radix_sort.cu b/cub/test/test_device_radix_sort.cu index 800fafa6fc2..05437e8ee88 100644 --- a/cub/test/test_device_radix_sort.cu +++ b/cub/test/test_device_radix_sort.cu @@ -1181,7 +1181,7 @@ void Test( int compare = 0; - // If in/out API is used, we are not allowed to overwrite the input. + // If in/out API is used, we are not allowed to overwrite the input. // Let's check that the input buffer is not overwritten by the algorithm. CUB_IF_CONSTEXPR(BACKEND == CUB_NO_OVERWRITE) { @@ -1192,7 +1192,7 @@ void Test( // For small input sizes, temporary storage is not large enough to fit keys. compare = CompareDeviceResults(h_keys, d_input_keys, num_items, true, g_verbose); } - else + else { // If overwrite is not allowed, temporary storage is large enough to fit keys. KeyT* temp_keys = reinterpret_cast(d_temp_storage); @@ -1202,7 +1202,7 @@ void Test( } } - // After the previous check is done, we can safely reuse alternative buffer to store + // After the previous check is done, we can safely reuse alternative buffer to store // the reference results and compare current output. compare |= compare_device_arrays(h_reference_keys, reinterpret_cast(d_keys.Alternate()), @@ -1649,8 +1649,8 @@ void TestSegments( } } -/** - * Test different NumItemsT, i.e. types of num_items +/** + * Test different NumItemsT, i.e. types of num_items */ template void TestNumItems(KeyT *h_keys, std::size_t num_items, int max_segments, bool pre_sorted) @@ -1875,10 +1875,10 @@ void TestUnspecifiedRanges() for (std::size_t sid = 0; sid < max_segments; sid++) { - const int segment_size = + const int segment_size = static_cast(RandomValue(avg_segment_size)); - const bool segment_is_utilized = segment_size > 0 + const bool segment_is_utilized = segment_size > 0 && RandomValue(100) > 60; if (segment_is_utilized) @@ -1926,20 +1926,20 @@ void TestUnspecifiedRanges() { cub::DoubleBuffer keys_buffer( - thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data())); cub::DoubleBuffer values_buffer( - thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data())); std::size_t temp_storage_bytes{}; std::uint8_t *d_temp_storage{nullptr}; CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, - keys_buffer, values_buffer, - num_items, num_segments, + d_temp_storage, temp_storage_bytes, + keys_buffer, values_buffer, + num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); @@ -1948,9 +1948,9 @@ void TestUnspecifiedRanges() d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, - keys_buffer, values_buffer, - num_items, num_segments, + d_temp_storage, temp_storage_bytes, + keys_buffer, values_buffer, + num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); @@ -1967,7 +1967,7 @@ void TestUnspecifiedRanges() keys.begin() + segment_end, result_keys.begin() + segment_begin); } - + if (values_buffer.selector == 0) { thrust::copy( @@ -1978,7 +1978,7 @@ void TestUnspecifiedRanges() } } - AssertEquals(result_keys, expected_keys); + AssertEquals(result_keys, expected_keys); AssertEquals(result_values, expected_values); thrust::sequence(keys.rbegin(), keys.rend()); @@ -1992,12 +1992,12 @@ void TestUnspecifiedRanges() std::uint8_t *d_temp_storage{}; CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(result_keys.data()), - thrust::raw_pointer_cast(values.data()), - thrust::raw_pointer_cast(result_values.data()), - num_items, num_segments, + d_temp_storage, temp_storage_bytes, + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(result_keys.data()), + thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(result_values.data()), + num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); @@ -2006,12 +2006,12 @@ void TestUnspecifiedRanges() d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(result_keys.data()), - thrust::raw_pointer_cast(values.data()), - thrust::raw_pointer_cast(result_values.data()), - num_items, num_segments, + d_temp_storage, temp_storage_bytes, + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(result_keys.data()), + thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(result_values.data()), + num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); @@ -2024,12 +2024,12 @@ void TestUnspecifiedRanges() #endif #if TEST_KEY_BYTES == 4 -// Following tests check that new decomposer API doesn't break old API. +// Following tests check that new decomposer API doesn't break old API. // It's disabled because some compilers don't like implicit conversions, which // is required for the test. Once we figure out how to temporarily enable conversion, we can // re-enable the test. #define ENABLING_CONVERSION_IS_FIGURED_OUT 0 -#if ENABLING_CONVERSION_IS_FIGURED_OUT +#if ENABLING_CONVERSION_IS_FIGURED_OUT struct bit_selector { int bit; @@ -2197,7 +2197,7 @@ void device_radix_sort_allows_implicit_conversions_for_bits() device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_lli); device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_bs); } -#endif // ENABLING_CONVERSION_IS_FIGURED_OUT +#endif // ENABLING_CONVERSION_IS_FIGURED_OUT #endif // TEST_KEY_BYTES == 4 //--------------------------------------------------------------------- @@ -2288,7 +2288,7 @@ int main(int argc, char** argv) TestUnspecifiedRanges(); #endif -#if ENABLING_CONVERSION_IS_FIGURED_OUT +#if ENABLING_CONVERSION_IS_FIGURED_OUT device_radix_sort_allows_implicit_conversions_for_bits(); #endif @@ -2308,7 +2308,7 @@ int main(int argc, char** argv) #elif TEST_KEY_BYTES == 16 -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED TestGen<__int128_t, false>(num_items, num_segments); TestGen<__uint128_t, false>(num_items, num_segments); #else diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 4930dbd074f..6f3a9a43aa0 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -70,6 +70,7 @@ * Types `T` and `U` must be the same size. */ template +__host__ __device__ T SafeBitCast(const U& in) { static_assert(sizeof(T) == sizeof(U), "Types must be same size.");