From 8b9f0b0b826c7a2bbcf8a712ffa23402c39b25bd Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 1 Nov 2022 23:50:40 +0400 Subject: [PATCH] Support reduction for more than 2^31 items --- cub/device/device_reduce.cuh | 67 +++++++++++++++++++++++------------- test/test_device_reduce.cu | 46 +++++++++++++++++++++---- 2 files changed, 84 insertions(+), 29 deletions(-) diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 3774fdbb7c..32e0cd3e26 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -158,6 +159,8 @@ struct DeviceReduce * **[inferred]** Data element type that is convertible to the `value` type * of `InputIteratorT` * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -188,18 +191,19 @@ struct DeviceReduce template + typename T, + typename NumItemsT> CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, ReductionOpT reduction_op, T init, cudaStream_t stream = 0) { // Signed integer type for global offsets - using OffsetT = int; + using OffsetT = typename detail::ChooseOffsetT::Type; return DispatchReduce(num_items), reduction_op, init, stream); @@ -303,6 +307,8 @@ struct DeviceReduce * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -324,16 +330,18 @@ struct DeviceReduce * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template + template CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, - cudaStream_t stream = 0) + NumItemsT num_items, + cudaStream_t stream = 0) { // Signed integer type for global offsets - using OffsetT = int; + using OffsetT = typename detail::ChooseOffsetT::Type; // The output value type using OutputT = @@ -350,7 +358,7 @@ struct DeviceReduce temp_storage_bytes, d_in, d_out, - num_items, + static_cast(num_items), cub::Sum(), InitT{}, // zero-initialize stream); @@ -429,6 +437,8 @@ struct DeviceReduce * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -450,16 +460,18 @@ struct DeviceReduce * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template + template CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets - using OffsetT = int; + using OffsetT = typename detail::ChooseOffsetT::Type; // The input value type using InputT = cub::detail::value_t; @@ -474,7 +486,7 @@ struct DeviceReduce temp_storage_bytes, d_in, d_out, - num_items, + static_cast(num_items), cub::Min(), // replace with // std::numeric_limits::max() when @@ -583,7 +595,8 @@ struct DeviceReduce * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template + template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -707,6 +720,8 @@ struct DeviceReduce * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -728,16 +743,18 @@ struct DeviceReduce * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template + template CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets - using OffsetT = int; + using OffsetT = typename detail::ChooseOffsetT::Type; // The input value type using InputT = cub::detail::value_t; @@ -752,7 +769,7 @@ struct DeviceReduce temp_storage_bytes, d_in, d_out, - num_items, + static_cast(num_items), cub::Max(), // replace with // std::numeric_limits::lowest() @@ -863,7 +880,8 @@ struct DeviceReduce * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template + template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -1054,6 +1072,8 @@ struct DeviceReduce * **[inferred]*8 Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -1095,7 +1115,8 @@ struct DeviceReduce typename ValuesInputIteratorT, typename AggregatesOutputIteratorT, typename NumRunsOutputIteratorT, - typename ReductionOpT> + typename ReductionOpT, + typename NumItemsT> CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, @@ -1105,11 +1126,11 @@ struct DeviceReduce AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets - using OffsetT = int; + using OffsetT = typename detail::ChooseOffsetT::Type; // FlagT iterator type (not used) @@ -1134,7 +1155,7 @@ struct DeviceReduce d_num_runs_out, EqualityOp(), reduction_op, - num_items, + static_cast(num_items), stream); } diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 1042cf2a0c..b3df906d12 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -33,25 +33,25 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include #include #include #include #include #include +#include +#include #include +#include +#include #include -#include - -#include "test_util.h" - #include #include #include +#include "test_util.h" +#include using namespace cub; @@ -1333,6 +1333,39 @@ __global__ void InitializeTestAccumulatorTypes(int num_items, } } +template +void TestBigIndicesHelper(int magnitude) +{ + const std::size_t num_items = 1ll << magnitude; + thrust::constant_iterator const_iter(T{1}); + thrust::device_vector out(1); + std::size_t* d_out = thrust::raw_pointer_cast(out.data()); + + std::uint8_t *d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + CubDebugExit( + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items)); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + CubDebugExit( + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items)); + std::size_t result = out[0]; + + AssertEquals(result, num_items); +} + +template +void TestBigIndices() +{ + TestBigIndicesHelper(30); + TestBigIndicesHelper(31); + TestBigIndicesHelper(32); + TestBigIndicesHelper(33); +} + void TestAccumulatorTypes() { const int num_items = 2 * 1024 * 1024; @@ -1491,6 +1524,7 @@ int main(int argc, char** argv) TestType(max_items, max_segments); TestAccumulatorTypes(); + TestBigIndices(); #endif printf("\n"); return 0;