From e41e85a61c25722eb18d8f8708af257e1bf8b267 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 10 Jun 2024 14:17:25 -0700 Subject: [PATCH] Improve warp_scan api test --- cub/cub/warp/warp_scan.cuh | 43 +++------ cub/test/catch2_test_warp_scan_api.cu | 130 ++++++++++++++------------ 2 files changed, 84 insertions(+), 89 deletions(-) diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 3e46bdc3043..1566911180d 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -501,10 +501,11 @@ public: //! :start-after: example-begin inclusive-warp-scan-init-value //! :end-before: example-end inclusive-warp-scan-init-value //! - //! Suppose the set of input ``thread_data`` across the block of threads is - //! ``{0, 1, 2, 3, 4, ..., 127}``. The corresponding output ``thread_data`` in the first - //! warp would be ``{1, 2, 4, ..., 497}``, the output for the second warp would be - //! ``{33, 66, 100, ..., 1024}``, etc. + //! Suppose the set of input ``thread_data`` in the first warp is + //! ``{0, 1, 2, 3, ..., 31}``, in the second warp is ``{1, 2, 3, 4, ..., 32}`` etc. + //! The corresponding output ``thread_data`` for a max operation in the first + //! warp would be ``{3, 3, 3, 3, ..., 31}``, the output for the second warp would be + //! ``{3, 3, 3, 4, ..., 32}``, etc. //! @endrst //! //! @tparam ScanOp @@ -607,32 +608,18 @@ public: //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! - //! .. code-block:: c++ - //! - //! #include - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize WarpScan for type int - //! typedef cub::WarpScan WarpScan; - //! - //! // Allocate WarpScan shared memory for 4 warps - //! __shared__ typename WarpScan::TempStorage temp_storage[4]; - //! - //! // Obtain one input item per thread - //! int thread_data = ... - //! - //! // Compute inclusive warp-wide prefix max scans - //! int warp_aggregate; - //! int warp_id = threadIdx.x / 32; - //! WarpScan(temp_storage[warp_id]).InclusiveScan( - //! thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); + //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-warp-scan-init-value-aggregate + //! :end-before: example-end inclusive-warp-scan-init-value-aggregate //! //! Suppose the set of input ``thread_data`` across the block of threads is - //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first - //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be - //! ``32, 32, 34, 34, ..., 62, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned - //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc. + //! ``{1, 1, 1, 1, ..., 1}``. For initial value equal to 3, the corresponding output + //! ``thread_data`` for a sum operation in the first warp would be + //! ``{4, 5, 6, 7, ..., 35}``, the output for the second warp would be + //! ``{4, 5, 6, 7, ..., 35}``, etc. Furthermore, ``warp_aggregate`` would be assigned + //! ``32`` for threads in each warp. //! @endrst //! //! @tparam ScanOp diff --git a/cub/test/catch2_test_warp_scan_api.cu b/cub/test/catch2_test_warp_scan_api.cu index acca315305f..8e41eb8f0cb 100644 --- a/cub/test/catch2_test_warp_scan_api.cu +++ b/cub/test/catch2_test_warp_scan_api.cu @@ -31,12 +31,32 @@ #include #include -#include - #include "catch2_test_helper.h" +#include "cuda/std/__algorithm/fill.h" +#include "cuda/std/__algorithm/max.h" +#include "cuda/std/__numeric/inclusive_scan.h" +#include "cuda/std/__numeric/iota.h" constexpr int num_warps = 4; +template +struct max_op +{ + __host__ __device__ T const& operator()(T const& i, T const& j) + { + return cuda::std::max(i, j); + } +}; + +template +struct sum_op +{ + __host__ __device__ T operator()(T const& i, T const& j) + { + return i + j; + } +}; + // example-begin inclusive-warp-scan-init-value __global__ void InclusiveScanKernel(int* output) { @@ -45,26 +65,26 @@ __global__ void InclusiveScanKernel(int* output) // Allocate WarpScan shared memory for 4 warps __shared__ typename warp_scan_t::TempStorage temp_storage[num_warps]; - int initial_value = 1; - int thread_data = threadIdx.x; + int warp_id = threadIdx.x / 32; + int initial_value = 3; + int thread_data = threadIdx.x % 32 + warp_id; - // warp #0 input: { 0, 1, 2, 3, 4, ..., 31} - // warp #1 input: {32, 33, 34, 35, 36, ..., 63} - // warp #2 input: {64, 65, 66, 67, 68, ..., 95} - // warp #4 input: {96, 97, 98, 99, 100, ..., 127} + // warp #0 input: {0, 1, 2, 3, ..., 31} + // warp #1 input: {1, 2, 3, 4, ..., 32} + // warp #2 input: {2, 3, 4, 5, ..., 33} + // warp #4 input: {3, 4, 5, 6, ..., 34} // Collectively compute the block-wide inclusive prefix max scan - int warp_id = threadIdx.x / 32; - warp_scan_t(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, initial_value, cub::Sum()); + warp_scan_t(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, initial_value, cub::Max()); - // initial value = 1 (for each warp) - // warp #0 output: { 1, 2, 4, ..., 497} - // warp #1 output: {33, 66, 100, ..., 1521} - // warp #2 output: {65, 130, 196, ..., 2545} - // warp #3 output: {97, 194, 292, ..., 3569} + // initial value = 3 (for each warp) + // warp #0 output: {3, 3, 3, 3, ..., 31} + // warp #1 output: {3, 3, 3, 4, ..., 32} + // warp #2 output: {3, 3, 4, 5, ..., 33} + // warp #3 output: {3, 4, 5, 6, ..., 34} + output[threadIdx.x] = thread_data; // example-end inclusive-warp-scan-init-value - output[threadIdx.x] = thread_data; } CUB_TEST("Block array-based inclusive scan works with initial value", "[scan][block]") @@ -76,24 +96,21 @@ CUB_TEST("Block array-based inclusive scan works with initial value", "[scan][bl REQUIRE(cudaSuccess == cudaDeviceSynchronize()); c2h::host_vector expected(d_out.size()); - expected[0] = 1; // Initial value - // Calculate the prefix sum with an additional +1 every 32 elements - for (int i = 1; i < num_warps * 32; ++i) + for (int i = 0; i < num_warps; ++i) { - if (i % 32 == 0) - { - expected[i] = i + 1; // Reset at the start of each warp - } - else - { - expected[i] = expected[i - 1] + i; - } + auto start = expected.begin() + i * 32; + auto end = start + 32; + + cuda::std::iota(start, end, i); // initialize host input for every warp + + cuda::std::inclusive_scan(start, end, start, max_op{}, 3); } REQUIRE(expected == d_out); } +// example-begin inclusive-warp-scan-init-value-aggregate __global__ void InclusiveScanKernelAggr(int* output, int* d_warp_aggregate) { // Specialize WarpScan for type int @@ -101,28 +118,27 @@ __global__ void InclusiveScanKernelAggr(int* output, int* d_warp_aggregate) // Allocate WarpScan shared memory for 4 warps __shared__ typename warp_scan_t::TempStorage temp_storage[num_warps]; - int initial_value = 1; - int thread_data = threadIdx.x; + int warp_id = threadIdx.x / 32; + int initial_value = 3; // for each warp + int thread_data = 1; + int warp_aggregate; - // warp #0 input: { 0, 1, 2, 3, 4, ..., 31} - // warp #1 input: {32, 33, 34, 35, 36, ..., 63} - // warp #2 input: {64, 65, 66, 67, 68, ..., 95} - // warp #4 input: {96, 97, 98, 99, 100, ..., 127} + // warp #0 input: {1, 1, 1, 1, ..., 1} + // warp #1 input: {1, 1, 1, 1, ..., 1} + // warp #2 input: {1, 1, 1, 1, ..., 1} + // warp #4 input: {1, 1, 1, 1, ..., 1} // Collectively compute the block-wide inclusive prefix max scan - int warp_aggregate; - int warp_id = threadIdx.x / 32; warp_scan_t(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, initial_value, cub::Sum(), warp_aggregate); - // initial value = 1 (for each warp) - // warp #0 output: { 1, 2, 4, ..., 497} - aggregate: 496 - // warp #1 output: {33, 66, 100, ..., 1521} - aggregate: 1520 - // warp #2 output: {65, 130, 196, ..., 2545} - aggregate: 2544 - // warp #3 output: {97, 194, 292, ..., 3569} - aggregate: 3568 + // warp #1 output: {4, 5, 6, 7, ..., 35} - warp aggregate: 32 + // warp #2 output: {4, 5, 6, 7, ..., 35} - warp aggregate: 32 + // warp #0 output: {4, 5, 6, 7, ..., 35} - warp aggregate: 32 + // warp #3 output: {4, 5, 6, 7, ..., 35} - warp aggregate: 32 - // example-end inclusive-warp-scan-init-value - d_warp_aggregate[warp_id] = warp_aggregate; + // example-end inclusive-warp-scan-init-value-aggregate output[threadIdx.x] = thread_data; + d_warp_aggregate[warp_id] = warp_aggregate; } CUB_TEST("Block array-based inclusive scan aggregate works with initial value", "[scan][block]") @@ -137,28 +153,20 @@ CUB_TEST("Block array-based inclusive scan aggregate works with initial value", c2h::host_vector expected(d_out.size()); c2h::host_vector expected_aggr{}; - expected[0] = 1; // Initial value - // Calculate the prefix sum with an additional +1 every 32 elements - for (int i = 1; i < num_warps * 32; ++i) + for (int i = 0; i < num_warps; ++i) { - if (i % 32 == 0) - { - expected[i] = i + 1; // Reset at the start of each warp - } - else - { - expected[i] = expected[i - 1] + i; - } - - // fetch the aggregate at the end of each warp - if (i % 32 == 0) - { - expected_aggr.push_back(expected[i - 1] - 1); // warp aggregate doed not take - // initial value into account - } + auto start = expected.begin() + i * 32; + auto end = start + 32; + int init_val = 3; + + cuda::std::fill(start, end, 1); // initialize host input for every warp + + cuda::std::inclusive_scan(start, end, start, sum_op{}, init_val); + + expected_aggr.push_back(expected[i * 32 + 31] - init_val); // warp aggregate doed not take + // initial value into account } - expected_aggr.push_back(expected.back() - 1); REQUIRE(expected == d_out); REQUIRE(expected_aggr == d_warp_aggregate);