From fd6e83632270596af6385f2323676c43dfe4a436 Mon Sep 17 00:00:00 2001
From: gonidelis
Date: Tue, 27 Aug 2024 18:09:58 -0700
Subject: [PATCH 01/13] cub::DeviceFind::FindIf dummy implementation and test

---
 cub/cub/device/device_find_if.cuh      | 165 +++++++++++++++++++++++++
 cub/test/catch2_test_device_find_if.cu | 123 ++++++++++++++++++
 2 files changed, 288 insertions(+)
 create mode 100644 cub/cub/device/device_find_if.cuh
 create mode 100644 cub/test/catch2_test_device_find_if.cu

diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh
new file mode 100644
index 00000000000..d8f12d36a99
--- /dev/null
+++ b/cub/cub/device/device_find_if.cuh
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+//! @file
+//! cub::DeviceFind provides device-wide, parallel operations for finding the first item that satisfies a given
+//! predicate within a sequence of data items residing within device-accessible memory.
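+//!
+//! A minimal usage sketch of the two-phase CUB API pattern this file follows (illustrative only; `d_in`, `d_out`,
+//! `pred`, and `num_items` are assumed to be provided by the caller):
+//!
+//! @code
+//! void* d_temp_storage      = nullptr;
+//! size_t temp_storage_bytes = 0;
+//! // First call: no work is done, only the required temporary storage size is reported
+//! cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, d_in, d_out, pred, num_items);
+//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+//! // Second call: the actual search; writes the index of the first match (or num_items) to d_out
+//! cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, d_in, d_out, pred, num_items);
+//! @endcode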
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+CUB_NAMESPACE_BEGIN
+
+template <typename IterBegin, typename IterEnd, typename Pred>
+__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result)
+{
+  extern __shared__ int sresult[];
+  sresult[0] = INT_MAX;
+  __syncthreads();
+
+  auto global_index = threadIdx.x + blockIdx.x * blockDim.x;
+  int total_threads = gridDim.x * blockDim.x;
+
+  // traverse the sequence
+  for (auto index = global_index; begin + index < end; index += total_threads)
+  {
+    // Only one thread reads atomically and propagates it to the
+    // the rest threads of the block through shared memory
+    if (threadIdx.x == 0)
+    {
+      sresult[0] = atomicAdd(result, 0);
+    }
+    __syncthreads();
+
+    if (sresult[0] < index)
+    { // @georgii early exit!!!
+      // printf("early exit!!!");
+      return; // this returns the whole block
+    }
+
+    if (pred(*(begin + index)))
+    {
+      atomicMin(result,
+                index); // @georgii atomic min per your request makes sense
+      // printf("%d\n", *result);
+      return;
+    }
+  }
+}
+
+template <typename ValueType, typename OutputIteratorT>
+__global__ void write_final_result_in_output_iterator_already(ValueType* d_temp_storage, OutputIteratorT d_out)
+{
+  *d_out = *d_temp_storage;
+}
+
+template <typename ValueType, typename NumItemsT>
+__global__ void cuda_mem_set_async_dtemp_storage(ValueType* d_temp_storage, NumItemsT num_items)
+{
+  *d_temp_storage = num_items;
+}
+
+struct DeviceFind
+{
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static void FindIf(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    ScanOpT op,
+    NumItemsT num_items,
+    cudaStream_t stream = 0)
+  {
+    int block_threads = 128;
+    // first cub API call
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = sizeof(int);
+      return;
+    }
+    int* int_temp_storage = static_cast<int*>(d_temp_storage);
+
+    // Get device ordinal
+    int device_ordinal;
+    cudaError error = CubDebug(cudaGetDevice(&device_ordinal));
+    if (cudaSuccess != error)
+    {
+      return;
+    }
+
+    // Get SM count
+    int sm_count;
+    error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    if (cudaSuccess != error)
+    {
+      return;
+    }
+
+    int find_if_sm_occupancy;
+    error = CubDebug(
+      cub::MaxSmOccupancy(find_if_sm_occupancy, find_if<InputIteratorT, InputIteratorT, ScanOpT>, block_threads));
+    if (cudaSuccess != error)
+    {
+      return;
+    }
+
+    int findif_device_occupancy = find_if_sm_occupancy * sm_count;
+
+    // Even-share work distribution
+    int max_blocks = findif_device_occupancy;
+
+    // use d_temp_storage as the intermediate device result
+    // to read and write from. Then store the final result in the output iterator.
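+    //
+    // The three launches below implement that plan: (1) seed *int_temp_storage
+    // with num_items as the "not found" sentinel, (2) let find_if atomically
+    // lower it to the smallest matching index, and (3) copy the winner into
+    // d_out. Note that the `stream` parameter is not forwarded to these
+    // launches yet, so all three kernels run on the default stream.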
+    cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items);
+
+    find_if<<<max_blocks, block_threads>>>(d_in, d_in + num_items, op, int_temp_storage);
+
+    write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out);
+
+    return;
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu
new file mode 100644
index 00000000000..2cb16b9d297
--- /dev/null
+++ b/cub/test/catch2_test_device_find_if.cu
@@ -0,0 +1,123 @@
+#include "insert_nested_NVTX_range_guard.h"
+// above header needs to be included first
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "c2h/custom_type.cuh"
+#include "catch2_test_device_reduce.cuh"
+#include "catch2_test_helper.h"
+#include "catch2_test_launch_helper.h"
+#include
+
+// %PARAM% TEST_LAUNCH lid 0:1
+
+// DECLARE_LAUNCH_WRAPPER(cub::DeviceFind::FindIf, device_findif);
+
+// List of types to test
+using custom_t =
+  c2h::custom_type_t;
+
+using full_type_list = c2h::type_list, type_pair>;
+// clang-format on
+
+enum class gen_data_t : int
+{
+  /// Uniform random data generation
+  GEN_TYPE_RANDOM,
+  /// Constant value as input data
+  GEN_TYPE_CONST
+};
+
+template <typename InputIt, typename OutputIt, typename BinaryOp>
+void compute_find_if_reference(InputIt first, InputIt last, OutputIt& result, BinaryOp op)
+{
+  auto pos = thrust::find_if(first, last, op);
+  result   = pos - first;
+}
+
+template <typename T>
+struct equals_2
+{
+  __device__ __host__ bool operator()(T i)
+  {
+    return i == 2;
+  }
+};
+
+CUB_TEST("Device find if works", "[device]", full_type_list)
+{
+  using params   = params_t<TestType>;
+  using input_t  = typename params::item_t;
+  using output_t = typename params::output_t;
+  using offset_t = int32_t;
+
+  constexpr offset_t min_items = 1;
+  constexpr offset_t max_items = 1000000;
+
+  // Generate the input sizes to test for
+  const offset_t num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  // Input data generation to test
+  const gen_data_t data_gen_mode = GENERATE_COPY(gen_data_t::GEN_TYPE_RANDOM, gen_data_t::GEN_TYPE_CONST);
+
+  // Generate input data
+  c2h::device_vector<input_t> in_items(num_items);
+  if (data_gen_mode == gen_data_t::GEN_TYPE_RANDOM)
+  {
+    c2h::gen(CUB_SEED(2), in_items);
+  }
+  else
+  {
+    input_t default_constant{};
+    init_default_constant(default_constant);
+    thrust::fill(c2h::device_policy, in_items.begin(), in_items.end(), default_constant);
+  }
+  auto d_in_it = thrust::raw_pointer_cast(in_items.data());
+
+  SECTION("find if")
+  {
+    using op_t = equals_2<input_t>;
+
+    // Prepare verification data
+    c2h::host_vector<input_t> host_items(in_items);
+    c2h::host_vector<output_t> expected_result(1);
+    compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{});
+
+    void* d_temp_storage = nullptr;
+    size_t temp_storage_bytes{};
+
+    // Run test
+    c2h::device_vector<output_t> out_result(1);
+    auto d_out_it = thrust::raw_pointer_cast(out_result.data());
+
+    cub::DeviceFind::FindIf(
+      d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items);
+
+    thrust::device_vector<char> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceFind::FindIf(
+      d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items);
+
+    // Verify result
+    REQUIRE(expected_result == out_result);
+  }
+}
From afd42cbef6ff7c12686d056eacaaa96ec76e66e6 Mon Sep 17 00:00:00 2001
From: gonidelis
Date: Wed, 4 Sep 2024 21:55:45 -0700
Subject: [PATCH
02/13] Calculate d_temp_storage bytes properly aligned and use grid_size according to tile size --- cub/cub/device/device_find_if.cuh | 39 +++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index d8f12d36a99..f40d048286b 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -34,6 +34,8 @@ #include +#include "cub/util_type.cuh" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -112,14 +114,10 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { - int block_threads = 128; - // first cub API call - if (d_temp_storage == nullptr) - { - temp_storage_bytes = sizeof(int); - return; - } - int* int_temp_storage = static_cast(d_temp_storage); + int block_threads = 128; + int items_per_thread = 2; + int tile_size = block_threads * items_per_thread; + int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Get device ordinal int device_ordinal; @@ -148,13 +146,34 @@ struct DeviceFind int findif_device_occupancy = find_if_sm_occupancy * sm_count; // Even-share work distribution - int max_blocks = findif_device_occupancy; + int max_blocks = findif_device_occupancy; // no * CUB_SUBSCRIPTION_FACTOR(0) because max_blocks gets too big + + int findif_grid_size = CUB_MIN(num_tiles, max_blocks); + + // Temporary storage allocation requirements + void* allocations[1] = {}; + size_t allocation_sizes[1] = {sizeof(int)}; + + // Alias the temporary allocations from the single storage blob (or + // compute the necessary size of the blob) + error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + return; + } + + int* int_temp_storage = static_cast(allocations[0]); // this shouldn't be just int + + if (d_temp_storage == nullptr) + { + return; + } // use d_temp_storage as the intermediate device result // to read and write from. Then store the final result in the output iterator. cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items); - find_if<<>>(d_in, d_in + num_items, op, int_temp_storage); + find_if<<>>(d_in, d_in + num_items, op, int_temp_storage); write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out); From 7610da68ed3f4ef8ac1754dc8226eaafa834fe28 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 4 Sep 2024 21:56:24 -0700 Subject: [PATCH 03/13] Add cub::Device::FindIf nvbench benchmark --- cub/benchmarks/bench/find_if/base.cu | 127 +++++++++++++++++++++++++++ cub/cub/device/device_find_if.cuh | 13 ++- 2 files changed, 132 insertions(+), 8 deletions(-) create mode 100644 cub/benchmarks/bench/find_if/base.cu diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu new file mode 100644 index 00000000000..25312b158aa --- /dev/null +++ b/cub/benchmarks/bench/find_if/base.cu @@ -0,0 +1,127 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include + +#include + +template +struct equals_100 +{ + __device__ bool operator()(T i) + { + return i == 1; + } // @amd you 'll never find out the secret sauce +}; + +template +void find_if(nvbench::state& state, nvbench::type_list) +{ + // set up input + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("CommonPrefixRatio"); + const auto same_elements = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + thrust::device_vector d_result(1); + /// + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + cub::DeviceFind::FindIf( + d_temp_storage, temp_storage_bytes, dinput.begin(), d_result.begin(), equals_100{}, dinput.size(), 0); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { + cub::DeviceFind::FindIf( + d_temp_storage, + temp_storage_bytes, + dinput.begin(), + d_result.begin(), + equals_100{}, + dinput.size(), + launch.get_stream()); + }); +} +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + +////////////////////////////////////////////////////// +template +void thrust_find_if(nvbench::state& state, nvbench::type_list) +{ + // set up input + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("CommonPrefixRatio"); + const auto same_elements = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + /// + + caching_allocator_t alloc; + thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); + }); +} +NVBENCH_BENCH_TYPES(thrust_find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) + 
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + +////////////////////////////////////////////////////// +template +void thrust_count_if(nvbench::state& state, nvbench::type_list) +{ + // set up input + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("CommonPrefixRatio"); + const auto same_elements = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + /// + + caching_allocator_t alloc; + thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); + }); +} +NVBENCH_BENCH_TYPES(thrust_count_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index f40d048286b..f0663b92ab9 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -56,9 +56,7 @@ CUB_NAMESPACE_BEGIN template __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) { - extern __shared__ int sresult[]; - sresult[0] = INT_MAX; - __syncthreads(); + __shared__ int sresult; auto global_index = threadIdx.x + blockIdx.x * blockDim.x; int total_threads = gridDim.x * blockDim.x; @@ -70,14 +68,13 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) // the rest threads of the block through shared memory if (threadIdx.x == 0) { - sresult[0] = atomicAdd(result, 0); + sresult = atomicAdd(result, 0); } __syncthreads(); - if (sresult[0] < index) - { // @georgii early exit!!! 
- // printf("early exit!!!"); - return; // this returns the whole block + if (sresult < index) + { + return; // early exit } if (pred(*(begin + index))) From e683a3dcbbc47e1398b83f6be21a700902009409 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 9 Sep 2024 15:40:47 -0700 Subject: [PATCH 04/13] Introduced tile based loop --- cub/cub/device/device_find_if.cuh | 41 +++++++++++++++++-------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index f0663b92ab9..08d3d549bbe 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -34,7 +34,7 @@ #include -#include "cub/util_type.cuh" +#include "device_launch_parameters.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -54,15 +54,13 @@ CUB_NAMESPACE_BEGIN template -__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) +__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) { + int elements_per_thread = 2; + auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; - auto global_index = threadIdx.x + blockIdx.x * blockDim.x; - int total_threads = gridDim.x * blockDim.x; - - // traverse the sequence - for (auto index = global_index; begin + index < end; index += total_threads) + for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) { // Only one thread reads atomically and propagates it to the // the rest threads of the block through shared memory @@ -72,17 +70,24 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) } __syncthreads(); - if (sresult < index) + for (int i = 0; i < elements_per_thread; ++i) { - return; // early exit - } - - if (pred(*(begin + index))) - { - atomicMin(result, - index); // @georgii atomic min per your request makes sense - // printf("%d\n", *result); - return; + auto index = tile_offset + threadIdx.x + i * blockDim.x; + + if (index < num_items) + { + // early exit + if (sresult < index) + { + return; + } + + if (pred(*(begin + index))) + { + atomicMin(result, index); + return; + } + } } } } @@ -170,7 +175,7 @@ struct DeviceFind // to read and write from. Then store the final result in the output iterator. cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items); - find_if<<>>(d_in, d_in + num_items, op, int_temp_storage); + find_if<<>>(d_in, d_in + num_items, op, int_temp_storage, num_items); write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out); From b7a6db50bcd4a276d9d080419f26daf4fbae2232 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 11 Sep 2024 08:56:48 -0700 Subject: [PATCH 05/13] Use multiple elements per thread --- cub/cub/device/device_find_if.cuh | 17 +++++++------- cub/test/catch2_test_device_find_if.cu | 31 ++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 08d3d549bbe..f6e1ce95cd3 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -1,6 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,13 +50,15 @@ #include #include +#define elements_per_thread 16 + CUB_NAMESPACE_BEGIN template __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) { - int elements_per_thread = 2; - auto tile_size = blockDim.x * elements_per_thread; + // int elements_per_thread = 32; + auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) @@ -116,10 +117,10 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { - int block_threads = 128; - int items_per_thread = 2; - int tile_size = block_threads * items_per_thread; - int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); + int block_threads = 128; + // int items_per_thread = 2; + int tile_size = block_threads * elements_per_thread; + int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Get device ordinal int device_ordinal; diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu index 2cb16b9d297..b92030c9eaa 100644 --- a/cub/test/catch2_test_device_find_if.cu +++ b/cub/test/catch2_test_device_find_if.cu @@ -1,3 +1,30 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first @@ -9,10 +36,6 @@ #include #include -#include -#include -#include - #include "c2h/custom_type.cuh" #include "catch2_test_device_reduce.cuh" #include "catch2_test_helper.h" From f69e68846947ceead2bb0ebf793cbca92c15af21 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 17 Sep 2024 10:41:40 -0700 Subject: [PATCH 06/13] Optimized solution --- cub/cub/device/device_find_if.cuh | 32 ++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index f6e1ce95cd3..a645680014d 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -60,6 +60,12 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st // int elements_per_thread = 32; auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; + __shared__ int block_result; + + if (threadIdx.x == 0) + { + block_result = num_items; + } for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) { @@ -71,22 +77,34 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st } __syncthreads(); + // early exit + if (sresult < tile_offset) + { + return; + } + + bool found = false; for (int i = 0; i < elements_per_thread; ++i) { auto index = tile_offset + threadIdx.x + i * blockDim.x; if (index < num_items) { - // early exit - if (sresult < index) + if (pred(*(begin + index))) { - return; + found = true; + atomicMin(&block_result, index); + break; } - - if (pred(*(begin + index))) + } + } + if (syncthreads_or(found)) + { + if (threadIdx.x == 0) + { + if (block_result < num_items) { - atomicMin(result, index); - return; + atomicMin(result, block_result); } } } From 81c676eadf8c821d2432f1198947321803958125 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 19 Sep 2024 19:27:13 -0700 Subject: [PATCH 07/13] Add vectorized loads --- cub/cub/device/device_find_if.cuh | 38 +++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index a645680014d..365e03cd593 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -33,6 +33,7 @@ #include +#include "cub/util_type.cuh" #include "device_launch_parameters.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ -50,13 +51,22 @@ #include #include -#define elements_per_thread 16 +static constexpr int elements_per_thread = 16; +static constexpr int _VECTOR_LOAD_LENGTH = 4; +static constexpr int block_threads = 128; CUB_NAMESPACE_BEGIN template __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) { + using InputT = cub::detail::value_t; + using VectorT = typename CubVector::Type; + enum + { + WORDS = elements_per_thread / _VECTOR_LOAD_LENGTH + }; + // int elements_per_thread = 32; auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; @@ -83,14 +93,35 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st return; } + //// vectorized loads begin + InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH); + + cub::CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + InputT input_items[elements_per_thread]; + 
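+  // Alias the per-thread staging array as VectorT: each iteration of the
+  // unrolled loop below then issues one vector-wide load instead of
+  // _VECTOR_LOAD_LENGTH scalar loads.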
VectorT* vec_items = reinterpret_cast<VectorT*>(input_items);
+
+#pragma unroll
+  for (int i = 0; i < WORDS; ++i)
+  {
+    vec_items[i] = d_vec_in[block_threads * i];
+  }
+  //// vectorized loads end
+  // what about when input is not divisible by _VECTOR_LOAD_LENGTH? --> case for not full tile to be added
+
   bool found = false;
   for (int i = 0; i < elements_per_thread; ++i)
   {
-    auto index = tile_offset + threadIdx.x + i * blockDim.x;
+    int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset;
+    // i % WORDS =                           + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static)
+    // (i / WORDS) * block_threads * WORDS = + 0      , 64     , 128, ...     (static)
+    // threadIdx.x * WORDS =                 + 0, 4, 8, ... offset of the thread within working tile
+    // tile_offset =                         + just start at the beginning of the block
 
     if (index < num_items)
     {
-      if (pred(*(begin + index)))
+      if (pred(input_items[i]))
      {
        found = true;
        atomicMin(&block_result, index);
@@ -135,7 +166,6 @@ struct DeviceFind
     NumItemsT num_items,
     cudaStream_t stream = 0)
   {
-    int block_threads = 128;
     // int items_per_thread = 2;
     int tile_size = block_threads * elements_per_thread;
     int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

From b8b86dd08c8ec66f69ae6d56d1538bc3c891a234 Mon Sep 17 00:00:00 2001
From: gonidelis
Date: Fri, 20 Sep 2024 17:13:25 -0700
Subject: [PATCH 08/13] Separate benchmarks into different files

---
 cub/benchmarks/bench/find_if/base.cu      | 87 +++++----------------
 cub/cub/device/device_find_if.cuh         |  5 ++
 thrust/benchmarks/bench/count_if/basic.cu | 71 ++++++++++++++++++
 thrust/benchmarks/bench/find_if/basic.cu  | 71 ++++++++++++++++++
 4 files changed, 173 insertions(+), 61 deletions(-)
 create mode 100644 thrust/benchmarks/bench/count_if/basic.cu
 create mode 100644 thrust/benchmarks/bench/find_if/basic.cu

diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu
index 25312b158aa..8183a04a145 100644
--- a/cub/benchmarks/bench/find_if/base.cu
+++ b/cub/benchmarks/bench/find_if/base.cu
@@ -33,24 +33,30 @@
 #include

 template <typename T>
-struct equals_100
+struct equals
 {
-  __device__ bool operator()(T i)
+  T val;
+  equals(T _val)
+      : val(_val)
+  {}
+
+  __device__ __host__ bool operator()(T i)
   {
-    return i == 1;
-  } // @amd you 'll never find out the secret sauce
+    return i == val;
+  }
 };

 template <typename T>
 void find_if(nvbench::state& state, nvbench::type_list<T>)
 {
+  T val = 1;
   // set up input
-  const auto elements      = state.get_int64("Elements");
-  const auto common_prefix = state.get_float64("CommonPrefixRatio");
-  const auto same_elements = elements * common_prefix;
+  const auto elements       = state.get_int64("Elements");
+  const auto common_prefix  = state.get_float64("MismatchAt");
+  const auto mismatch_point = elements * common_prefix;

   thrust::device_vector<T> dinput(elements, 0);
-  thrust::fill(dinput.begin() + same_elements, dinput.end(), 1);
+  thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val);
   thrust::device_vector<int> d_result(1);
   ///

   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes{};

   cub::DeviceFind::FindIf(
-    d_temp_storage, temp_storage_bytes, dinput.begin(), d_result.begin(), equals_100<T>{}, dinput.size(), 0);
+    d_temp_storage,
+    temp_storage_bytes,
+    thrust::raw_pointer_cast(dinput.data()),
+    thrust::raw_pointer_cast(d_result.data()),
+    equals<T>{val},
+    dinput.size(),
+    0);

   thrust::device_vector<char> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

@@ -67,61 +79,14 @@ void find_if(nvbench::state& state,
nvbench::type_list) cub::DeviceFind::FindIf( d_temp_storage, temp_storage_bytes, - dinput.begin(), - d_result.begin(), - equals_100{}, + thrust::raw_pointer_cast(dinput.data()), + thrust::raw_pointer_cast(d_result.data()), + equals{val}, dinput.size(), launch.get_stream()); }); } -NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) - .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); - -////////////////////////////////////////////////////// -template -void thrust_find_if(nvbench::state& state, nvbench::type_list) -{ - // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; - - thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); - /// - - caching_allocator_t alloc; - thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); - - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); - }); -} -NVBENCH_BENCH_TYPES(thrust_find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) - .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); - -////////////////////////////////////////////////////// -template -void thrust_count_if(nvbench::state& state, nvbench::type_list) -{ - // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; - thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); - /// - - caching_allocator_t alloc; - thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); - - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); - }); -} -NVBENCH_BENCH_TYPES(thrust_count_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(fundamental_types)) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 365e03cd593..0adf8aeedca 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -51,6 +51,8 @@ #include #include +#include + static constexpr int elements_per_thread = 16; static constexpr int _VECTOR_LOAD_LENGTH = 4; static constexpr int block_threads = 128; @@ -129,6 +131,7 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st } } } + if (syncthreads_or(found)) { if (threadIdx.x == 0) @@ -166,6 +169,8 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { + static_assert(elements_per_thread % _VECTOR_LOAD_LENGTH == 0, "No full tile at the end"); + // int items_per_thread = 2; int tile_size = block_threads * elements_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); diff --git a/thrust/benchmarks/bench/count_if/basic.cu 
b/thrust/benchmarks/bench/count_if/basic.cu new file mode 100644 index 00000000000..b672d789868 --- /dev/null +++ b/thrust/benchmarks/bench/count_if/basic.cu @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include "nvbench_helper.cuh" + +template +struct equals +{ + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) + { + return i == val; + } +}; + +template +void count_if(nvbench::state& state, nvbench::type_list) +{ + T val = 1; + // set up input + const auto elements = static_cast(state.get_int64("Elements")); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); + /// + + caching_allocator_t alloc; + thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals{val}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals{val}); + }); +} + +NVBENCH_BENCH_TYPES(count_if, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("thrust::count_if") + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); diff --git a/thrust/benchmarks/bench/find_if/basic.cu b/thrust/benchmarks/bench/find_if/basic.cu new file mode 100644 index 00000000000..8a2618e26d5 --- /dev/null +++ b/thrust/benchmarks/bench/find_if/basic.cu @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include "nvbench_helper.cuh" + +template +struct equals +{ + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) + { + return i == val; + } +}; + +template +void find_if(nvbench::state& state, nvbench::type_list) +{ + T val = 1; + // set up input + const auto elements = static_cast(state.get_int64("Elements")); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); + /// + + caching_allocator_t alloc; + thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals{val}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals{val}); + }); +} + +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("thrust::find_if") + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); From f93ff9b21a1a5aec28c4c889537261a1847cae0d Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 1 Oct 2024 13:09:24 -0700 Subject: [PATCH 09/13] Add split implementation that works with both vectorized and non vectorized loads --- cub/cub/device/device_find_if.cuh | 199 +++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 55 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 0adf8aeedca..706201a274a 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -34,6 +34,9 @@ #include #include "cub/util_type.cuh" +#include "cuda/std/__cccl/dialect.h" +#include "cuda/std/__memory/pointer_traits.h" +#include "cuda/std/__utility/declval.h" #include "device_launch_parameters.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ 
-59,89 +62,178 @@ static constexpr int block_threads = 128; CUB_NAMESPACE_BEGIN -template -__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) +template +static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile( + Iterator d_in, int tile_offset, int tile_size, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/) +{ + /// Create an AgentFindIf and extract these two as type member in the encapsulating struct + using InputT = cub::detail::value_t; + using VectorT = typename CubVector::Type; + /// + bool full_tile = (tile_offset + tile_size) <= num_items; + bool is_aligned = (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + return full_tile && is_aligned; +} + +template +static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile( + Iterator /*d_in*/, + int /*tile_offset*/, + int /*tile_size*/, + std::size_t /*num_items*/, + Int2Type /*CAN_VECTORIZE*/) { + return false; +} + +template +__device__ void ConsumeRange( + IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/) +{ + __shared__ int block_result; + + if (threadIdx.x == 0) + { + block_result = num_items; + } + using InputT = cub::detail::value_t; using VectorT = typename CubVector::Type; + enum { WORDS = elements_per_thread / _VECTOR_LOAD_LENGTH }; + //// vectorized loads begin + InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH); - // int elements_per_thread = 32; - auto tile_size = blockDim.x * elements_per_thread; - __shared__ int sresult; - __shared__ int block_result; + cub::CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); - if (threadIdx.x == 0) + InputT input_items[elements_per_thread]; + VectorT* vec_items = reinterpret_cast(input_items); + +#pragma unroll + for (int i = 0; i < WORDS; ++i) { - block_result = num_items; + vec_items[i] = d_vec_in[block_threads * i]; } + //// vectorized loads end - for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) + bool found = false; + for (int i = 0; i < elements_per_thread; ++i) { - // Only one thread reads atomically and propagates it to the - // the rest threads of the block through shared memory - if (threadIdx.x == 0) + int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset; + // i % WORDS = + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static) + // (i / WORDS) * block_threads * WORDS = + 0 , 64 , 128, ... (static) + // threadIdx.x * WORDS = + 0, 4, 8, ... offset of the thread within working tile + // tile_offset = + just start at the beginning of the block + + if (index < num_items) { - sresult = atomicAdd(result, 0); + if (pred(input_items[i])) + { + found = true; + atomicMin(&block_result, index); + break; // every thread goes over multiple elements per thread + // for every tile. If a thread finds a local minimum it doesn't + // need to proceed further (inner early exit). 
+      }
     }
-    __syncthreads();
+  }

-    // early exit
-    if (sresult < tile_offset)
+  if (syncthreads_or(found))
+  {
+    if (threadIdx.x == 0)
     {
-      return;
+      if (block_result < num_items)
+      {
+        atomicMin(result, block_result);
+      }
     }
+  }
+}

-    //// vectorized loads begin
-    InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH);
+template <typename IterBegin, typename Pred>
+__device__ void ConsumeRange(
+  IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type<false> /*CAN_VECTORIZE*/)
+{
+  __shared__ int block_result;

-    cub::CacheModifiedInputIterator d_vec_in(
-      reinterpret_cast(d_in_unqualified));
+  if (threadIdx.x == 0)
+  {
+    block_result = num_items;
+  }

-    InputT input_items[elements_per_thread];
-    VectorT* vec_items = reinterpret_cast(input_items);
+  bool found = false;
+  for (int i = 0; i < elements_per_thread; ++i)
+  {
+    auto index = tile_offset + threadIdx.x + i * blockDim.x;

-#pragma unroll
-    for (int i = 0; i < WORDS; ++i)
+    if (index < num_items)
     {
-      vec_items[i] = d_vec_in[block_threads * i];
+      if (pred(*(begin + index)))
+      {
+        found = true;
+        atomicMin(&block_result, index);
+        break;
+      }
     }
-    //// vectorized loads end
-    // what about when input is not divisible by _VECTOR_LOAD_LENGTH? --> case for not full tile to be added
-
-    bool found = false;
-    for (int i = 0; i < elements_per_thread; ++i)
+  }
+  if (syncthreads_or(found))
+  {
+    if (threadIdx.x == 0)
     {
-      int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset;
-      // i % WORDS = + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static)
-      // (i / WORDS) * block_threads * WORDS = + 0 , 64 , 128, ... (static)
-      // threadIdx.x * WORDS = + 0, 4, 8, ... offset of the thread within working tile
-      // tile_offset = + just start at the beginning of the block
-
-      if (index < num_items)
+      if (block_result < num_items)
       {
-        if (pred(input_items[i]))
-        {
-          found = true;
-          atomicMin(&block_result, index);
-          break;
-        }
+        atomicMin(result, block_result);
       }
     }
+  }
+}
+
+template <typename IterBegin, typename IterEnd, typename Pred>
+__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items)
+{
+  using InputT = cub::detail::value_t<IterBegin>;
+
+  // 1. _VECTOR_LOAD_LENGTH > 1: the number of items per vectorized load should have been determined to be more than 1.
+  //    Now it's hardcoded, but it will be determined at compile time according to the GPU architecture (Policy) later
+  //    on.
+  // 2. elements_per_thread % _VECTOR_LOAD_LENGTH == 0: elements_per_thread is also defined at compile time after tuning
+  //    for specific architectures. There is no point in doing vectorization if a thread cannot vector-load all the
+  //    elements it is going to be working on in a single tile.
+  // 3. ::cuda::std::is_pointer<IterBegin>::value: the iterator is contiguous. If memory is not contiguous, loading
+  //    vectorized memory makes no sense. Defined at compile time.
+  // 4. Traits<InputT>::PRIMITIVE: InputT cannot be an arbitrary type that could be laid out
+  //    in memory in any way. It needs to be known. Compile time.
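+  //
+  // For instance, an int* input with elements_per_thread == 16 and
+  // _VECTOR_LOAD_LENGTH == 4 satisfies all four conditions, while a
+  // thrust::counting_iterator never does (it is not a raw pointer), so the
+  // kernel falls back to the non-vectorized ConsumeRange overload.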
+ + static constexpr bool ATTEMPT_VECTORIZATION = + (_VECTOR_LOAD_LENGTH > 1) && (elements_per_thread % _VECTOR_LOAD_LENGTH == 0) + && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE; + + auto tile_size = blockDim.x * elements_per_thread; + __shared__ int sresult; + + for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) + { + // Only one thread reads atomically and propagates it to the + // the rest threads of the block through shared memory + if (threadIdx.x == 0) + { + sresult = atomicAdd(result, 0); + } + __syncthreads(); - if (syncthreads_or(found)) + // early exit + if (sresult < tile_offset) { - if (threadIdx.x == 0) - { - if (block_result < num_items) - { - atomicMin(result, block_result); - } - } + return; } + + IsAlignedAndFullTile(begin, tile_offset, tile_size, num_items, Int2Type()) + ? ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < true && ATTEMPT_VECTORIZATION > ()) + : ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < false && ATTEMPT_VECTORIZATION > ()); } } @@ -169,9 +261,6 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { - static_assert(elements_per_thread % _VECTOR_LOAD_LENGTH == 0, "No full tile at the end"); - - // int items_per_thread = 2; int tile_size = block_threads * elements_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); From 50dc8f500eded88b1db0cd71cbc2e7908043b492 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 1 Oct 2024 13:09:41 -0700 Subject: [PATCH 10/13] Add elaborate unit testing --- cub/test/catch2_test_device_find_if.cu | 129 +++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 8 deletions(-) diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu index b92030c9eaa..2f7a3175049 100644 --- a/cub/test/catch2_test_device_find_if.cu +++ b/cub/test/catch2_test_device_find_if.cu @@ -36,10 +36,14 @@ #include #include +#include + #include "c2h/custom_type.cuh" #include "catch2_test_device_reduce.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" +#include "thrust/detail/raw_pointer_cast.h" +#include #include // %PARAM% TEST_LAUNCH lid 0:1 @@ -72,15 +76,20 @@ void compute_find_if_reference(InputIt first, InputIt last, OutputIt& result, Bi } template -struct equals_2 +struct equals { + T val; + equals(T _val) + : val(_val) + {} + __device__ __host__ bool operator()(T i) { - return i == 2; + return i == val; } }; -CUB_TEST("Device find if works", "[device]", full_type_list) +CUB_TEST("Device find_if works", "[device]", full_type_list) { using params = params_t; using input_t = typename params::item_t; @@ -115,14 +124,15 @@ CUB_TEST("Device find if works", "[device]", full_type_list) } auto d_in_it = thrust::raw_pointer_cast(in_items.data()); - SECTION("find if") + SECTION("Generic find if case") { - using op_t = equals_2; + using op_t = equals; + input_t val_to_find{2}; // Prepare verification data c2h::host_vector host_items(in_items); c2h::host_vector expected_result(1); - compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{}); + compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{val_to_find}); void* d_temp_storage = nullptr; size_t temp_storage_bytes{}; @@ -132,13 +142,116 @@ CUB_TEST("Device find if works", "[device]", full_type_list) auto d_out_it = thrust::raw_pointer_cast(out_result.data()); cub::DeviceFind::FindIf( - d_temp_storage, 
temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items); + d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{val_to_find}, num_items); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceFind::FindIf( + d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{val_to_find}, num_items); + + // Verify result + REQUIRE(expected_result == out_result); + } + + SECTION("find_if works with non raw pointers - .begin() iterator") + { + using op_t = equals; + input_t val_to_find{2}; + + // Prepare verification data + c2h::host_vector host_items(in_items); + c2h::host_vector expected_result(1); + compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{val_to_find}); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + // Run test + c2h::device_vector out_result(1); + + cub::DeviceFind::FindIf( + d_temp_storage, temp_storage_bytes, in_items.begin(), out_result.begin(), op_t{val_to_find}, num_items); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf( - d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items); + d_temp_storage, temp_storage_bytes, in_items.begin(), out_result.begin(), op_t{val_to_find}, num_items); + + // Verify result + REQUIRE(expected_result == out_result); + } + + SECTION("find_if works for unaligned input") + { + for (int offset = 1; offset < 4; ++offset) + { + if (num_items - offset > 0) + { + using op_t = equals; + input_t val_to_find{2}; + + // Prepare verification data + c2h::host_vector host_items(in_items); + c2h::host_vector expected_result(1); + compute_find_if_reference(host_items.begin() + offset, host_items.end(), expected_result[0], op_t{val_to_find}); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + // Run test + c2h::device_vector out_result(1); + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + + cub::DeviceFind::FindIf( + d_temp_storage, + temp_storage_bytes, + unwrap_it(d_in_it + offset), + unwrap_it(d_out_it), + op_t{val_to_find}, + num_items - offset); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceFind::FindIf( + d_temp_storage, + temp_storage_bytes, + unwrap_it(d_in_it + offset), + unwrap_it(d_out_it), + op_t{val_to_find}, + num_items - offset); + + // Verify result + REQUIRE(expected_result == out_result); + } + } + } + + SECTION("find_if works with non primitive iterator") + { + using op_t = equals; + input_t val_to_find{2}; + + // Prepare verification data + auto it = thrust::make_counting_iterator(0); // non-primitive iterator + c2h::host_vector expected_result(1); + compute_find_if_reference(it, it + num_items, expected_result[0], op_t{val_to_find}); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + // Run test + c2h::device_vector out_result(1); + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + + cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, num_items); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, 
num_items); // Verify result REQUIRE(expected_result == out_result); From c508b994008c60e5827154fc98326f2403c2b986 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 29 Oct 2024 13:43:03 -0700 Subject: [PATCH 11/13] Add Dispatch and Agent layers --- cub/benchmarks/bench/find_if/base.cu | 3 - cub/cub/agent/agent_find.cuh | 309 ++++++++++++++++++++ cub/cub/device/device_find_if.cuh | 261 +---------------- cub/cub/device/dispatch/dispatch_find.cuh | 331 ++++++++++++++++++++++ cub/test/catch2_test_device_find_if.cu | 38 +-- 5 files changed, 661 insertions(+), 281 deletions(-) create mode 100644 cub/cub/agent/agent_find.cuh create mode 100644 cub/cub/device/dispatch/dispatch_find.cuh diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu index 8183a04a145..6c51a754b56 100644 --- a/cub/benchmarks/bench/find_if/base.cu +++ b/cub/benchmarks/bench/find_if/base.cu @@ -36,9 +36,6 @@ template struct equals { T val; - equals(T _val) - : val(_val) - {} __device__ __host__ bool operator()(T i) { diff --git a/cub/cub/agent/agent_find.cuh b/cub/cub/agent/agent_find.cuh new file mode 100644 index 00000000000..80b35ea93a0 --- /dev/null +++ b/cub/cub/agent/agent_find.cuh @@ -0,0 +1,309 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * @file cub::AgentFind implements a stateful abstraction of CUDA thread + * blocks for participating in device-wide search. 
+ */
+
+#pragma once
+#include
+
+#include
+#include
+#include
+
+CUB_NAMESPACE_BEGIN
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentFind
+ * @tparam NOMINAL_BLOCK_THREADS_4B Threads per thread block
+ * @tparam NOMINAL_ITEMS_PER_THREAD_4B Items per thread (per tile of input)
+ * @tparam _VECTOR_LOAD_LENGTH Number of items per vectorized load
+ * @tparam _LOAD_MODIFIER Cache load modifier for reading input elements
+ */
+template >
+struct AgentFindPolicy : ScalingType
+{
+  /// Number of items per vectorized load
+  static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH;
+
+  /// Cache load modifier for reading input elements
+  static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
+};
+
+template  // @giannis OutputIteratorT not needed
+struct AgentFind
+{
+  //---------------------------------------------------------------------
+  // Types and constants
+  //---------------------------------------------------------------------
+
+  /// The input value type
+  using InputT = cub::detail::value_t;
+
+  /// Vector type of InputT for data movement
+  using VectorT = typename CubVector::Type;
+
+  /// Input iterator wrapper type (for applying cache modifier)
+  // Wrap the native input pointer with CacheModifiedInputIterator
+  // or directly use the supplied input iterator type
+  using WrappedInputIteratorT =
+    ::cuda::std::_If<::cuda::std::is_pointer::value,
+                     CacheModifiedInputIterator,
+                     InputIteratorT>;
+
+  /// Constants
+  static constexpr int BLOCK_THREADS      = AgentFindPolicy::BLOCK_THREADS;
+  static constexpr int ITEMS_PER_THREAD   = AgentFindPolicy::ITEMS_PER_THREAD;
+  static constexpr int TILE_ITEMS         = BLOCK_THREADS * ITEMS_PER_THREAD;
+  static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentFindPolicy::VECTOR_LOAD_LENGTH);
+
+  // Can vectorize according to the policy if the input iterator is a native
+  // pointer to a primitive type
+  static constexpr bool ATTEMPT_VECTORIZATION =
+    (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0)
+    && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE;
+
+  static constexpr CacheLoadModifier LOAD_MODIFIER = AgentFindPolicy::LOAD_MODIFIER;
+
+  /// Shared memory type required by this thread block
+  using _TempStorage = OffsetT;
+
+  /// Alias wrapper allowing storage to be unioned
+  struct TempStorage : Uninitialized<_TempStorage>
+  {};
+
+  //---------------------------------------------------------------------
+  // Per-thread fields
+  //---------------------------------------------------------------------
+
+  _TempStorage& sresult; ///< Reference to temp_storage
+  InputIteratorT d_in; ///< Input data to find
+  // OutputIteratorT d_out;
+  // OffsetT num_items;
+  // OffsetT* value_temp_storage;
+  // WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to find
+  ScanOpT scan_op; ///< Unary search functor
+
+  //---------------------------------------------------------------------
+  // Utility
+  //---------------------------------------------------------------------
+
+  template
+  static _CCCL_DEVICE _CCCL_FORCEINLINE bool
+  IsAlignedAndFullTile(T* d_in, int tile_offset, int tile_size, OffsetT num_items, Int2Type /*CAN_VECTORIZE*/)
+  {
+    /// Create an AgentFindIf and extract these two as type member in the encapsulating struct
+    using InputT  = T;
+    using VectorT = typename CubVector::Type;
+    ///
+    const bool full_tile  = (tile_offset + tile_size) <= num_items;
+    const bool is_aligned = reinterpret_cast<::cuda::std::uintptr_t>(d_in) % uintptr_t{sizeof(VectorT)} == 0;
+    return full_tile && is_aligned;
+  }
+
+  template
+  static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile(
+    Iterator /*d_in*/,
+    int /*tile_offset*/,
+    int /*tile_size*/,
+    std::size_t /*num_items*/,
+    Int2Type /*CAN_VECTORIZE*/)
+  {
+    return false;
+  }
+
+  //---------------------------------------------------------------------
+  // Constructor
+  //---------------------------------------------------------------------
+
+  /**
+   * @brief Constructor
+   * @param sresult Reference to temp_storage
+   * @param d_in Input data to search
+   * @param scan_op Unary search functor
+   */
+  _CCCL_DEVICE _CCCL_FORCEINLINE AgentFind(TempStorage& sresult, InputIteratorT d_in, ScanOpT scan_op)
+      : sresult(sresult.Alias())
+      , d_in(d_in)
+      , scan_op(scan_op)
+  {}
+
+  //---------------------------------------------------------------------
+  // Tile consumption
+  //---------------------------------------------------------------------
+
+  template
+  __device__ void
+  ConsumeTile(int tile_offset, Pred pred, OffsetT* result, OffsetT num_items, Int2Type /*CAN_VECTORIZE*/)
+  {
+    using InputT  = cub::detail::value_t;
+    using VectorT = typename CubVector::Type;
+
+    __shared__ OffsetT block_result;
+
+    if (threadIdx.x == 0)
+    {
+      block_result = num_items;
+    }
+
+    __syncthreads();
+
+    enum
+    {
+      NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH
+    };
+    //// vectorized loads begin
+    InputT* d_in_unqualified = const_cast(d_in) + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+
+    cub::CacheModifiedInputIterator d_vec_in(
+      reinterpret_cast(d_in_unqualified));
+
+    InputT input_items[ITEMS_PER_THREAD];
+    VectorT* vec_items = reinterpret_cast(input_items);
+
+#pragma unroll
+    for (int i = 0; i < NUMBER_OF_VECTORS; ++i)
+    {
+      vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+    }
+    //// vectorized loads end
+
+    bool found = false;
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+      OffsetT nth_vector_of_thread = i / VECTOR_LOAD_LENGTH;
+      OffsetT element_in_word      = i % VECTOR_LOAD_LENGTH;
+      OffsetT vector_of_tile       = nth_vector_of_thread * BLOCK_THREADS + threadIdx.x;
+
+      OffsetT index = tile_offset + vector_of_tile * VECTOR_LOAD_LENGTH + element_in_word;
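+      // Index mapping for the strip-mined vector loads above (mirroring the
+      // worked example in the pre-dispatch implementation this replaces):
+      //   element_in_word      -> element within the thread's current vector (0 1 2 3, ...)
+      //   nth_vector_of_thread -> which round of block-wide vector loads it came from
+      //   vector_of_tile       -> position of that vector within the tile,
+      // so each thread re-visits exactly the elements it loaded into input_items.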
+
+      if (index < num_items)
+      {
+        if (pred(input_items[i]))
+        {
+          found = true;
+          atomicMin(&block_result, index);
+          break; // Each thread covers multiple elements per tile; once it
+                 // finds a local match it doesn't need to proceed further
+                 // (inner early exit).
+        }
+      }
+    }
+
+    if (syncthreads_or(found))
+    {
+      if (threadIdx.x == 0)
+      {
+        if (block_result < num_items)
+        {
+          atomicMin(result, block_result);
+        }
+      }
+    }
+  }
+
+  template
+  __device__ void
+  ConsumeTile(int tile_offset, Pred pred, OffsetT* result, OffsetT num_items, Int2Type /*CAN_VECTORIZE*/)
+  {
+    __shared__ int block_result;
+
+    if (threadIdx.x == 0)
+    {
+      block_result = num_items;
+    }
+
+    __syncthreads();
+
+    bool found = false;
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+      auto index = tile_offset + threadIdx.x + i * blockDim.x;
+
+      if (index < num_items)
+      {
+        if (pred(*(d_in + index)))
+        {
+          found = true;
+          atomicMin(&block_result, index);
+          break;
+        }
+      }
+    }
+    if (syncthreads_or(found))
+    {
+      if (threadIdx.x == 0)
+      {
+        if (block_result < num_items)
+        {
+          atomicMin(result, block_result);
+        }
+      }
+    }
+  }
+
+  __device__ void Process(OffsetT* value_temp_storage, OffsetT num_items)
+  {
+    for (int tile_offset = blockIdx.x * TILE_ITEMS; tile_offset < num_items; tile_offset += TILE_ITEMS * gridDim.x)
+    {
+      // Only one thread reads atomically and propagates it to the
+      // the rest threads of the block through shared memory
+      if (threadIdx.x == 0)
+      {
+        sresult = atomicAdd(value_temp_storage, 0);
+      }
+      __syncthreads();
+
+      // early exit
+      if (sresult < tile_offset)
+      {
+        return;
+      }
+
+      IsAlignedAndFullTile(d_in, tile_offset, TILE_ITEMS, num_items, Int2Type())
+        ? ConsumeTile(tile_offset, scan_op, value_temp_storage, num_items, Int2Type())
+        : ConsumeTile(tile_offset, scan_op, value_temp_storage, num_items, Int2Type());
+    }
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh
index 706201a274a..ee30f5c4392 100644
--- a/cub/cub/device/device_find_if.cuh
+++ b/cub/cub/device/device_find_if.cuh
@@ -49,6 +49,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -56,273 +57,27 @@
 #include
 
-static constexpr int elements_per_thread = 16;
-static constexpr int _VECTOR_LOAD_LENGTH = 4;
-static constexpr int block_threads       = 128;
-
 CUB_NAMESPACE_BEGIN
 
-template
-static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile(
-  Iterator d_in, int tile_offset, int tile_size, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/)
-{
-  /// Create an AgentFindIf and extract these two as type member in the encapsulating struct
-  using InputT  = cub::detail::value_t;
-  using VectorT = typename CubVector::Type;
-  ///
-  bool full_tile  = (tile_offset + tile_size) <= num_items;
-  bool is_aligned = (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-  return full_tile && is_aligned;
-}
-
-template
-static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile(
-  Iterator /*d_in*/,
-  int /*tile_offset*/,
-  int /*tile_size*/,
-  std::size_t /*num_items*/,
-  Int2Type /*CAN_VECTORIZE*/)
-{
-  return false;
-}
-
-template
-__device__ void ConsumeRange(
-  IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/)
-{
-  __shared__ int block_result;
-
-  if (threadIdx.x == 0)
-  {
-    block_result = num_items;
-  }
-
-  using InputT  = cub::detail::value_t;
-  using VectorT = typename CubVector::Type;
-
-  enum
-  {
-    WORDS = elements_per_thread / _VECTOR_LOAD_LENGTH
-  };
-  //// vectorized loads begin
-  InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH);
-
-  cub::CacheModifiedInputIterator d_vec_in(
-    reinterpret_cast(d_in_unqualified));
-
-  InputT input_items[elements_per_thread];
-  VectorT* vec_items = reinterpret_cast(input_items);
-
-#pragma unroll
-  for (int i = 0; i < WORDS; ++i)
-  {
-    vec_items[i] = d_vec_in[block_threads * i];
-  }
-  //// vectorized loads end
-
-  bool found = false;
-  for (int i = 0; i < elements_per_thread; ++i)
-  {
-    int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset;
-    // i % WORDS                           = + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static)
-    // (i / WORDS) * block_threads * WORDS = + 0, 64, 128, ... (static)
-    // threadIdx.x * WORDS                 = + 0, 4, 8, ... offset of the thread within working tile
-    // tile_offset                         = + just start at the beginning of the block
-
-    if (index < num_items)
-    {
-      if (pred(input_items[i]))
-      {
-        found = true;
-        atomicMin(&block_result, index);
-        break; // every thread goes over multiple elements per thread
-               // for every tile. If a thread finds a local minimum it doesn't
-               // need to proceed further (inner early exit).
-      }
-    }
-  }
-
-  if (syncthreads_or(found))
-  {
-    if (threadIdx.x == 0)
-    {
-      if (block_result < num_items)
-      {
-        atomicMin(result, block_result);
-      }
-    }
-  }
-}
-
-template
-__device__ void ConsumeRange(
-  IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/)
-{
-  __shared__ int block_result;
-
-  if (threadIdx.x == 0)
-  {
-    block_result = num_items;
-  }
-
-  bool found = false;
-  for (int i = 0; i < elements_per_thread; ++i)
-  {
-    auto index = tile_offset + threadIdx.x + i * blockDim.x;
-
-    if (index < num_items)
-    {
-      if (pred(*(begin + index)))
-      {
-        found = true;
-        atomicMin(&block_result, index);
-        break;
-      }
-    }
-  }
-  if (syncthreads_or(found))
-  {
-    if (threadIdx.x == 0)
-    {
-      if (block_result < num_items)
-      {
-        atomicMin(result, block_result);
-      }
-    }
-  }
-}
-
-template
-__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items)
-{
-  using InputT = cub::detail::value_t;
-
-  // 1. _VECTOR_LOAD_LENGTH > 1: number of items per vectorized load should have been determined to be more than 1.
-  //    Now it's hardcoded but it will be determined at compile time according to the GPU architecture (Policy) later
-  //    on.
-  // 2. elements_per_thread % _VECTOR_LOAD_LENGTH == 0: elements_per_thread is also defined at compile time after tuning
-  //    for specific architectures. There is not point in doing vectorization if a thread cannot vectorize load all the
-  //    elmenets it is going to be working on, on a single tile.
-  // 3. ::cuda::std::is_pointer::value: is contiguous iterator. If memory is not contiguous loading
-  //    vectorized memory makes no sense. Defined at compile time.
-  // 4. Traits>::PRIMITIVE: InputT cannot be an arbitrary type, that could be layed out
-  //    in memory in any way. Needs to be known. Compile time.
-
-  static constexpr bool ATTEMPT_VECTORIZATION =
-    (_VECTOR_LOAD_LENGTH > 1) && (elements_per_thread % _VECTOR_LOAD_LENGTH == 0)
-    && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE;
-
-  auto tile_size = blockDim.x * elements_per_thread;
-  __shared__ int sresult;
-
-  for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x)
-  {
-    // Only one thread reads atomically and propagates it to the
-    // the rest threads of the block through shared memory
-    if (threadIdx.x == 0)
-    {
-      sresult = atomicAdd(result, 0);
-    }
-    __syncthreads();
-
-    // early exit
-    if (sresult < tile_offset)
-    {
-      return;
-    }
-
-    IsAlignedAndFullTile(begin, tile_offset, tile_size, num_items, Int2Type())
-      ? ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < true && ATTEMPT_VECTORIZATION > ())
-      : ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < false && ATTEMPT_VECTORIZATION > ());
-  }
-}
-
-template
-__global__ void write_final_result_in_output_iterator_already(ValueType* d_temp_storage, OutputIteratorT d_out)
-{
-  *d_out = *d_temp_storage;
-}
-
-template
-__global__ void cuda_mem_set_async_dtemp_storage(ValueType* d_temp_storage, NumItemsT num_items)
-{
-  *d_temp_storage = num_items;
-}
-
 struct DeviceFind
 {
   template
-  CUB_RUNTIME_FUNCTION static void FindIf(
+  CUB_RUNTIME_FUNCTION static cudaError_t FindIf(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    ScanOpT op,
+    ScanOpT scan_op,
     NumItemsT num_items,
     cudaStream_t stream = 0)
   {
-    int tile_size = block_threads * elements_per_thread;
-    int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size));
-
-    // Get device ordinal
-    int device_ordinal;
-    cudaError error = CubDebug(cudaGetDevice(&device_ordinal));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    // Get SM count
-    int sm_count;
-    error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    int find_if_sm_occupancy;
-    error = CubDebug(
-      cub::MaxSmOccupancy(find_if_sm_occupancy, find_if, block_threads));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    int findif_device_occupancy = find_if_sm_occupancy * sm_count;
-
-    // Even-share work distribution
-    int max_blocks = findif_device_occupancy; // no * CUB_SUBSCRIPTION_FACTOR(0) because max_blocks gets too big
-
-    int findif_grid_size = CUB_MIN(num_tiles, max_blocks);
-
-    // Temporary storage allocation requirements
-    void* allocations[1]       = {};
-    size_t allocation_sizes[1] = {sizeof(int)};
-
-    // Alias the temporary allocations from the single storage blob (or
-    // compute the necessary size of the blob)
-    error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    int* int_temp_storage = static_cast(allocations[0]); // this shouldn't be just int
-
-    if (d_temp_storage == nullptr)
-    {
-      return;
-    }
-
-    // use d_temp_storage as the intermediate device result
-    // to read and write from. Then store the final result in the output iterator.
-    cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items);
-
-    find_if<<>>(d_in, d_in + num_items, op, int_temp_storage, num_items);
+    // CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceFind::FindIf");
 
-    write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out);
+    // Signed integer type for global offsets
+    using OffsetT = detail::choose_offset_t;
 
-    return;
+    return DispatchFind::Dispatch(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), scan_op, stream);
   }
 };
 
diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh
new file mode 100644
index 00000000000..52bf5f6efe8
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_find.cuh
@@ -0,0 +1,331 @@
+/******************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * @file cub::DeviceFind provides device-wide, parallel operations for
+ * searching for items within a sequence of data items residing within
+ * device-accessible memory.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "cub/util_type.cuh"
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+
+CUB_NAMESPACE_BEGIN
+
+template
+__global__ void write_final_result_in_output_iterator_already(ValueType* d_temp_storage, OutputIteratorT d_out)
+{
+  *d_out = *d_temp_storage;
+}
+
+template
+__global__ void cuda_mem_set_async_dtemp_storage(ValueType* d_temp_storage, NumItemsT num_items)
+{
+  *d_temp_storage = num_items;
+}
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Kernel entry point for device-wide find-if. Each thread block searches
+ * tiles of the input for the first item satisfying `scan_op` and atomically
+ * lowers `value_temp_storage` to the smallest matching index.
+ */
+template
+__launch_bounds__(int(ChainedPolicyT::ActivePolicy::FindPolicy::BLOCK_THREADS))
+  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceFindKernel(
+    InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, OffsetT* value_temp_storage, ScanOpT scan_op)
+{
+  using AgentFindT =
+    AgentFind;
+
+  __shared__ typename AgentFindT::TempStorage sresult;
+  // __shared__ int temp_storage;
+  // Process tiles
+  AgentFindT agent(sresult, d_in, scan_op); // Seems like sresult can be defined and initialized in agent_find.cuh
+                                            // directly without having to pass it here as an argument.
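+  // Each thread block strides over the input tile by tile; AgentFind::Process
+  // performs the per-tile search and the early-exit check against the global
+  // result in value_temp_storage.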
+
+  agent.Process(value_temp_storage, num_items);
+}
+
+template
+struct DeviceFindPolicy
+{
+  //---------------------------------------------------------------------------
+  // Architecture-specific tuning policies
+  //---------------------------------------------------------------------------
+
+  /// SM30
+  struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
+  {
+    static constexpr int threads_per_block  = 128;
+    static constexpr int items_per_thread   = 16;
+    static constexpr int items_per_vec_load = 4;
+
+    // FindPolicy (GTX670: 154.0 @ 48M 4B items)
+    using FindPolicy =
+      AgentFindPolicy, items_per_vec_load, LOAD_LDG>;
+
+    // // SingleTilePolicy
+    // using SingleTilePolicy = FindPolicy;
+  };
+
+  using MaxPolicy = Policy300;
+};
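+
+// ChainedPolicy resolves, at dispatch time, to the most recent tuning policy
+// whose PTX version does not exceed the target device's; with only Policy300
+// defined, every architecture currently uses the 128-thread,
+// 16-items-per-thread configuration above.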
+
+template >
+struct DispatchFind : SelectedPolicy
+{
+  //---------------------------------------------------------------------------
+  // Problem state
+  //---------------------------------------------------------------------------
+
+  /// Device-accessible allocation of temporary storage. When `nullptr`, the
+  /// required allocation size is written to `temp_storage_bytes` and no work
+  /// is done.
+  void* d_temp_storage;
+
+  /// Reference to size in bytes of `d_temp_storage` allocation
+  size_t& temp_storage_bytes;
+
+  /// Pointer to the input sequence of data items
+  InputIteratorT d_in;
+
+  /// Pointer to the output aggregate
+  OutputIteratorT d_out;
+
+  /// Total number of input items (i.e., length of `d_in`)
+  OffsetT num_items;
+
+  /// Unary search functor
+  ScanOpT scan_op;
+
+  /// CUDA stream to launch kernels within. Default is stream 0.
+  cudaStream_t stream;
+
+  int ptx_version;
+
+  //---------------------------------------------------------------------------
+  // Constructor
+  //---------------------------------------------------------------------------
+
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchFind(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    OffsetT num_items,
+    ScanOpT scan_op,
+    cudaStream_t stream,
+    int ptx_version)
+      : d_temp_storage(d_temp_storage)
+      , temp_storage_bytes(temp_storage_bytes)
+      , d_in(d_in)
+      , d_out(d_out)
+      , num_items(num_items)
+      , scan_op(scan_op)
+      , stream(stream)
+      , ptx_version(ptx_version)
+  {}
+
+  //---------------------------------------------------------------------------
+  // Normal problem size invocation
+  //---------------------------------------------------------------------------
+
+  //---------------------------------------------------------------------------
+  // Chained policy invocation
+  //---------------------------------------------------------------------------
+
+  /// Invocation
+  template
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(FindKernel find_kernel)
+  {
+    using Policy = typename ActivePolicyT::FindPolicy;
+
+    cudaError error = cudaSuccess;
+    do
+    {
+      // Number of input tiles
+      int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
+      int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size));
+
+      // Get device ordinal
+      int device_ordinal;
+      error = CubDebug(cudaGetDevice(&device_ordinal));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      // Get SM count
+      int sm_count;
+      error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      int find_if_sm_occupancy;
+      error = CubDebug(cub::MaxSmOccupancy(find_if_sm_occupancy, find_kernel, Policy::BLOCK_THREADS));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      int findif_device_occupancy = find_if_sm_occupancy * sm_count;
+      int max_blocks       = findif_device_occupancy; // no * CUB_SUBSCRIPTION_FACTOR(0) because max_blocks gets too big
+      int findif_grid_size = CUB_MIN(num_tiles, max_blocks);
+
+      // Temporary storage allocation requirements
+      void* allocations[1]       = {};
+      size_t allocation_sizes[1] = {sizeof(int)};
+      // Alias the temporary allocations from the single storage blob (or
+      // compute the necessary size of the blob)
+      error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      OffsetT* value_temp_storage = static_cast(allocations[0]);
+
+      if (d_temp_storage == nullptr)
+      {
+        // Return if the caller is simply requesting the size of the storage
+        // allocation
+        return cudaSuccess;
+      }
+
+      // use d_temp_storage as the intermediate device result
+      // to read and write from. Then store the final result in the output iterator.
+
+      cuda_mem_set_async_dtemp_storage<<<1, 1>>>(value_temp_storage, num_items);
+
+      // Invoke FindIfKernel
+      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        findif_grid_size, ActivePolicyT::FindPolicy::BLOCK_THREADS, 0, stream)
+        .doit(find_kernel, d_in, d_out, num_items, value_temp_storage, scan_op);
+
+      write_final_result_in_output_iterator_already<<<1, 1>>>(value_temp_storage, d_out);
+
+      // Check for failure to launch
+      error = CubDebug(cudaPeekAtLastError());
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      // Sync the stream if specified to flush runtime errors
+      error = CubDebug(detail::DebugSyncStream(stream));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+    } while (0);
+    return error;
+  }
+
+  template
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
+  {
+    using MaxPolicyT = typename SelectedPolicy::MaxPolicy;
+    return Invoke(
+      DeviceFindKernel); // include the surrounding two
+                         // init and write back kernels
+                         // here.
+  }
+
+  //---------------------------------------------------------------------------
+  // Dispatch entrypoints
+  //---------------------------------------------------------------------------
+
+  /**
+   * @brief @giannis ENTER NO DOCUMENTATION. DISPATCH LAYER IN NEW ALGOS NOT EXPOSED
+   *
+  // private: ?????
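+   * Internal entry point. On the first call (d_temp_storage == nullptr) it
+   * only computes the required temporary-storage size; on the second call it
+   * launches the init, find-if, and write-back kernels.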
+   */
+
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    OffsetT num_items,
+    ScanOpT scan_op,
+    cudaStream_t stream)
+  {
+    using MaxPolicyT = typename DispatchFind::MaxPolicy;
+
+    cudaError error = cudaSuccess;
+    do
+    {
+      // Get PTX version
+      int ptx_version = 0;
+      error = CubDebug(PtxVersion(ptx_version));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+      // Create dispatch functor
+      DispatchFind dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, scan_op, stream, ptx_version);
+
+      // Dispatch to chained policy
+      error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); // @giannis how is Invoke() being called here,
+                                                                   // since it takes no arguments?
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+    } while (0);
+
+    return error;
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu
index 2f7a3175049..67918021e6a 100644
--- a/cub/test/catch2_test_device_find_if.cu
+++ b/cub/test/catch2_test_device_find_if.cu
@@ -36,11 +36,8 @@
 #include
 #include
 
-#include
-
 #include "c2h/custom_type.cuh"
 #include "catch2_test_device_reduce.cuh"
-#include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 #include "thrust/detail/raw_pointer_cast.h"
 #include
@@ -57,7 +54,7 @@ using custom_t =
   c2h::lexicographical_less_comparable_t,
   c2h::lexicographical_greater_comparable_t>;
 
-using full_type_list = c2h::type_list, type_pair>;
+using full_type_list = c2h::type_list, type_pair>;
 // clang-format on
 
 enum class gen_data_t : int
@@ -89,15 +86,15 @@ struct equals
   }
 };
 
-CUB_TEST("Device find_if works", "[device]", full_type_list)
+C2H_TEST("Device find_if works", "[device]", full_type_list)
 {
   using params   = params_t;
   using input_t  = typename params::item_t;
   using output_t = typename params::output_t;
-  using offset_t = int32_t;
+  using offset_t = output_t;
 
   constexpr offset_t min_items = 1;
-  constexpr offset_t max_items = 1000000;
+  constexpr offset_t max_items = std::numeric_limits::max() / 5; // make test run faster
 
   // Generate the input sizes to test for
   const offset_t num_items = GENERATE_COPY(
@@ -114,7 +111,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
   c2h::device_vector in_items(num_items);
   if (data_gen_mode == gen_data_t::GEN_TYPE_RANDOM)
   {
-    c2h::gen(CUB_SEED(2), in_items);
+    c2h::gen(C2H_SEED(2), in_items);
   }
   else
   {
@@ -124,11 +121,11 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
   }
   auto d_in_it = thrust::raw_pointer_cast(in_items.data());
 
+  using op_t          = equals;
+  input_t val_to_find = GENERATE_COPY(take(1, random(min_items, max_items)));
+
   SECTION("Generic find if case")
   {
-    using op_t = equals;
-    input_t val_to_find{2};
-
     // Prepare verification data
     c2h::host_vector host_items(in_items);
     c2h::host_vector expected_result(1);
@@ -139,12 +136,12 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
 
     // Run test
     c2h::device_vector out_result(1);
-    auto d_out_it = thrust::raw_pointer_cast(out_result.data());
+    output_t* d_out_it = thrust::raw_pointer_cast(out_result.data());
 
     cub::DeviceFind::FindIf(
       d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{val_to_find}, num_items);
 
-    thrust::device_vector temp_storage(temp_storage_bytes);
+    thrust::device_vector temp_storage(temp_storage_bytes);
     d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
     cub::DeviceFind::FindIf(
@@ -156,9 +153,6 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
"[device]", full_type_list) SECTION("find_if works with non raw pointers - .begin() iterator") { - using op_t = equals; - input_t val_to_find{2}; - // Prepare verification data c2h::host_vector host_items(in_items); c2h::host_vector expected_result(1); @@ -173,7 +167,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) cub::DeviceFind::FindIf( d_temp_storage, temp_storage_bytes, in_items.begin(), out_result.begin(), op_t{val_to_find}, num_items); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf( @@ -189,9 +183,6 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) { if (num_items - offset > 0) { - using op_t = equals; - input_t val_to_find{2}; - // Prepare verification data c2h::host_vector host_items(in_items); c2h::host_vector expected_result(1); @@ -212,7 +203,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) op_t{val_to_find}, num_items - offset); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf( @@ -231,9 +222,6 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) SECTION("find_if works with non primitive iterator") { - using op_t = equals; - input_t val_to_find{2}; - // Prepare verification data auto it = thrust::make_counting_iterator(0); // non-primitive iterator c2h::host_vector expected_result(1); @@ -248,7 +236,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, num_items); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, num_items); From eaa5b754e1eaef028819656f3123ca89cc38aa87 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Fri, 1 Nov 2024 17:28:45 -0700 Subject: [PATCH 12/13] Reviews, cleanup --- cub/cub/agent/agent_find.cuh | 11 ++++------- cub/test/catch2_test_device_find_if.cu | 3 --- thrust/benchmarks/bench/count_if/basic.cu | 3 --- thrust/benchmarks/bench/find_if/basic.cu | 3 --- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/cub/cub/agent/agent_find.cuh b/cub/cub/agent/agent_find.cuh index 80b35ea93a0..b3f377fcbdc 100644 --- a/cub/cub/agent/agent_find.cuh +++ b/cub/cub/agent/agent_find.cuh @@ -187,10 +187,7 @@ struct AgentFind __syncthreads(); - enum - { - NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH - }; + constexpr int NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH; //// vectorized loads begin InputT* d_in_unqualified = const_cast(d_in) + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); @@ -211,10 +208,10 @@ struct AgentFind for (int i = 0; i < ITEMS_PER_THREAD; ++i) { OffsetT nth_vector_of_thread = i / VECTOR_LOAD_LENGTH; - OffsetT element_in_word = i % VECTOR_LOAD_LENGTH; + OffsetT element_in_vector = i % VECTOR_LOAD_LENGTH; OffsetT vector_of_tile = nth_vector_of_thread * BLOCK_THREADS + threadIdx.x; - OffsetT index = tile_offset + vector_of_tile * VECTOR_LOAD_LENGTH + element_in_word; + OffsetT index = tile_offset + vector_of_tile * VECTOR_LOAD_LENGTH + element_in_vector; if (index < num_items) { @@ -286,7 +283,7 @@ struct 
     for (int tile_offset = blockIdx.x * TILE_ITEMS; tile_offset < num_items; tile_offset += TILE_ITEMS * gridDim.x)
     {
       // Only one thread reads atomically and propagates it to the
-      // the rest threads of the block
+      // the other threads of the block through shared memory
       if (threadIdx.x == 0)
       {
         sresult = atomicAdd(value_temp_storage, 0);
diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu
index 67918021e6a..524f7c04c41 100644
--- a/cub/test/catch2_test_device_find_if.cu
+++ b/cub/test/catch2_test_device_find_if.cu
@@ -76,9 +76,6 @@ template
 struct equals
 {
   T val;
-  equals(T _val)
-      : val(_val)
-  {}
 
   __device__ __host__ bool operator()(T i)
   {
diff --git a/thrust/benchmarks/bench/count_if/basic.cu b/thrust/benchmarks/bench/count_if/basic.cu
index b672d789868..981e7c7610e 100644
--- a/thrust/benchmarks/bench/count_if/basic.cu
+++ b/thrust/benchmarks/bench/count_if/basic.cu
@@ -34,9 +34,6 @@ template
 struct equals
 {
   T val;
-  equals(T _val)
-      : val(_val)
-  {}
 
   __device__ __host__ bool operator()(T i)
   {
diff --git a/thrust/benchmarks/bench/find_if/basic.cu b/thrust/benchmarks/bench/find_if/basic.cu
index 8a2618e26d5..362619f29e0 100644
--- a/thrust/benchmarks/bench/find_if/basic.cu
+++ b/thrust/benchmarks/bench/find_if/basic.cu
@@ -34,9 +34,6 @@ template
 struct equals
 {
   T val;
-  equals(T _val)
-      : val(_val)
-  {}
 
   __device__ __host__ bool operator()(T i)
   {

From 7c2f2a87281d79a02c2a886e5c2c20f9870cb583 Mon Sep 17 00:00:00 2001
From: Giannis Gonidelis
Date: Mon, 11 Nov 2024 00:27:46 -0800
Subject: [PATCH 13/13] CacheModifiedInputIterator accepts const

Co-authored-by: Bernhard Manfred Gruber
---
 cub/cub/agent/agent_find.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/agent/agent_find.cuh b/cub/cub/agent/agent_find.cuh
index b3f377fcbdc..3298ad94cdb 100644
--- a/cub/cub/agent/agent_find.cuh
+++ b/cub/cub/agent/agent_find.cuh
@@ -189,10 +189,10 @@ struct AgentFind
     constexpr int NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH;
     //// vectorized loads begin
-    InputT* d_in_unqualified = const_cast(d_in) + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+    const InputT* d_in_unqualified = d_in + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
 
     cub::CacheModifiedInputIterator d_vec_in(
-      reinterpret_cast(d_in_unqualified));
+      reinterpret_cast(d_in_unqualified));
 
     InputT input_items[ITEMS_PER_THREAD];
     VectorT* vec_items = reinterpret_cast(input_items);
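
Usage note: the sketch below is an illustrative, self-contained host-side
example of the API this series adds, mirroring the two-phase temp-storage
pattern exercised by the tests above. The equals functor is copied from the
benchmarks; the input values and sizes are placeholders, and error handling
is omitted.

#include <cub/device/device_find_if.cuh>

#include <thrust/device_vector.h>

#include <cstddef>

template <class T>
struct equals
{
  T val;

  __device__ __host__ bool operator()(T i)
  {
    return i == val;
  }
};

int main()
{
  thrust::device_vector<int> in(1000, 1);
  in[42] = 2; // first (and only) match
  thrust::device_vector<int> out(1);

  // Phase 1: with d_temp_storage == nullptr, FindIf only writes the required
  // temporary-storage size into temp_storage_bytes.
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cub::DeviceFind::FindIf(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(in.data()),
    thrust::raw_pointer_cast(out.data()),
    equals<int>{2},
    in.size());

  // Phase 2: allocate the storage and run the search. Afterwards, out[0]
  // holds the index of the first element satisfying the predicate, or
  // num_items if no element does.
  thrust::device_vector<char> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
  cub::DeviceFind::FindIf(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(in.data()),
    thrust::raw_pointer_cast(out.data()),
    equals<int>{2},
    in.size());

  return 0;
}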