diff --git a/cub/test/catch2_test_memcpy_batched.cu b/cub/test/catch2_test_memcpy_batched.cu
new file mode 100644
index 00000000000..b2e5ab19761
--- /dev/null
+++ b/cub/test/catch2_test_memcpy_batched.cu
@@ -0,0 +1,200 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cub/device/device_memcpy.cuh>
+#include <cub/util_macro.cuh>
+
+#include <thrust/copy.h>
+#include <thrust/equal.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/random.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
+#include <thrust/shuffle.h>
+
+#include "catch2_test_launch_helper.h"
+#include <c2h/catch2_test_helper.h>
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, memcpy_batched);
+
+/**
+ * @brief Function object that takes an offset and returns an iterator at the given
+ * offset relative to a fixed base iterator.
+ */
+template <typename IteratorT>
+struct offset_to_ptr_op
+{
+  IteratorT base_it;
+
+  template <typename T>
+  __host__ __device__ __forceinline__ IteratorT operator()(T offset) const
+  {
+    return base_it + offset;
+  }
+};
+
+/**
+ * @brief Used for generating a shuffled but cohesive sequence of output-buffer offsets for the
+ * sequence of input-buffers.
+ */
+template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
+auto get_shuffled_buffer_offsets(const c2h::device_vector<BufferSizeT>& buffer_sizes, c2h::seed_t seed)
+  -> c2h::device_vector<ByteOffsetT>
+{
+  auto const num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());
+
+  // We're remapping the i-th buffer to pmt_idxs[i]
+  c2h::device_vector<BufferOffsetT> pmt_idxs(num_buffers);
+  const auto buffer_index_it = thrust::make_counting_iterator(BufferOffsetT{0});
+  thrust::shuffle_copy(
+    buffer_index_it,
+    buffer_index_it + num_buffers,
+    pmt_idxs.begin(),
+    thrust::default_random_engine(static_cast<thrust::default_random_engine::result_type>(seed.get())));
+
+  c2h::device_vector<ByteOffsetT> permuted_offsets(num_buffers);
+  auto permuted_buffer_sizes_it = thrust::make_permutation_iterator(buffer_sizes.begin(), pmt_idxs.begin());
+  thrust::exclusive_scan(permuted_buffer_sizes_it, permuted_buffer_sizes_it + num_buffers, permuted_offsets.begin());
+
+  c2h::device_vector<BufferOffsetT> scatter_idxs(num_buffers);
+  thrust::scatter(buffer_index_it, buffer_index_it + num_buffers, pmt_idxs.cbegin(), scatter_idxs.begin());
+
+  // Gather the permuted offsets for shuffled buffer offsets
+  c2h::device_vector<ByteOffsetT> buffer_offsets(num_buffers);
+  thrust::gather(scatter_idxs.cbegin(), scatter_idxs.cend(), permuted_offsets.cbegin(), buffer_offsets.begin());
+  return buffer_offsets;
+}
+
+C2H_TEST("DeviceMemcpy::Batched works", "[memcpy]")
+try
+{
+  using src_ptr_t = const uint8_t*;
+  using dst_ptr_t = uint8_t*;
+
+  // Type used for indexing into the array of buffers
+  using buffer_offset_t = uint32_t;
+
+  // Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer size)
+  using buffer_size_t = uint32_t;
+
+  // Type used for indexing into bytes over *all* the buffers' sizes
+  using byte_offset_t = uint32_t;
+
+  // Total number of bytes that are targeted to be copied on each run
+  constexpr buffer_offset_t target_copy_size = 32U << 20;
+
+  // Pairs of [min, max] buffer sizes
+  auto buffer_size_range = GENERATE_COPY(
+    table<int, int>(
+      {{0, 1},
+       {1, 2},
+       {0, 32},
+       {1, 1024},
+       {1, 32 * 1024},
+       {128 * 1024, 256 * 1024},
+       {target_copy_size, target_copy_size}}),
+    take(4,
+         map(
+           [](const std::vector<int>& chunk) {
+             int lhs = chunk[0];
+             int rhs = chunk[1];
+             // Ensure that lhs <= rhs
+             return (lhs < rhs) ? std::make_tuple(lhs, rhs) : std::make_tuple(rhs, lhs);
+           },
+           chunk(2, random(1, 1000000)))));
+
+  const auto min_buffer_size = static_cast<buffer_size_t>(std::get<0>(buffer_size_range));
+  const auto max_buffer_size = static_cast<buffer_size_t>(std::get<1>(buffer_size_range));
+  const double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
+  const auto num_buffers = static_cast<buffer_offset_t>(target_copy_size / average_buffer_size);
+
+  c2h::device_vector<buffer_size_t> d_buffer_sizes(num_buffers);
+
+  // Generate the buffer sizes
+  c2h::gen(C2H_SEED(2), d_buffer_sizes, min_buffer_size, max_buffer_size);
+  byte_offset_t num_total_bytes = thrust::reduce(d_buffer_sizes.cbegin(), d_buffer_sizes.cend());
+
+  // Shuffle input buffer source-offsets
+  auto d_buffer_src_offsets = get_shuffled_buffer_offsets<buffer_offset_t, byte_offset_t>(d_buffer_sizes, C2H_SEED(1));
+
+  // Shuffle output buffer destination-offsets
+  auto d_buffer_dst_offsets = get_shuffled_buffer_offsets<buffer_offset_t, byte_offset_t>(d_buffer_sizes, C2H_SEED(1));
+
+  // Generate random input data and initialize output data
+  c2h::device_vector<uint8_t> d_in(num_total_bytes);
+  c2h::device_vector<uint8_t> d_out(num_total_bytes, 42);
+  c2h::gen(C2H_SEED(1), d_in);
+
+  // Prepare host-side input data for verification
+  c2h::host_vector<uint8_t> h_in(d_in);
+  c2h::host_vector<uint8_t> h_out(num_total_bytes);
+  c2h::host_vector<buffer_size_t> h_buffer_sizes(d_buffer_sizes);
+  c2h::host_vector<byte_offset_t> h_src_offsets(d_buffer_src_offsets);
+  c2h::host_vector<byte_offset_t> h_dst_offsets(d_buffer_dst_offsets);
+
+  // Prepare d_buffer_srcs
+  offset_to_ptr_op<src_ptr_t> src_transform_op{static_cast<src_ptr_t>(thrust::raw_pointer_cast(d_in.data()))};
+  thrust::transform_iterator<offset_to_ptr_op<src_ptr_t>, byte_offset_t*> d_buffer_srcs(
+    thrust::raw_pointer_cast(d_buffer_src_offsets.data()), src_transform_op);
+
+  // Prepare d_buffer_dsts
+  offset_to_ptr_op<dst_ptr_t> dst_transform_op{static_cast<dst_ptr_t>(thrust::raw_pointer_cast(d_out.data()))};
+  thrust::transform_iterator<offset_to_ptr_op<dst_ptr_t>, byte_offset_t*> d_buffer_dsts(
+    thrust::raw_pointer_cast(d_buffer_dst_offsets.data()), dst_transform_op);
+
+  // Invoke device-side algorithm
+  memcpy_batched(d_buffer_srcs, d_buffer_dsts, d_buffer_sizes.begin(), num_buffers);
+
+  // Prepare CPU-side result for verification
+  for (buffer_offset_t i = 0; i < num_buffers; i++)
+  {
+    std::memcpy(thrust::raw_pointer_cast(h_out.data()) + h_dst_offsets[i],
+                thrust::raw_pointer_cast(h_in.data()) + h_src_offsets[i],
+                h_buffer_sizes[i]);
+  }
+
+  REQUIRE(d_out == h_out);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Caught bad_alloc: " << e.what() << std::endl;
+}
+
+C2H_TEST("DeviceMemcpy::Batched works for a very large buffer", "[memcpy]")
+try
+{
+  using data_t = uint64_t;
+  using byte_offset_t = uint64_t;
+  using buffer_size_t = uint64_t;
+
+  byte_offset_t large_target_copy_size = static_cast<byte_offset_t>(std::numeric_limits<uint32_t>::max()) + (32 << 20);
+  constexpr auto data_type_size = static_cast<byte_offset_t>(sizeof(data_t));
+  byte_offset_t num_items = large_target_copy_size / data_type_size;
+  byte_offset_t num_bytes = num_items * data_type_size;
+  c2h::device_vector<data_t> d_in(num_items);
+  c2h::device_vector<data_t> d_out(num_items, 42);
+
+  auto input_data_it = thrust::make_counting_iterator(data_t{42});
+  thrust::copy(input_data_it, input_data_it + num_items, d_in.begin());
+
+  const auto num_buffers = 1;
+  auto d_buffer_srcs = thrust::make_constant_iterator(static_cast<const data_t*>(thrust::raw_pointer_cast(d_in.data())));
+  auto d_buffer_dsts = thrust::make_constant_iterator(static_cast<data_t*>(thrust::raw_pointer_cast(d_out.data())));
+  auto d_buffer_sizes = thrust::make_constant_iterator(num_bytes);
+
+  memcpy_batched(d_buffer_srcs, d_buffer_dsts, d_buffer_sizes, num_buffers);
+
+  const bool all_equal = thrust::equal(d_out.cbegin(), d_out.cend(), input_data_it);
+  REQUIRE(all_equal == true);
+}
+catch (std::bad_alloc& e)
+{
+  std::cerr << "Caught bad_alloc: " << e.what() << std::endl;
+}
diff --git a/cub/test/test_device_batch_memcpy.cu b/cub/test/test_device_batch_memcpy.cu
deleted file mode 100644
index ee4cabfcf70..00000000000
--- a/cub/test/test_device_batch_memcpy.cu
+++ /dev/null
@@ -1,433 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#include <cub/device/device_memcpy.cuh>
-#include <cub/util_macro.cuh>
-
-#include <thrust/device_vector.h>
-#include <thrust/fill.h>
-#include <thrust/host_vector.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/memory.h>
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <iostream>
-#include <numeric>
-#include <random>
-#include <string>
-#include <type_traits>
-
-#include "test_util.h"
-#include <c2h/device_policy.h>
-#include <c2h/vector.h>
-
-/**
- * @brief Host-side random data generation
- */
-template <typename T>
-void GenerateRandomData(
-  T* rand_out,
-  const std::size_t num_items,
-  const T min_rand_val          = ::cuda::std::numeric_limits<T>::min(),
-  const T max_rand_val          = ::cuda::std::numeric_limits<T>::max(),
-  const std::uint_fast32_t seed = 320981U,
-  std::enable_if_t<std::is_integral_v<T> && (sizeof(T) >= 2)>* = nullptr)
-{
-  // initialize random number generator
-  std::mt19937 rng(seed);
-  std::uniform_int_distribution<T> uni_dist(min_rand_val, max_rand_val);
-
-  // generate random numbers
-  for (std::size_t i = 0; i < num_items; ++i)
-  {
-    rand_out[i] = uni_dist(rng);
-  }
-}
-
-template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT, typename BufferOffsetT>
-void __global__ BaselineBatchMemCpyKernel(
-  InputBufferIt input_buffer_it,
-  OutputBufferIt output_buffer_it,
-  BufferSizeIteratorT buffer_sizes,
-  BufferOffsetT num_buffers)
-{
-  BufferOffsetT gtid = blockDim.x * blockIdx.x + threadIdx.x;
-  if (gtid >= num_buffers)
-  {
-    return;
-  }
-  for (BufferOffsetT i = 0; i < buffer_sizes[gtid]; i++)
-  {
-    reinterpret_cast<uint8_t*>(output_buffer_it[gtid])[i] = reinterpret_cast<const uint8_t*>(input_buffer_it[gtid])[i];
-  }
-}
-
-template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT>
-void InvokeBaselineBatchMemcpy(
-  InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, uint32_t num_buffers)
-{
-  constexpr uint32_t block_threads = 128U;
-  uint32_t num_blocks = (num_buffers + block_threads - 1) / block_threads;
-  BaselineBatchMemCpyKernel<<<num_blocks, block_threads>>>(input_buffer_it, output_buffer_it, buffer_sizes, num_buffers);
-}
-
-template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT, typename BufferOffsetT>
-void __global__ BaselineBatchMemCpyPerBlockKernel(
-  InputBufferIt input_buffer_it,
-  OutputBufferIt output_buffer_it,
-  BufferSizeIteratorT buffer_sizes,
-  BufferOffsetT num_buffers)
-{
-  BufferOffsetT gbid = blockIdx.x;
-  if (gbid >= num_buffers)
-  {
-    return;
-  }
-  for (BufferOffsetT i = threadIdx.x; i < buffer_sizes[gbid] / 8; i += blockDim.x)
-  {
-    reinterpret_cast<uint64_t*>(output_buffer_it[gbid])[i] = reinterpret_cast<const uint64_t*>(input_buffer_it[gbid])[i];
-  }
-}
-
-/**
- * @brief Used for generating a shuffled but cohesive sequence of output-buffer offsets for the
- * sequence of input-buffers.
- */
-template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
-c2h::host_vector<ByteOffsetT>
-GetShuffledBufferOffsets(const c2h::host_vector<BufferSizeT>& buffer_sizes, const std::uint_fast32_t seed = 320981U)
-{
-  BufferOffsetT num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());
-
-  // We're remapping the i-th buffer to pmt_idxs[i]
-  std::mt19937 rng(seed);
-  c2h::host_vector<BufferOffsetT> pmt_idxs(num_buffers);
-  std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast<BufferOffsetT>(0));
-  std::shuffle(std::begin(pmt_idxs), std::end(pmt_idxs), rng);
-
-  // Compute the offsets using the new mapping
-  ByteOffsetT running_offset = {};
-  c2h::host_vector<ByteOffsetT> permuted_offsets;
-  permuted_offsets.reserve(num_buffers);
-  for (auto permuted_buffer_idx : pmt_idxs)
-  {
-    permuted_offsets.push_back(running_offset);
-    running_offset += buffer_sizes[permuted_buffer_idx];
-  }
-
-  // Generate the scatter indexes that identify where each buffer was mapped to
-  c2h::host_vector<BufferOffsetT> scatter_idxs(num_buffers);
-  for (BufferOffsetT i = 0; i < num_buffers; i++)
-  {
-    scatter_idxs[pmt_idxs[i]] = i;
-  }
-
-  c2h::host_vector<ByteOffsetT> new_offsets(num_buffers);
-  for (BufferOffsetT i = 0; i < num_buffers; i++)
-  {
-    new_offsets[i] = permuted_offsets[scatter_idxs[i]];
-  }
-
-  return new_offsets;
-}
-
-/**
- * @brief Function object class template that takes an offset and returns an iterator at the given
- * offset relative to a fixed base iterator.
- *
- * @tparam IteratorT The random-access iterator type to be returned
- */
-template <typename IteratorT>
-struct OffsetToPtrOp
-{
-  template <typename T>
-  __host__ __device__ __forceinline__ IteratorT operator()(T offset) const
-  {
-    return base_it + offset;
-  }
-  IteratorT base_it;
-};
-
-enum class TestDataGen
-{
-  // Random offsets into a data segment
-  RANDOM,
-
-  // Buffers cohesively reside next to each other
-  CONSECUTIVE
-};
-
-std::string TestDataGenToString(TestDataGen gen)
-{
-  switch (gen)
-  {
-    case TestDataGen::RANDOM:
-      return "TestDataGen::RANDOM";
-    case TestDataGen::CONSECUTIVE:
-      return "TestDataGen::CONSECUTIVE";
-    default:
-      return "Unknown";
-  }
-}
-
-/**
- * @brief
- *
- * @tparam AtomicT The most granular type being copied. All source and destination pointers will be
- * aligned based on this type, the number of bytes being copied will be an integer multiple of this
- * type's size
- * @tparam BufferOffsetT Type used for indexing into the array of buffers
- * @tparam BufferSizeT Type used for indexing into individual bytes of a buffer (large enough to
- * cover the max buffer size)
- * @tparam ByteOffsetT Type used for indexing into bytes over *all* the buffers' sizes
- */
-template <typename AtomicT, typename BufferOffsetT, typename BufferSizeT, typename ByteOffsetT>
-void RunTest(BufferOffsetT num_buffers,
-             BufferSizeT min_buffer_size,
-             BufferSizeT max_buffer_size,
-             TestDataGen input_gen,
-             TestDataGen output_gen)
-{
-  try
-  {
-    using SrcPtrT = const uint8_t*;
-    using DstPtrT = uint8_t*;
-
-    // Buffer segment data (their offsets and sizes)
-    c2h::host_vector<BufferSizeT> h_buffer_sizes(num_buffers);
-    c2h::host_vector<ByteOffsetT> h_buffer_src_offsets(num_buffers);
-    c2h::host_vector<ByteOffsetT> h_buffer_dst_offsets(num_buffers);
-
-    // Generate the buffer sizes
-    GenerateRandomData(h_buffer_sizes.data(), h_buffer_sizes.size(), min_buffer_size, max_buffer_size);
-
-    // Make sure buffer sizes are a multiple of the most granular unit (one AtomicT) being copied
-    // (round down)
-    for (BufferOffsetT i = 0; i < num_buffers; i++)
-    {
-      h_buffer_sizes[i] = (h_buffer_sizes[i] / sizeof(AtomicT)) * sizeof(AtomicT);
-    }
-
-    // Compute the total bytes to be copied
-    ByteOffsetT num_total_bytes = 0;
-    for (BufferOffsetT i = 0; i < num_buffers; i++)
-    {
-      if (input_gen == TestDataGen::CONSECUTIVE)
-      {
-        h_buffer_src_offsets[i] = num_total_bytes;
-      }
-      if (output_gen == TestDataGen::CONSECUTIVE)
-      {
-        h_buffer_dst_offsets[i] = num_total_bytes;
-      }
-      num_total_bytes += h_buffer_sizes[i];
-    }
-
-    // Shuffle input buffer source-offsets
-    std::uint_fast32_t shuffle_seed = 320981U;
-    if (input_gen == TestDataGen::RANDOM)
-    {
-      h_buffer_src_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes, shuffle_seed);
-      shuffle_seed += 42;
-    }
-
-    // Shuffle input buffer source-offsets
-    if (output_gen == TestDataGen::RANDOM)
-    {
-      h_buffer_dst_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes, shuffle_seed);
-    }
-
-    // Populate the data source with random data
-    using RandomInitAliasT = uint16_t;
-    std::size_t num_aliased_factor = sizeof(RandomInitAliasT) / sizeof(uint8_t);
-    std::size_t num_aliased_units = CUB_QUOTIENT_CEILING(num_total_bytes, num_aliased_factor);
-
-    c2h::host_vector<uint8_t> h_in(num_aliased_units * num_aliased_factor);
-    c2h::host_vector<uint8_t> h_out(num_total_bytes);
-    c2h::host_vector<uint8_t> h_gpu_results(num_total_bytes);
-
-    // Generate random offsets into the random-bits data buffer
-    GenerateRandomData(reinterpret_cast<RandomInitAliasT*>(thrust::raw_pointer_cast(h_in.data())), num_aliased_units);
-
-    c2h::device_vector<uint8_t> d_in(h_in);
-    c2h::device_vector<uint8_t> d_out(num_total_bytes);
-    c2h::device_vector<ByteOffsetT> d_buffer_src_offsets(h_buffer_src_offsets);
-    c2h::device_vector<ByteOffsetT> d_buffer_dst_offsets(h_buffer_dst_offsets);
-    c2h::device_vector<BufferSizeT> d_buffer_sizes(h_buffer_sizes);
-
-    // Prepare d_buffer_srcs
-    OffsetToPtrOp<SrcPtrT> src_transform_op{static_cast<SrcPtrT>(thrust::raw_pointer_cast(d_in.data()))};
-    thrust::transform_iterator<OffsetToPtrOp<SrcPtrT>, ByteOffsetT*> d_buffer_srcs(
-      thrust::raw_pointer_cast(d_buffer_src_offsets.data()), src_transform_op);
-
-    // Prepare d_buffer_dsts
-    OffsetToPtrOp<DstPtrT> dst_transform_op{static_cast<DstPtrT>(thrust::raw_pointer_cast(d_out.data()))};
-    thrust::transform_iterator<OffsetToPtrOp<DstPtrT>, ByteOffsetT*> d_buffer_dsts(
-      thrust::raw_pointer_cast(d_buffer_dst_offsets.data()), dst_transform_op);
-
-    // Get temporary storage requirements
-    std::size_t temp_storage_bytes = 0;
-    CubDebugExit(cub::DeviceMemcpy::Batched(
-      nullptr, temp_storage_bytes, d_buffer_srcs, d_buffer_dsts, d_buffer_sizes.cbegin(), num_buffers));
-
-    c2h::device_vector<uint8_t> d_temp_storage(temp_storage_bytes);
-
-    // Invoke device-side algorithm being under test
-    CubDebugExit(cub::DeviceMemcpy::Batched(
-      thrust::raw_pointer_cast(d_temp_storage.data()),
-      temp_storage_bytes,
-      d_buffer_srcs,
-      d_buffer_dsts,
-      d_buffer_sizes.begin(),
-      num_buffers));
-
-    // Copy back the output buffer
-    CubDebugExit(cudaDeviceSynchronize());
-    h_gpu_results = d_out;
-
-    // CPU-side result generation for verification
-    for (BufferOffsetT i = 0; i < num_buffers; i++)
-    {
-      std::memcpy(thrust::raw_pointer_cast(h_out.data()) + h_buffer_dst_offsets[i],
-                  thrust::raw_pointer_cast(h_in.data()) + h_buffer_src_offsets[i],
-                  h_buffer_sizes[i]);
-    }
-
-    for (ByteOffsetT i = 0; i < num_total_bytes; i++)
-    {
-      if (h_gpu_results[i] != h_out[i])
-      {
-        std::cout << "Mismatch at index " << i << ", CPU vs. GPU: " << static_cast<uint16_t>(h_gpu_results[i]) << ", "
-                  << static_cast<uint16_t>(h_out[i]) << "\n";
-      }
-      AssertEquals(h_out[i], h_gpu_results[i]);
-    }
-  }
-  catch (std::bad_alloc& e)
-  {
-    (void) e;
-#ifdef DEBUG_CHECKED_ALLOC_FAILURE
-    std::cout
-      << "Skipping test 'RunTest(" //
-      << num_buffers << ", " //
-      << min_buffer_size << ", " //
-      << max_buffer_size << ", " //
-      << TestDataGenToString(input_gen) << ", " //
-      << TestDataGenToString(output_gen) << ")" //
-      << "' due to insufficient memory: " << e.what() << "\n";
-#endif // DEBUG_CHECKED_ALLOC_FAILURE
-  }
-}
-
-int main(int argc, char** argv)
-{
-  CommandLineArgs args(argc, argv);
-
-  // Initialize device
-  CubDebugExit(args.DeviceInit());
-
-  //---------------------------------------------------------------------
-  // DeviceMemcpy::Batched tests
-  //---------------------------------------------------------------------
-  // The most granular type being copied. Buffer's will be aligned and their size be an integer
-  // multiple of this type
-  using AtomicCopyT = uint8_t;
-
-  // Type used for indexing into the array of buffers
-  using BufferOffsetT = uint32_t;
-
-  // Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer
-  using BufferSizeT = uint32_t;
-
-  // Type used for indexing into bytes over *all* the buffers' sizes
-  using ByteOffsetT = uint32_t;
-
-  // Total number of bytes that are targeted to be copied on each run
-  constexpr BufferOffsetT target_copy_size = 64U << 20;
-
-  // The number of randomly
-  constexpr std::size_t num_rnd_buffer_range_tests = 32;
-
-  // Each buffer's size will be random within this interval
-  c2h::host_vector<std::pair<BufferSizeT, BufferSizeT>> buffer_size_ranges = {
-    {0, 1},
-    {1, 2},
-    {0, 16},
-    {1, 32},
-    {1, 1024},
-    {1, 32 * 1024},
-    {128 * 1024, 256 * 1024},
-    {target_copy_size, target_copy_size}};
-
-  std::mt19937 rng(0);
-  std::uniform_int_distribution<BufferSizeT> size_dist(1, 1000000);
-  for (std::size_t i = 0; i < num_rnd_buffer_range_tests; i++)
-  {
-    auto range_begin = size_dist(rng);
-    auto range_end = size_dist(rng);
-    if (range_begin > range_end)
-    {
-      std::swap(range_begin, range_end);
-    }
-    buffer_size_ranges.push_back({range_begin, range_end});
-  }
-
-  for (const auto& buffer_size_range : buffer_size_ranges)
-  {
-    BufferSizeT min_buffer_size =
-      static_cast<BufferSizeT>(CUB_ROUND_UP_NEAREST(buffer_size_range.first, sizeof(AtomicCopyT)));
-    BufferSizeT max_buffer_size = static_cast<BufferSizeT>(
-      CUB_ROUND_UP_NEAREST(buffer_size_range.second, static_cast<BufferSizeT>(sizeof(AtomicCopyT))));
-    double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
-    BufferOffsetT target_num_buffers = static_cast<BufferOffsetT>(target_copy_size / average_buffer_size);
-
-    // Run tests with input buffer being consecutive and output buffers being consecutive
-    RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(
-      target_num_buffers, min_buffer_size, max_buffer_size, TestDataGen::CONSECUTIVE, TestDataGen::CONSECUTIVE);
-
-    // Run tests with input buffer being randomly shuffled and output buffers being randomly
-    // shuffled
-    RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(
-      target_num_buffers, min_buffer_size, max_buffer_size, TestDataGen::RANDOM, TestDataGen::RANDOM);
-  }
-
-  //---------------------------------------------------------------------
-  // DeviceMemcpy::Batched test with 64-bit offsets
-  //---------------------------------------------------------------------
-  using ByteOffset64T = uint64_t;
-  using BufferSize64T = uint64_t;
-  ByteOffset64T large_target_copy_size =
-    static_cast<ByteOffset64T>(::cuda::std::numeric_limits<uint32_t>::max()) + (128ULL * 1024ULL * 1024ULL);
-  // Make sure min_buffer_size is in fact smaller than max buffer size
-  constexpr BufferOffsetT single_buffer = 1;
-
-  // Run tests with input buffer being consecutive and output buffers being consecutive
-  RunTest<AtomicCopyT, BufferOffsetT, BufferSize64T, ByteOffset64T>(
-    single_buffer, large_target_copy_size, large_target_copy_size, TestDataGen::CONSECUTIVE, TestDataGen::CONSECUTIVE);
-}
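
Reviewer note (not part of the diff): the new Catch2 test invokes the algorithm through the
DECLARE_LAUNCH_WRAPPER machinery, so a single call to memcpy_batched(...) covers the three
TEST_LAUNCH variants selected by the "// %PARAM% TEST_LAUNCH lid 0:1:2" line and hides the
two-phase temporary-storage protocol that the deleted test spelled out by hand. For reference,
a minimal standalone sketch of that protocol follows; buffer names and sizes are illustrative,
but the nullptr-query-then-invoke pattern is exactly the one visible in the deleted RunTest.

    #include <cub/device/device_memcpy.cuh>

    #include <thrust/device_vector.h>

    #include <cstdint>
    #include <vector>

    int main()
    {
      // Two non-overlapping 32-byte copies from d_in into d_out.
      thrust::device_vector<std::uint8_t> d_in(64, 7);
      thrust::device_vector<std::uint8_t> d_out(64, 0);
      const std::uint8_t* in = thrust::raw_pointer_cast(d_in.data());
      std::uint8_t* out      = thrust::raw_pointer_cast(d_out.data());

      // Per-buffer source pointers, destination pointers, and byte counts.
      std::vector<const std::uint8_t*> h_srcs{in, in + 32};
      std::vector<std::uint8_t*> h_dsts{out, out + 32};
      std::vector<std::uint32_t> h_sizes{32, 32};
      thrust::device_vector<const std::uint8_t*> d_srcs(h_srcs);
      thrust::device_vector<std::uint8_t*> d_dsts(h_dsts);
      thrust::device_vector<std::uint32_t> d_sizes(h_sizes);

      // Phase 1: pass nullptr to query the required temporary storage size.
      std::size_t temp_storage_bytes = 0;
      cub::DeviceMemcpy::Batched(
        nullptr,
        temp_storage_bytes,
        thrust::raw_pointer_cast(d_srcs.data()),
        thrust::raw_pointer_cast(d_dsts.data()),
        thrust::raw_pointer_cast(d_sizes.data()),
        2);

      // Phase 2: allocate that much storage and perform the batched copy.
      thrust::device_vector<std::uint8_t> d_temp_storage(temp_storage_bytes);
      cub::DeviceMemcpy::Batched(
        thrust::raw_pointer_cast(d_temp_storage.data()),
        temp_storage_bytes,
        thrust::raw_pointer_cast(d_srcs.data()),
        thrust::raw_pointer_cast(d_dsts.data()),
        thrust::raw_pointer_cast(d_sizes.data()),
        2);

      cudaDeviceSynchronize();
      return 0;
    }

The wrapper performs both phases (and the synchronization appropriate to the launch variant)
internally, which is why the Catch2 test can pass its iterators to memcpy_batched directly.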