Skip to content

Commit

Permalink
Rewrites DeviceMemcpy::Batched tests to use device-side data genera…
Browse files Browse the repository at this point in the history
…tion and Catch2 (#3849)

* adds catch2 batched memcpy tests

* simplifies tests

* adds test for memcpy of large buffer

* removes old memcpy test

* drops unused vars

* addresses review comments
  • Loading branch information
elstehle authored Feb 19, 2025
1 parent fd86263 commit b52534b
Show file tree
Hide file tree
Showing 2 changed files with 200 additions and 433 deletions.
200 changes: 200 additions & 0 deletions cub/test/catch2_test_memcpy_batched.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <cub/device/device_memcpy.cuh>
#include <cub/util_macro.cuh>

#include <thrust/copy.h>
#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/fill.h>
#include <thrust/gather.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/scatter.h>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>
#include <thrust/transform.h>

#include "catch2_test_launch_helper.h"
#include <c2h/catch2_test_helper.h>

// %PARAM% TEST_LAUNCH lid 0:1:2

DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, memcpy_batched);

/**
* @brief Function object that takes an offset and returns an iterator at the given
* offset relative to a fixed base iterator.
*/
template <typename IteratorT>
struct offset_to_ptr_op
{
IteratorT base_it;

template <typename T>
__host__ __device__ __forceinline__ IteratorT operator()(T offset) const
{
return base_it + offset;
}
};

/**
* @brief Used for generating a shuffled but cohesive sequence of output-buffer offsets for the
* sequence of input-buffers.
*/
template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
auto get_shuffled_buffer_offsets(const c2h::device_vector<BufferSizeT>& buffer_sizes, c2h::seed_t seed)
-> c2h::device_vector<ByteOffsetT>
{
auto const num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());

// We're remapping the i-th buffer to pmt_idxs[i]
c2h::device_vector<BufferOffsetT> pmt_idxs(num_buffers);
const auto buffer_index_it = thrust::make_counting_iterator(BufferOffsetT{0});
thrust::shuffle_copy(
buffer_index_it,
buffer_index_it + num_buffers,
pmt_idxs.begin(),
thrust::default_random_engine(static_cast<std::uint32_t>(seed.get())));

c2h::device_vector<ByteOffsetT> permuted_offsets(num_buffers);
auto permuted_buffer_sizes_it = thrust::make_permutation_iterator(buffer_sizes.begin(), pmt_idxs.begin());
thrust::exclusive_scan(permuted_buffer_sizes_it, permuted_buffer_sizes_it + num_buffers, permuted_offsets.begin());

c2h::device_vector<BufferOffsetT> scatter_idxs(num_buffers);
thrust::scatter(buffer_index_it, buffer_index_it + num_buffers, pmt_idxs.cbegin(), scatter_idxs.begin());

// Gather the permuted offsets for shuffled buffer offsets
c2h::device_vector<ByteOffsetT> buffer_offsets(num_buffers);
thrust::gather(scatter_idxs.cbegin(), scatter_idxs.cend(), permuted_offsets.cbegin(), buffer_offsets.begin());
return buffer_offsets;
}

C2H_TEST("DeviceMemcpy::Batched works", "[memcpy]")
try
{
using src_ptr_t = const uint8_t*;
using dst_ptr_t = uint8_t*;

// Type used for indexing into the array of buffers
using buffer_offset_t = uint32_t;

// Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer
using buffer_size_t = uint32_t;

// Type used for indexing into bytes over *all* the buffers' sizes
using byte_offset_t = uint32_t;

// Total number of bytes that are targeted to be copied on each run
constexpr buffer_offset_t target_copy_size = 32U << 20;

// Pairs of [min, max] buffer sizes
auto buffer_size_range = GENERATE_COPY(
table<int, int>(
{{0, 1},
{1, 2},
{0, 32},
{1, 1024},
{1, 32 * 1024},
{128 * 1024, 256 * 1024},
{target_copy_size, target_copy_size}}),
take(4,
map(
[](const std::vector<int>& chunk) {
int lhs = chunk[0];
int rhs = chunk[1];
// Optionally ensure lhs < rhs, for example:
return (lhs < rhs) ? std::make_tuple(lhs, rhs) : std::make_tuple(rhs, lhs);
},
chunk(2, random(1, 1000000)))));

const auto min_buffer_size = static_cast<buffer_size_t>(std::get<0>(buffer_size_range));
const auto max_buffer_size = static_cast<buffer_size_t>(std::get<1>(buffer_size_range));
const double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
const auto num_buffers = static_cast<buffer_offset_t>(target_copy_size / average_buffer_size);

c2h::device_vector<buffer_size_t> d_buffer_sizes(num_buffers);

// Generate the buffer sizes: Make sure buffer sizes are a multiple of the most granular unit (one AtomicT) being
// copied (round down)
c2h::gen(C2H_SEED(2), d_buffer_sizes, min_buffer_size, max_buffer_size);
byte_offset_t num_total_bytes = thrust::reduce(d_buffer_sizes.cbegin(), d_buffer_sizes.cend());

// Shuffle input buffer source-offsets
auto d_buffer_src_offsets = get_shuffled_buffer_offsets<buffer_offset_t, byte_offset_t>(d_buffer_sizes, C2H_SEED(1));

// Shuffle output buffer destination-offsets
auto d_buffer_dst_offsets = get_shuffled_buffer_offsets<buffer_offset_t, byte_offset_t>(d_buffer_sizes, C2H_SEED(1));

// Generate random input data and initialize output data
c2h::device_vector<std::uint8_t> d_in(num_total_bytes);
c2h::device_vector<std::uint8_t> d_out(num_total_bytes, 42);
c2h::gen(C2H_SEED(1), d_in);

// Prepare host-side input data for verification
c2h::host_vector<std::uint8_t> h_in(d_in);
c2h::host_vector<std::uint8_t> h_out(num_total_bytes);
c2h::host_vector<buffer_size_t> h_buffer_sizes(d_buffer_sizes);
c2h::host_vector<byte_offset_t> h_src_offsets(d_buffer_src_offsets);
c2h::host_vector<byte_offset_t> h_dst_offsets(d_buffer_dst_offsets);

// Prepare d_buffer_srcs
offset_to_ptr_op<src_ptr_t> src_transform_op{static_cast<src_ptr_t>(thrust::raw_pointer_cast(d_in.data()))};
thrust::transform_iterator<offset_to_ptr_op<src_ptr_t>, byte_offset_t*> d_buffer_srcs(
thrust::raw_pointer_cast(d_buffer_src_offsets.data()), src_transform_op);

// Prepare d_buffer_dsts
offset_to_ptr_op<dst_ptr_t> dst_transform_op{static_cast<dst_ptr_t>(thrust::raw_pointer_cast(d_out.data()))};
thrust::transform_iterator<offset_to_ptr_op<dst_ptr_t>, byte_offset_t*> d_buffer_dsts(
thrust::raw_pointer_cast(d_buffer_dst_offsets.data()), dst_transform_op);

// Invoke device-side algorithm
memcpy_batched(d_buffer_srcs, d_buffer_dsts, d_buffer_sizes.begin(), num_buffers);

// Prepare CPU-side result for verification
for (buffer_offset_t i = 0; i < num_buffers; i++)
{
std::memcpy(thrust::raw_pointer_cast(h_out.data()) + h_dst_offsets[i],
thrust::raw_pointer_cast(h_in.data()) + h_src_offsets[i],
h_buffer_sizes[i]);
}

REQUIRE(d_out == h_out);
}
catch (std::bad_alloc& e)
{
std::cerr << "Caught bad_alloc: " << e.what() << std::endl;
}

C2H_TEST("DeviceMemcpy::Batched works for a very large buffer", "[memcpy]")
try
{
using data_t = uint64_t;
using byte_offset_t = uint64_t;
using buffer_size_t = uint64_t;

byte_offset_t large_target_copy_size = static_cast<byte_offset_t>(std::numeric_limits<uint32_t>::max()) + (32 << 20);
constexpr auto data_type_size = static_cast<byte_offset_t>(sizeof(data_t));
byte_offset_t num_items = large_target_copy_size / data_type_size;
byte_offset_t num_bytes = num_items * data_type_size;
c2h::device_vector<data_t> d_in(num_items);
c2h::device_vector<data_t> d_out(num_items, 42);

auto input_data_it = thrust::make_counting_iterator(data_t{42});
thrust::copy(input_data_it, input_data_it + num_items, d_in.begin());

const auto num_buffers = 1;
auto d_buffer_srcs = thrust::make_constant_iterator(static_cast<void*>(thrust::raw_pointer_cast(d_in.data())));
auto d_buffer_dsts = thrust::make_constant_iterator(static_cast<void*>(thrust::raw_pointer_cast(d_out.data())));
auto d_buffer_sizes = thrust::make_constant_iterator(num_bytes);
memcpy_batched(d_buffer_srcs, d_buffer_dsts, d_buffer_sizes, num_buffers);

const bool all_equal = thrust::equal(d_out.cbegin(), d_out.cend(), input_data_it);
REQUIRE(all_equal == true);
}
catch (std::bad_alloc& e)
{
std::cerr << "Caught bad_alloc: " << e.what() << std::endl;
}
Loading

0 comments on commit b52534b

Please sign in to comment.