From 8118351195e3e92d8e70fda9fefc58b0bf3a4aea Mon Sep 17 00:00:00 2001 From: Nara Date: Wed, 22 Nov 2023 00:06:32 +0100 Subject: [PATCH] StreamHPC 2023-11-17 (batch memcpy) (#485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Implemented batch memcpy algorithm and relevant tests and benchmarks * Optimize match_any by using arithmetic shifts The compiler seems to see through these much better than the conditional, generating bit-field extract instructions, and recognizing that the loop is a reduction. * Pedantic / consistency changes for batch memcpy * Improve interface and implementation of align_(up|down) - Use the alignment of the destination type instead of its size - Rename to emphasize that this does a form of reinterpret_cast - Use the same type as the return type and template parameter, to match the interface of built-in casts - Pedantic: use uintptr_t instead of size_t for the numerical value of a pointer - Use clang's __builtin_align_(up|down) when available * Take parameters as explicit const-ref in test_utils::bit_equal Because these are templates this already works for non-copyable types, (as `T` will be deduced to `Type&`) but it's confusing, and wouldn't work for r-values. Because we are comparing object representations taking a copy isn't okay as that only guarantees that the value representation is copied. (I.e. padding bytes are not required to be copied when taking a parameter by copy) * Actually make custom_non(copyable|moveable)_type non (copy|move)-able * Allow passing rocprim::default_config to batch_memcpy As all the other device functions do too. * Fix typo in cast_align_down documentation * Fixup accidentally deleted constructor of custom_non_moveable_type This was accidentally deleted, it was meant to be defaulted. Currently no test calls this as batch-memcpy tests only use this type at the device side. 
* Improve error message of test_rocprim_package The error message of the package test wasn't very nice, improve it for easier debugging in the future. Before: ```console ❯ ./a.out 98 ``` After: ```console ❯ ./a.out Error hipErrorInvalidDeviceFunction(98): invalid device function in main at test_rocprim_package.cpp:90 ``` * Refactor test_utils::get_random_data into generate_random_data_n - Writes the output into an output iterator instead of creating & returning a vector. This allows greater flexibility for users i.e. writing random values with differing options into the same container. - Accepts a generator instead of a seed. This is more efficient, because creating an instance of an rng engine might be costly. It's also more consistent with how the standard library operates. - The naming and interface tries to mirror the stl (i.e. `std::generate_n`) - Backwards compatibility is maintained by adding test_utils::get_random_data that uses `generate_random_data_n` internally. * Refactor get_random_data into generate_random_data_n in benchmark_utils This mirrors the test changes in the previous commit * Unify segmented generation from test generate_random_data_n overloads * Add missing include for iterator traits to benchmark_utils * ci: use build instead of rocm-build tag This allows the build job to be performed by any runner configured for building, instead of the ROCm-specialized builder. As the target architectures are specified ahead of time, the GPU is not needed during the build process, and may be performed by any builder. * fix: Fixed doxygen warning in device_memcpy_config.hpp * Speed up / Improve data-generation in test_device_batch_memcpy Do bulk data-generation instead of individual calls, especially of individual bytes for the data to copy. Also changes the verification to do bulk memcmp instead of item-wise test_utils::bit_equals for each buffer. Overall this reduces the time it takes to run the test to ~1s from around 3s. 
* Refactor & Speedup benchmark_device_batch_memcpy - Share the data generation between the naive and uut benchmarks - Make the data-generation be bulk using a fast random number engine (mt19937) to significantly speed it up. The overall runtime of the benchmark decreased from 14 minutes (!) to around 2 minutes. * Fix explanation comment in batch_memcpy test/benchmark * fix include order in benchmark_device_batch_memcpy * doc: add batch memcpy to changelog --------- Co-authored-by: Gergely Meszaros Co-authored-by: Robin Voetter --- .gitlab-ci.yml | 14 +- CHANGELOG.md | 2 + benchmark/CMakeLists.txt | 1 + benchmark/benchmark_device_batch_memcpy.cpp | 522 ++++++++++ benchmark/benchmark_utils.hpp | 121 ++- rocprim/include/rocprim/detail/various.hpp | 56 +- .../device/detail/device_batch_memcpy.hpp | 947 ++++++++++++++++++ .../device/detail/lookback_scan_state.hpp | 28 + .../include/rocprim/device/device_memcpy.hpp | 346 +++++++ .../rocprim/device/device_memcpy_config.hpp | 81 ++ rocprim/include/rocprim/intrinsics/warp.hpp | 26 +- rocprim/include/rocprim/rocprim.hpp | 5 +- test/extra/test_rocprim_package.cpp | 23 +- test/rocprim/CMakeLists.txt | 1 + test/rocprim/test_device_batch_memcpy.cpp | 329 ++++++ test/rocprim/test_utils_assertions.hpp | 14 +- test/rocprim/test_utils_bfloat16.hpp | 4 +- test/rocprim/test_utils_custom_test_types.hpp | 69 +- test/rocprim/test_utils_data_generation.hpp | 381 +++---- test/rocprim/test_utils_half.hpp | 4 +- 20 files changed, 2666 insertions(+), 308 deletions(-) create mode 100644 benchmark/benchmark_device_batch_memcpy.cpp create mode 100644 rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp create mode 100644 rocprim/include/rocprim/device/device_memcpy.hpp create mode 100644 rocprim/include/rocprim/device/device_memcpy_config.hpp create mode 100644 test/rocprim/test_device_batch_memcpy.cpp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0229df12c..66c0092e6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -49,7 +49,7 
@@ clang-format: stage: lint needs: [] tags: - - rocm-build + - build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" @@ -66,7 +66,7 @@ copyright-date: stage: lint needs: [] tags: - - rocm-build + - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: @@ -101,7 +101,7 @@ copyright-date: .build:vcpkg-apt: stage: build tags: - - rocm-build + - build extends: - .gpus:rocm-gpus - .rules:build @@ -157,7 +157,7 @@ build:cmake-minimum-apt: .build:common: stage: build tags: - - rocm-build + - build extends: - .gpus:rocm-gpus - .rules:build @@ -207,7 +207,7 @@ build:package: stage: build needs: [] tags: - - rocm-build + - build extends: - .cmake-minimum - .gpus:rocm-gpus @@ -232,7 +232,7 @@ build:benchmark: stage: build needs: [] tags: - - rocm-build + - build extends: - .cmake-minimum - .gpus:rocm-gpus @@ -264,7 +264,7 @@ autotune:build: stage: autotune needs: [] tags: - - rocm-build + - build extends: - .cmake-minimum - .gpus:rocm-gpus diff --git a/CHANGELOG.md b/CHANGELOG.md index 693593bc7..bca4c4f82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,14 @@ Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/e ## [Unreleased rocPRIM-3.0.0 for ROCm 6.1.0] ### Added - Added new primitive: `block_run_length_decode`. + - Added new primitive: `batch_memcpy`. ### Changed - Removed deprecated functionality: `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config` and `radix_sort_config`. - Renamed `scan_config_v2` to `scan_config`, `scan_by_key_config_v2` to `scan_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`, `reduce_by_key_config_v2` to `reduce_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`. - Removed support for custom config types for device algorithms. 
- `host_warp_size()` was moved into `rocprim/device/config_types.hpp`, and now uses either a `device_id` or a `stream` parameter to query the proper device and a `device_id` out parameter. The return type is `hipError_t`. - Added support for __int128_t in `device_radix_sort` and `block_radix_sort`. +- Improved the performance of `match_any` and `block_histogram` that uses it ### Fixed - Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8087b43ff..499291a39 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -128,6 +128,7 @@ add_rocprim_benchmark(benchmark_block_scan.cpp) add_rocprim_benchmark(benchmark_block_sort.cpp) add_rocprim_benchmark(benchmark_config_dispatch.cpp) add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp) +add_rocprim_benchmark(benchmark_device_batch_memcpy.cpp) add_rocprim_benchmark(benchmark_device_binary_search.cpp) add_rocprim_benchmark(benchmark_device_histogram.cpp) add_rocprim_benchmark(benchmark_device_merge.cpp) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp new file mode 100644 index 000000000..fde851cdd --- /dev/null +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -0,0 +1,522 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark/benchmark.h" +#include "benchmark_utils.hpp" +#include "cmdparser.hpp" + +#include + +#include + +#include +#include +#include +#include +#include +#include + +constexpr uint32_t warmup_size = 5; +constexpr int32_t max_size = 1024 * 1024; +constexpr int32_t wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; +constexpr int32_t blev_min_size = rocprim::batch_memcpy_config<>::blev_size_threshold; + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) 
+// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +using offset_type = size_t; + +template +struct BatchMemcpyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData&) = delete; + + BatchMemcpyData(BatchMemcpyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchMemcpyData& operator=(BatchMemcpyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + 
d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; + +template +BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = rocprim::detail::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = rocprim::detail::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. 
+ std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = generate_random_data_n(iter, num_tlev_buffers, 1, wlev_min_elems - 1, rng); + iter = generate_random_data_n(iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); + iter = generate_random_data_n(iter, num_blev_buffers, blev_min_elems, max_elems, rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = rocprim::detail::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. 
+ std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } + else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; +} + +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(rocprim::batch_memcpy(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, 
+ data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); +} + +// Naive implementation used for comparison +#define BENCHMARK_BATCH_MEMCPY_NAIVE +#ifdef BENCHMARK_BATCH_MEMCPY_NAIVE + +template +__launch_bounds__(BlockSize) __global__ + void naive_kernel(void** in_ptr, void** out_ptr, const OffsetType* sizes) +{ + using underlying_type = unsigned char; + constexpr int32_t items_per_thread = 4; + constexpr int32_t tile_size = items_per_thread * BlockSize; + + const int32_t buffer_id = rocprim::flat_block_id(); + auto in = reinterpret_cast(in_ptr[buffer_id]); + auto out = reinterpret_cast(out_ptr[buffer_id]); + + const auto size = sizes[buffer_id]; + const auto size_in_elements = size / 
sizeof(underlying_type); + const auto tiles = size_in_elements / tile_size; + + auto num_items_to_copy = size; + + for(size_t i = 0; i < tiles; ++i) + { + underlying_type data[items_per_thread]; + rocprim::block_load_direct_blocked(rocprim::flat_block_thread_id(), + in, + data, + num_items_to_copy); + rocprim::block_store_direct_blocked(rocprim::flat_block_thread_id(), + out, + data, + num_items_to_copy); + + in += tile_size; + out += tile_size; + num_items_to_copy -= tile_size; + } +} + +template +void run_naive_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + const auto data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + naive_kernel + <<>>((void**)data.d_buffer_srcs, + (void**)data.d_buffer_dsts, + data.d_buffer_sizes); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + naive_kernel + <<>>((void**)data.d_buffer_srcs, + (void**)data.d_buffer_dsts, + data.d_buffer_sizes); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); +} + + #define CREATE_NAIVE_BENCHMARK(item_size, \ + item_alignment, \ + size_type, \ + num_tlev, \ + num_wlev, \ + num_blev) \ + 
benchmark::RegisterBenchmark( \ + bench_naming::format_name( \ + "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ + ",size_type:" #size_type ",algo:naive_memcpy,num_tlev:" #num_tlev \ + ",num_wlev:" #num_wlev ",num_blev:" #num_blev ",cfg:default_config}") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_naive_benchmark, size_type>( \ + state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ + }) + +#endif + +#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + bench_naming::format_name("{lvl:device,item_size:" #item_size \ + ",item_alignment:" #item_alignment ",size_type:" #size_type \ + ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ + ",num_blev:" #num_blev ",cfg:default_config}") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_benchmark, size_type>(state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ + }) + +#ifndef BENCHMARK_BATCH_MEMCPY_NAIVE + #define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) +#else + #define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000), \ + CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_NAIVE_BENCHMARK(item_size, 
item_alignment, uint32_t, 1000, 1000, 1000) +#endif + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + bench_naming::set_format(parser.get("name_format")); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + add_common_benchmark_info(); + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 2d0c578f4..fe6b9a077 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -115,15 +116,20 @@ struct is_valid_for_int_distribution : std::is_same::value > {}; +template +using it_value_t = typename std::iterator_traits::value_type; + using engine_type = std::default_random_engine; -// get_random_data() generates only part of sequence and replicates it, +// generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random 
sequence. -template -inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n( + OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) + -> typename std::enable_if_t>::value, OutputIter> { - engine_type gen{std::random_device{}()}; + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, @@ -132,36 +138,35 @@ inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = unsigned int>::type >::type; std::uniform_int_distribution distribution((T)min, (T)max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(it, std::min(size - i, max_random_size), it + i); } - return data; + return it + size; } -template -inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, + OutputIterator> { - engine_type gen{std::random_device{}()}; + using T = typename std::iterator_traits::value_type; + // Generate floats when T is half - using dis_type = typename std::conditional::value, float, T>::type; + using dis_type = std::conditional_t::value, float, T>; std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + 
std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(it, std::min(size - i, max_random_size), it + i); } - return data; + return it + size; } template @@ -184,7 +189,10 @@ inline std::vector get_random_data01(size_t size, float p, size_t max_random_ template inline T get_random_value(T min, T max) { - return get_random_data(1, min, max)[0]; + T result; + engine_type gen{std::random_device{}()}; + generate_random_data_n(&result, 1, min, max, gen); + return result; } template @@ -231,33 +239,62 @@ struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, OutputIterator> { + using T = it_value_t; + using first_type = typename T::first_type; using second_type = typename T::second_type; - std::vector data(size); - auto fdata = get_random_data(size, min.x, max.x, max_random_size); - auto sdata = get_random_data(size, min.y, max.y, max_random_size); + + std::vector fdata(size); + std::vector sdata(size); + generate_random_data_n(fdata.begin(), size, min.x, max.x, gen, max_random_size); + generate_random_data_n(sdata.begin(), size, min.y, max.y, gen, max_random_size); + for(size_t i = 0; i < size; i++) { - data[i] = T(fdata[i], sdata[i]); + it[i] = T(fdata[i], sdata[i]); } - return data; + return it + size; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 
1024 * 1024) - -> typename std::enable_if::value && !std::is_same::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value + && !std::is_same::value, + OutputIterator> { + using T = it_value_t; + using field_type = decltype(max.x); - std::vector data(size); - auto field_data = get_random_data(size, min.x, max.x, max_random_size); + std::vector field_data(size); + generate_random_data_n(field_data.begin(), size, min.x, max.x, gen, max_random_size); for(size_t i = 0; i < size; i++) { - data[i] = T(field_data[i]); + it[i] = T(field_data[i]); } + return it + size; +} + +template +inline std::vector + get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) +{ + std::vector data(size); + engine_type gen{std::random_device{}()}; + generate_random_data_n(data.begin(), size, min, max, gen, max_random_size); return data; } @@ -803,4 +840,10 @@ inline const char* get_block_scan_method_name(rocprim::block_scan_algorithm alg) return "unknown_algorithm"; } +template +struct alignas(Alignment) custom_aligned_type +{ + unsigned char data[Size]; +}; + #endif // ROCPRIM_BENCHMARK_UTILS_HPP_ diff --git a/rocprim/include/rocprim/detail/various.hpp b/rocprim/include/rocprim/detail/various.hpp index d82e2bce0..48203e672 100644 --- a/rocprim/include/rocprim/detail/various.hpp +++ b/rocprim/include/rocprim/detail/various.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,8 +24,9 @@ #include #include "../config.hpp" -#include "../types.hpp" +#include "../functional.hpp" #include "../type_traits.hpp" +#include "../types.hpp" #include @@ -34,6 +35,11 @@ #define __has_include(x) 0 #endif +// Check for builtins (clang-extension) and fallback +#ifndef __has_builtin + #define __has_builtin(X) 0 +#endif + #if __has_include() // version is only mandated in c++20 #include #if __cpp_lib_as_const >= 201510L @@ -338,6 +344,52 @@ ROCPRIM_HOST_DEVICE inline void for_each_in_tuple(::rocprim::tuple& t, for_each_in_tuple_impl(t, f, ::rocprim::index_sequence_for()); } +/// \brief Reinterprets the pointer as another type and increments it to match the alignment of +/// the new type. +/// +/// \tparam DstPtr Destination Type to align to +/// \tparam Src Type of source pointer +/// \param pointer The pointer to align +/// \return Aligned pointer +template +ROCPRIM_HOST_DEVICE ROCPRIM_INLINE DstPtr cast_align_up(Src* pointer) +{ + static_assert(std::is_pointer::value, "DstPtr must be a pointer type"); + using Dst = std::remove_pointer_t; +#if __has_builtin(__builtin_align_up) + return reinterpret_cast(__builtin_align_up(pointer, alignof(Dst))); +#else + // https://github.com/KabukiStarship/KabukiToolkit/wiki/Fastest-Method-to-Align-Pointers + constexpr size_t mask = alignof(Dst) - 1; + auto value = reinterpret_cast(pointer); + value += (-value) & mask; + return reinterpret_cast(value); +#endif +} + +/// \brief Reinterprets the pointer as another type and decrements it to match the alignment of +/// the new type. 
+/// +/// \tparam DstPtr Destination Type to align to +/// \tparam Src Type of source pointer +/// \param pointer The pointer to align +/// \return Aligned pointer +template +ROCPRIM_HOST_DEVICE ROCPRIM_INLINE DstPtr cast_align_down(Src* pointer) +{ + static_assert(std::is_pointer::value, "DstPtr must be a pointer type"); + using Dst = std::remove_pointer_t; +#if __has_builtin(__builtin_align_down) + return reinterpret_cast(__builtin_align_down(pointer, alignof(Dst))); +#else + // https://github.com/KabukiStarship/KabukiToolkit/wiki/Fastest-Method-to-Align-Pointers + constexpr size_t mask = ~(alignof(Dst) - 1); + auto value = reinterpret_cast(pointer); + value &= mask; + return reinterpret_cast(value); +#endif +} + } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp b/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp new file mode 100644 index 000000000..9c73c9c12 --- /dev/null +++ b/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp @@ -0,0 +1,947 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_BATCH_MEMCPY_HPP_ +#define ROCPRIM_DEVICE_DETAIL_DEVICE_BATCH_MEMCPY_HPP_ + +#include "rocprim/device/config_types.hpp" +#include "rocprim/device/detail/device_scan_common.hpp" +#include "rocprim/device/detail/lookback_scan_state.hpp" +#include "rocprim/device/device_scan.hpp" + +#include "rocprim/block/block_exchange.hpp" +#include "rocprim/block/block_load.hpp" +#include "rocprim/block/block_load_func.hpp" +#include "rocprim/block/block_run_length_decode.hpp" +#include "rocprim/block/block_scan.hpp" +#include "rocprim/block/block_store.hpp" +#include "rocprim/block/block_store_func.hpp" + +#include "rocprim/thread/thread_load.hpp" +#include "rocprim/thread/thread_search.hpp" +#include "rocprim/thread/thread_store.hpp" + +#include "rocprim/detail/temp_storage.hpp" +#include "rocprim/detail/various.hpp" +#include "rocprim/functional.hpp" +#include "rocprim/intrinsics.hpp" 
+#include "rocprim/intrinsics/thread.hpp" + +#include "rocprim/config.hpp" + +#include + +#include + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ +namespace batch_memcpy +{ +enum class size_class +{ + tlev = 0, + wlev = 1, + blev = 2, + num_size_classes, +}; + +template +struct counter +{ +private: + static constexpr int32_t num_items = static_cast(size_class::num_size_classes); + BackingUnitType data[num_items]; + +public: + ROCPRIM_DEVICE ROCPRIM_INLINE uint32_t get(size_class index) const + { + return data[static_cast(index)]; + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void add(size_class index, uint32_t value) + { + data[static_cast(index)] += value; + } + + ROCPRIM_DEVICE counter operator+(const counter& other) const + { + counter result{}; + +#pragma unroll + for(uint32_t i = 0; i < num_items; ++i) + { + result.data[i] = data[i] + other.data[i]; + } + + return result; + } +}; + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static uint8_t read_byte(void* buffer_src, Offset offset) +{ + return rocprim::thread_load( + reinterpret_cast(buffer_src) + offset); +} + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void + write_byte(void* buffer_dst, Offset offset, uint8_t value) +{ + rocprim::thread_store( + reinterpret_cast(buffer_dst) + offset, + value); +} + +template +struct aligned_ranges +{ + VectorType* out_begin; + VectorType* out_end; + + const uint8_t* in_begin; + const uint8_t* in_end; +}; + +/// \brief Gives a pair of ranges (in_* and out_*) that are contained in in_begin and +/// out_begin of a given size such that the returned out range aligns with the given vector +/// type.
+template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static aligned_ranges + get_aligned_ranges(const void* in_begin, void* out_begin, size_t num_bytes) +{ + uint8_t* out_ptr = static_cast(out_begin); + const uint8_t* in_ptr = static_cast(in_begin); + + auto* out_aligned_begin = detail::cast_align_up(out_ptr); + auto* out_aligned_end = detail::cast_align_down(out_ptr + num_bytes); + + auto begin_offset = reinterpret_cast(out_aligned_begin) - out_ptr; + auto end_offset = reinterpret_cast(out_aligned_end) - out_ptr; + const uint8_t* in_aligned_begin = in_ptr + begin_offset; + const uint8_t* in_aligned_end = in_ptr + end_offset; + + return aligned_ranges{out_aligned_begin, + out_aligned_end, + in_aligned_begin, + in_aligned_end}; +} + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static T funnel_shift_r(T lo, T hi, S shift) +{ + constexpr uint32_t bit_size = sizeof(T) * 8; + return (hi << (bit_size - shift)) | lo >> shift; +} + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void vectorized_copy_bytes(const void* input_buffer, + void* output_buffer, + Offset num_bytes, + Offset offset = 0) +{ + using vector_type = uint4; + constexpr uint32_t ints_in_vector_type = sizeof(uint4) / sizeof(uint32_t); + + constexpr auto warp_size = rocprim::device_warp_size(); + const auto rank = rocprim::detail::block_thread_id<0>() % warp_size; + + const uint8_t* src = reinterpret_cast(input_buffer) + offset; + uint8_t* dst = reinterpret_cast(output_buffer) + offset; + + const uint8_t* in_ptr = src; + uint8_t* out_ptr = dst; + + const auto aligned = batch_memcpy::get_aligned_ranges(src, dst, num_bytes); + + // If no aligned range, copy byte-by-byte and early exit + if(aligned.out_end <= aligned.out_begin) + { + for(uint32_t i = rank; i < num_bytes; i += warp_size) + { + out_ptr[i] = in_ptr[i]; + } + return; + } + + out_ptr += rank; + in_ptr += rank; + + // Ensure that all pointers are in aligned range + while(out_ptr < reinterpret_cast(aligned.out_begin)) + { + *out_ptr = *in_ptr; + 
out_ptr += warp_size; + in_ptr += warp_size; + } + + // This can be outside the while block since 'warp_size % ints_in_vector_type' always is '0' + static_assert(warp_size % ints_in_vector_type == 0, "Warp size is not a multiple of 4"); + + in_ptr = aligned.in_begin + rank * sizeof(vector_type); + const uint32_t in_offset = (reinterpret_cast(in_ptr) % ints_in_vector_type); + vector_type* aligned_out_ptr = aligned.out_begin + rank; + const uint32_t* aligned_in_ptr = reinterpret_cast(in_ptr - in_offset); + + // Copy elements in aligned range + if(in_offset == 0) + { + // No offset, can do cacheline-aligned to cacheline-aligned copy + while(aligned_out_ptr < aligned.out_end) + { + vector_type data = vector_type{aligned_in_ptr[0], + aligned_in_ptr[1], + aligned_in_ptr[2], + aligned_in_ptr[3]}; + *aligned_out_ptr = data; + aligned_in_ptr += warp_size * sizeof(vector_type) / sizeof(uint32_t); + aligned_out_ptr += warp_size; + } + } + else + { + while(aligned_out_ptr < aligned.out_end) + { + union + { + vector_type result; + uint32_t bytes[5]; + } data; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < 5; ++i) + { + data.bytes[i] = aligned_in_ptr[i]; + } + + // Reads are offset to our cache aligned writes so we need to shift bytes over. + // AMD has no intrinsic for funnel shift, hence the manual implementation. + // Perhaps a better cacheline-aligned to byte-aligned copy method can be used here.
+ const uint32_t shift = in_offset * 8 /* bits per byte */; + data.result.x = funnel_shift_r(data.bytes[0], data.bytes[1], shift); + data.result.y = funnel_shift_r(data.bytes[1], data.bytes[2], shift); + data.result.z = funnel_shift_r(data.bytes[2], data.bytes[3], shift); + data.result.w = funnel_shift_r(data.bytes[3], data.bytes[4], shift); + + *aligned_out_ptr = data.result; + aligned_in_ptr += warp_size * sizeof(vector_type) / sizeof(uint32_t); + aligned_out_ptr += warp_size; + } + } + + out_ptr = reinterpret_cast(aligned.out_end) + rank; + in_ptr = aligned.in_end + rank; + + // Copy non-aligned tail + while(out_ptr < dst + num_bytes) + { + *out_ptr = *in_ptr; + out_ptr += warp_size; + in_ptr += warp_size; + } +} +} // namespace batch_memcpy + +template +struct batch_memcpy_impl +{ + using input_buffer_type = typename std::iterator_traits::value_type; + using output_buffer_type = typename std::iterator_traits::value_type; + using buffer_size_type = typename std::iterator_traits::value_type; + + using input_type = typename std::iterator_traits::value_type; + + // top level policy + static constexpr uint32_t block_size = Config::non_blev_block_size; + static constexpr uint32_t buffers_per_thread = Config::non_blev_buffers_per_thread; + static constexpr uint32_t tlev_bytes_per_thread = Config::tlev_bytes_per_thread; + + static constexpr uint32_t blev_block_size = Config::blev_block_size; + static constexpr uint32_t blev_bytes_per_thread = Config::blev_bytes_per_thread; + + static constexpr uint32_t wlev_size_threshold = Config::wlev_size_threshold; + static constexpr uint32_t blev_size_threshold = Config::blev_size_threshold; + + static constexpr uint32_t tlev_buffers_per_thread = buffers_per_thread; + static constexpr uint32_t blev_buffers_per_thread = buffers_per_thread; + + static constexpr uint32_t buffers_per_block = buffers_per_thread * block_size; + + // Offset over buffers. + using buffer_offset_type = uint32_t; + + // Offset over tiles. 
+ using tile_offset_type = uint32_t; + + // The byte offset within a thread-level buffer. Must fit at least `wlev_size_threshold`. + static_assert(wlev_size_threshold < std::numeric_limits::max(), + "wlev_size_threshhold too large (should fit in 16 bits)"); + using tlev_byte_offset_type = + typename std::conditional<(wlev_size_threshold < 256), uint8_t, uint16_t>::type; + + struct copyable_buffers + { + InputBufferItType srcs; + OutputBufferItType dsts; + BufferSizeItType sizes; + }; + + struct copyable_blev_buffers + { + InputBufferItType srcs; + OutputBufferItType dsts; + BufferSizeItType sizes; + tile_offset_type* offsets; + }; + +private: + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static batch_memcpy::size_class + get_size_class(buffer_size_type size) + { + auto size_class = batch_memcpy::size_class::tlev; + size_class = size > wlev_size_threshold ? batch_memcpy::size_class::wlev : size_class; + size_class = size > blev_size_threshold ? batch_memcpy::size_class::blev : size_class; + return size_class; + } + + struct zipped_tlev_byte_assignment + { + buffer_offset_type tile_buffer_id; + tlev_byte_offset_type buffer_byte_offset; + }; + + struct buffer_tuple + { + tlev_byte_offset_type size; + buffer_offset_type buffer_id; + }; + + using size_class_counter = batch_memcpy::counter; + + using blev_buffer_scan_state_type = rocprim::detail::lookback_scan_state; + using blev_block_scan_state_type = rocprim::detail::lookback_scan_state; + + using block_size_scan_type = rocprim::block_scan; + using block_blev_tile_count_scan_type = rocprim::block_scan; + + using block_run_length_decode_type = rocprim::block_run_length_decode; + + using block_exchange_tlev_type + = rocprim::block_exchange; + + using buffer_load_type = rocprim::block_load; + + using blev_buffer_scan_prefix_callback_type + = rocprim::detail::offset_lookback_scan_prefix_op>; + + using blev_block_scan_prefix_callback_type + = rocprim::detail::offset_lookback_scan_prefix_op>; + + struct non_blev_memcpy + { + struct 
storage + { + buffer_tuple buffers_by_size_class[buffers_per_block]; + + // This value is passed from analysis to prepare_blev. + buffer_offset_type blev_buffer_offset; + + union shared_t + { + union analysis_t + { + typename buffer_load_type::storage_type load_storage; + typename block_size_scan_type::storage_type size_scan_storage; + typename blev_buffer_scan_prefix_callback_type::storage_type + buffer_scan_callback; + } analysis; + + struct prepare_blev_t + { + typename blev_block_scan_prefix_callback_type::storage_type block_scan_callback; + typename block_blev_tile_count_scan_type::storage_type block_scan_storage; + } prepare_blev; + + struct copy_tlev_t + { + typename block_run_length_decode_type::storage_type rld_storage; + typename block_exchange_tlev_type::storage_type block_exchange_storage; + } copy_tlev; + } shared; + }; + + using storage_type = rocprim::detail::raw_storage; + + ROCPRIM_DEVICE ROCPRIM_INLINE non_blev_memcpy() {} + + ROCPRIM_DEVICE ROCPRIM_INLINE static size_class_counter get_buffer_size_class_histogram( + const buffer_size_type (&buffer_sizes)[buffers_per_thread]) + { + size_class_counter counters{}; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < buffers_per_thread; ++i) + { + auto size_class = get_size_class(buffer_sizes[i]); + counters.add(size_class, buffer_sizes[i] > 0 ? 
1 : 0); + } + return counters; + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void + partition_buffer_by_size(const buffer_size_type (&buffer_sizes)[buffers_per_thread], + size_class_counter counters, + buffer_tuple (&buffers_by_size_class)[buffers_per_block]) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + buffer_offset_type buffer_id = flat_block_thread_id; + constexpr buffer_offset_type buffer_stride = block_size; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < buffers_per_thread; ++i, buffer_id += buffer_stride) + { + if(buffer_sizes[i] <= 0) + { + continue; + } + + const auto size_class = get_size_class(buffer_sizes[i]); + const uint32_t write_offset = counters.get(size_class); + buffers_by_size_class[write_offset] + = buffer_tuple{static_cast(buffer_sizes[i]), buffer_id}; + + counters.add(size_class, 1); + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void + prepare_blev_buffers(typename storage::shared_t::prepare_blev_t& blev_storage, + buffer_tuple* buffer_by_size_class, + copyable_buffers buffers, + buffer_offset_type num_blev_buffers, + copyable_blev_buffers blev_buffers, + buffer_offset_type tile_buffer_offset, + blev_block_scan_state_type blev_block_scan_state, + buffer_offset_type tile_id) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + tile_offset_type tile_offsets[blev_buffers_per_thread]; + auto blev_buffer_offset = flat_block_thread_id * blev_buffers_per_thread; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < blev_buffers_per_thread; ++i) + { + if(blev_buffer_offset < num_blev_buffers) + { + auto tile_buffer_id = buffer_by_size_class[blev_buffer_offset].buffer_id; + tile_offsets[i] + = rocprim::detail::ceiling_div(buffers.sizes[tile_buffer_id], + blev_block_size * blev_bytes_per_thread); + } + else + { + tile_offsets[i] = 0; + } + ++blev_buffer_offset; + } + + // Convert tile counts into tile offsets. 
+ if(tile_id == 0) + { + tile_offset_type tile_aggregate{}; + block_blev_tile_count_scan_type{}.exclusive_scan(tile_offsets, + tile_offsets, + tile_aggregate, + tile_aggregate, + blev_storage.block_scan_storage, + rocprim::plus{}); + if(flat_block_thread_id == 0) + { + blev_block_scan_state.set_complete(0, tile_aggregate); + } + } + else + { + blev_block_scan_prefix_callback_type blev_tile_prefix_op{ + tile_id, + blev_block_scan_state, + blev_storage.block_scan_callback, + rocprim::plus{}}; + block_blev_tile_count_scan_type{}.exclusive_scan(tile_offsets, + tile_offsets, + blev_storage.block_scan_storage, + blev_tile_prefix_op, + rocprim::plus{}); + } + rocprim::syncthreads(); + + blev_buffer_offset = flat_block_thread_id * blev_buffers_per_thread; + + // For each buffer this thread processes... + ROCPRIM_UNROLL + for(uint32_t i = 0; i < blev_buffers_per_thread; ++i, ++blev_buffer_offset) + { + if(blev_buffer_offset >= num_blev_buffers) + { + continue; + } + + // If this thread has any blev buffers to process... 
+ const auto tile_buffer_id = buffer_by_size_class[blev_buffer_offset].buffer_id; + const auto blev_index = tile_buffer_offset + blev_buffer_offset; + + blev_buffers.srcs[blev_index] = buffers.srcs[tile_buffer_id]; + blev_buffers.dsts[blev_index] = buffers.dsts[tile_buffer_id]; + blev_buffers.sizes[blev_index] = buffers.sizes[tile_buffer_id]; + blev_buffers.offsets[blev_index] = tile_offsets[i]; + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void copy_wlev_buffers(buffer_tuple* buffers_by_size_class, + copyable_buffers tile_buffers, + buffer_offset_type num_wlev_buffers) + { + const uint32_t warp_id = rocprim::warp_id(); + const uint32_t warps_per_block + = rocprim::flat_block_size() / rocprim::device_warp_size(); + + for(buffer_offset_type buffer_offset = warp_id; buffer_offset < num_wlev_buffers; + buffer_offset += warps_per_block) + { + const auto buffer_id = buffers_by_size_class[buffer_offset].buffer_id; + + batch_memcpy::vectorized_copy_bytes(tile_buffers.srcs[buffer_id], + tile_buffers.dsts[buffer_id], + tile_buffers.sizes[buffer_id]); + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void + copy_tlev_buffers(typename storage::shared_t::copy_tlev_t& tlev_storage, + buffer_tuple* buffers_by_size_class, + copyable_buffers tile_buffers, + buffer_offset_type num_tlev_buffers) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + buffer_offset_type tlev_buffer_ids[tlev_buffers_per_thread]; + tlev_byte_offset_type tlev_buffer_sizes[tlev_buffers_per_thread]; + + static_assert( + tlev_buffers_per_thread >= buffers_per_thread, + "Unsupported configuration: The number of 'thread-level buffers' must be at " + "least as large as the number of overall buffers being processed by each " + "thread."); + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < tlev_buffers_per_thread; ++i) + { + tlev_buffer_sizes[i] = 0; + } + + uint32_t tlev_buffer_offset = flat_block_thread_id * tlev_buffers_per_thread; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < 
tlev_buffers_per_thread; ++i) + { + if(tlev_buffer_offset < num_tlev_buffers) + { + const auto buffer_info = buffers_by_size_class[tlev_buffer_offset]; + + tlev_buffer_ids[i] = buffer_info.buffer_id; + tlev_buffer_sizes[i] = buffer_info.size; + } + ++tlev_buffer_offset; + } + + // Total number of bytes in this block. + uint32_t num_total_tlev_bytes = 0; + + block_run_length_decode_type block_run_length_decode{tlev_storage.rld_storage, + tlev_buffer_ids, + tlev_buffer_sizes, + num_total_tlev_bytes}; + + // Run-length decode the buffers' sizes into a window buffer of limited size. This is repeated + // until we were able to cover all the bytes of TLEV buffers + uint32_t decoded_window_offset = 0; + while(decoded_window_offset < num_total_tlev_bytes) + { + buffer_offset_type buffer_id[tlev_bytes_per_thread]; + tlev_byte_offset_type buffer_byte_offset[tlev_bytes_per_thread]; + + // Now we have a balanced assignment: buffer_id[i] will hold the tile's buffer id and + // buffer_byte_offset[i] that buffer's byte that this thread supposed to copy + block_run_length_decode.run_length_decode(buffer_id, + buffer_byte_offset, + decoded_window_offset); + + // Zip from SoA to AoS + zipped_tlev_byte_assignment zipped_byte_assignment[tlev_bytes_per_thread]; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < tlev_bytes_per_thread; ++i) + { + zipped_byte_assignment[i] + = zipped_tlev_byte_assignment{buffer_id[i], buffer_byte_offset[i]}; + } + + // Exchange from blocked to striped arrangement for coalesced memory reads and writes + block_exchange_tlev_type{}.blocked_to_striped(zipped_byte_assignment, + zipped_byte_assignment, + tlev_storage.block_exchange_storage); + + // Read in the bytes that this thread is assigned to + constexpr auto window_size = tlev_bytes_per_thread * block_size; + + const bool is_full_window + = decoded_window_offset + window_size < num_total_tlev_bytes; + + if(is_full_window) + { + uint8_t src_byte[tlev_bytes_per_thread]; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i 
< tlev_bytes_per_thread; ++i) + { + src_byte[i] = batch_memcpy::read_byte( + tile_buffers.srcs[zipped_byte_assignment[i].tile_buffer_id], + zipped_byte_assignment[i].buffer_byte_offset); + } + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < tlev_bytes_per_thread; ++i) + { + batch_memcpy::write_byte( + tile_buffers.dsts[zipped_byte_assignment[i].tile_buffer_id], + zipped_byte_assignment[i].buffer_byte_offset, + src_byte[i]); + } + } + else + { + // Read in the bytes that this thread is assigned to + uint32_t absolute_tlev_byte_offset + = decoded_window_offset + flat_block_thread_id; + for(uint32_t i = 0; i < tlev_bytes_per_thread; ++i) + { + if(absolute_tlev_byte_offset < num_total_tlev_bytes) + { + const auto buffer_id = zipped_byte_assignment[i].tile_buffer_id; + const auto buffer_offset = zipped_byte_assignment[i].buffer_byte_offset; + + const auto src_byte + = batch_memcpy::read_byte(tile_buffers.srcs[buffer_id], + buffer_offset); + batch_memcpy::write_byte(tile_buffers.dsts[buffer_id], + buffer_offset, + src_byte); + } + absolute_tlev_byte_offset += block_size; + } + } + + decoded_window_offset += window_size; + + // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused + // with next iteration + rocprim::syncthreads(); + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void copy(storage& temp_storage, + copyable_buffers buffers, + uint32_t num_buffers, + copyable_blev_buffers blev_buffers, + blev_buffer_scan_state_type blev_buffer_scan_state, + blev_block_scan_state_type blev_block_scan_state, + const buffer_offset_type tile_id) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + // Offset into this tile's buffers + const buffer_offset_type buffer_offset = tile_id * buffers_per_block; + + // Indicates whether all of this tile's items are within bounds + bool is_full_tile = buffer_offset + buffers_per_block < num_buffers; + + // Load the buffer sizes of this tile's buffers + auto tile_buffer_sizes = 
buffers.sizes + buffer_offset; + + // Sizes of the buffers this thread should work on. + buffer_size_type buffer_sizes[buffers_per_thread]; + if(is_full_tile) + { + buffer_load_type{}.load(tile_buffer_sizes, + buffer_sizes, + temp_storage.shared.analysis.load_storage); + } + else + { + buffer_load_type{}.load(tile_buffer_sizes, + buffer_sizes, + num_buffers - buffer_offset, + 0, + temp_storage.shared.analysis.load_storage); + } + + // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids + rocprim::syncthreads(); + + // Count how many buffers fall into each size-class + auto size_class_histogram = get_buffer_size_class_histogram(buffer_sizes); + + // Prefix sum over size_class_histogram. + size_class_counter size_class_agg{}; + block_size_scan_type{}.exclusive_scan(size_class_histogram /* input */, + size_class_histogram /* output */, + size_class_counter{} /* initial */, + size_class_agg /* aggregate */, + temp_storage.shared.analysis.size_scan_storage, + rocprim::plus{}); + + rocprim::syncthreads(); + + uint32_t buffer_count = 0; + + // Factor in the per-size-class counts / offsets + // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset + // has to be offset by the TLEV+WLEV buffer count + for(const auto size_class : {batch_memcpy::size_class::tlev, + batch_memcpy::size_class::wlev, + batch_memcpy::size_class::blev}) + { + size_class_histogram.add(size_class, buffer_count); + buffer_count += size_class_agg.get(size_class); + } + + // Signal the number of BLEV buffers we're planning to write out + // Aggregate the count of blev buffers across threads. 
+ buffer_offset_type buffer_exclusive_prefix{}; + if(tile_id == 0) + { + if(flat_block_thread_id == 0) + { + blev_buffer_scan_state.set_complete( + tile_id, + size_class_agg.get(batch_memcpy::size_class::blev)); + } + buffer_exclusive_prefix = 0; + } + else + { + blev_buffer_scan_prefix_callback_type blev_buffer_prefix_op{ + tile_id, + blev_buffer_scan_state, + temp_storage.shared.analysis.buffer_scan_callback, + rocprim::plus{}}; + + buffer_exclusive_prefix + = blev_buffer_prefix_op(size_class_agg.get(batch_memcpy::size_class::blev)); + } + if(flat_block_thread_id == 0) + { + temp_storage.blev_buffer_offset = buffer_exclusive_prefix; + } + + rocprim::syncthreads(); + + // Write partitions to shared memory. + partition_buffer_by_size(buffer_sizes, + size_class_histogram, + temp_storage.buffers_by_size_class); + rocprim::syncthreads(); + + // Get buffers for this tile. + copyable_buffers tile_buffer = copyable_buffers{ + buffers.srcs + buffer_offset, + buffers.dsts + buffer_offset, + buffers.sizes + buffer_offset, + }; + + auto num_blev_buffers = size_class_agg.get(batch_memcpy::size_class::blev); + auto num_wlev_buffers = size_class_agg.get(batch_memcpy::size_class::wlev); + auto num_tlev_buffers = size_class_agg.get(batch_memcpy::size_class::tlev); + + // BLEV buffers are copied in a separate kernel. We need to prepare global memory + // to pass what needs to be copied to that kernel.
+ prepare_blev_buffers( + temp_storage.shared.prepare_blev, + &temp_storage + .buffers_by_size_class[size_class_agg.get(batch_memcpy::size_class::tlev) + + size_class_agg.get(batch_memcpy::size_class::wlev)], + tile_buffer, + num_blev_buffers, + blev_buffers, + temp_storage.blev_buffer_offset, + blev_block_scan_state, + tile_id); + + rocprim::syncthreads(); + + copy_wlev_buffers( + &temp_storage + .buffers_by_size_class[size_class_agg.get(batch_memcpy::size_class::tlev)], + tile_buffer, + num_wlev_buffers); + + copy_tlev_buffers(temp_storage.shared.copy_tlev, + temp_storage.buffers_by_size_class, + tile_buffer, + num_tlev_buffers); + } + }; + +public: + __global__ static void init_tile_state_kernel(blev_buffer_scan_state_type buffer_scan_state, + blev_block_scan_state_type block_scan_state, + tile_offset_type num_tiles) + { + const uint32_t block_id = rocprim::detail::block_id<0>(); + const uint32_t block_size = rocprim::detail::block_size<0>(); + const uint32_t block_thread_id = rocprim::detail::block_thread_id<0>(); + const uint32_t flat_thread_id = (block_id * block_size) + block_thread_id; + + buffer_scan_state.initialize_prefix(flat_thread_id, num_tiles); + + block_scan_state.initialize_prefix(flat_thread_id, num_tiles); + } + + __global__ static void + non_blev_memcpy_kernel(copyable_buffers buffers, + buffer_offset_type num_buffers, + copyable_blev_buffers blev_buffers, + blev_buffer_scan_state_type blev_buffer_scan_state, + blev_block_scan_state_type blev_block_scan_state) + { + ROCPRIM_SHARED_MEMORY typename non_blev_memcpy::storage_type temp_storage; + non_blev_memcpy{}.copy(temp_storage.get(), + buffers, + num_buffers, + blev_buffers, + blev_buffer_scan_state, + blev_block_scan_state, + rocprim::flat_block_id()); + } + + __global__ static void blev_memcpy_kernel(copyable_blev_buffers blev_buffers, + blev_buffer_scan_state_type buffer_offset_tile, + tile_offset_type last_tile_offset) + { + const auto flat_block_thread_id = 
::rocprim::detail::block_thread_id<0>(); + const auto flat_block_id = ::rocprim::detail::block_id<0>(); + const auto flat_grid_size = ::rocprim::detail::grid_size<0>(); + + constexpr auto blev_tile_size = blev_block_size * blev_bytes_per_thread; + const auto num_blev_buffers = buffer_offset_tile.get_complete_value(last_tile_offset); + + if(num_blev_buffers == 0) + { + return; + } + + uint32_t tile_id = flat_block_id; + while(true) + { + __shared__ buffer_offset_type shared_buffer_id; + + rocprim::syncthreads(); + + if(flat_block_thread_id == 0) + { + shared_buffer_id + = rocprim::upper_bound(blev_buffers.offsets, num_blev_buffers, tile_id) - 1; + } + + rocprim::syncthreads(); + + const buffer_offset_type buffer_id = shared_buffer_id; + + // The relative offset of this tile within the buffer it's assigned to + const buffer_size_type tile_offset_within_buffer + = (tile_id - blev_buffers.offsets[buffer_id]) * blev_tile_size; + + // If the tile has already reached beyond the work of the end of the last buffer + if(buffer_id >= num_blev_buffers - 1 + && tile_offset_within_buffer > blev_buffers.sizes[buffer_id]) + { + return; + } + + // Tiny remainders are copied without vectorizing loads + if(blev_buffers.sizes[buffer_id] - tile_offset_within_buffer <= 32) + { + buffer_size_type thread_offset = tile_offset_within_buffer + flat_block_thread_id; + for(uint32_t i = 0; i < blev_buffers_per_thread; + ++i, thread_offset += blev_block_size) + { + if(thread_offset < blev_buffers.sizes[buffer_id]) + { + uint8_t item + = batch_memcpy::read_byte(blev_buffers.srcs[buffer_id], thread_offset); + batch_memcpy::write_byte(blev_buffers.dsts[buffer_id], thread_offset, item); + } + } + tile_id += flat_grid_size; + continue; + } + + const buffer_size_type items_to_copy + = rocprim::min(static_cast(blev_buffers.sizes[buffer_id] + - tile_offset_within_buffer), + static_cast(blev_tile_size)); + + batch_memcpy::vectorized_copy_bytes(blev_buffers.srcs[buffer_id], + blev_buffers.dsts[buffer_id], 
+ items_to_copy, + tile_offset_within_buffer); + + tile_id += flat_grid_size; + } + } +}; + +} // namespace detail + +END_ROCPRIM_NAMESPACE + +#endif diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index 9287e5b21..3bb37fda4 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -36,6 +36,7 @@ #include "../../detail/various.hpp" #include "../config_types.hpp" +#include "rocprim/config.hpp" extern "C" { @@ -221,6 +222,23 @@ struct lookback_scan_state value = prefix.value; } + /// \brief Gets the prefix value for a block. Should only be called after all + /// blocks/prefixes are completed. + ROCPRIM_DEVICE ROCPRIM_INLINE T get_complete_value(const unsigned int block_id) + { + constexpr unsigned int padding = ::rocprim::device_warp_size(); + + auto p = prefixes[padding + block_id]; + prefix_type prefix{}; +#ifndef __HIP_CPU_RT__ + __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); +#else + std::memcpy(&prefix, &p, sizeof(prefix_type)); +#endif + assert(prefix.flag == PREFIX_COMPLETE); + return prefix.value; + } + private: ROCPRIM_DEVICE ROCPRIM_INLINE void set(const unsigned int block_id, const flag_type flag, const T value) @@ -368,6 +386,16 @@ struct lookback_scan_state value = prefixes_complete_values[padding + block_id]; } + /// \brief Gets the prefix value for a block. Should only be called after all + /// blocks/prefixes are completed. 
+ ROCPRIM_DEVICE ROCPRIM_INLINE T get_complete_value(const unsigned int block_id) + { + constexpr unsigned int padding = ::rocprim::device_warp_size(); + + assert(prefixes_flags[padding + block_id] == PREFIX_COMPLETE); + return prefixes_complete_values[padding + block_id]; + } + private: flag_type * prefixes_flags; // We need to separate arrays for partial and final prefixes, because diff --git a/rocprim/include/rocprim/device/device_memcpy.hpp b/rocprim/include/rocprim/device/device_memcpy.hpp new file mode 100644 index 000000000..e76a11e54 --- /dev/null +++ b/rocprim/include/rocprim/device/device_memcpy.hpp @@ -0,0 +1,346 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ +#define ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ + +#include "../config.hpp" +#include "../functional.hpp" + +#include "config_types.hpp" + +#include "detail/device_batch_memcpy.hpp" +#include "device_memcpy_config.hpp" +#include "rocprim/device/detail/device_config_helper.hpp" + +BEGIN_ROCPRIM_NAMESPACE + +/// \brief Copy `sizes[i]` bytes from `sources[i]` to `destinations[i]` for all `i` in the range [0, `num_copies`]. +/// +/// \tparam Config [optional] configuration of the primitive. It has to be \p batch_memcpy_config . +/// \tparam InputBufferItType type of iterator to source pointers. +/// \tparam OutputBufferItType type of iterator to desetination pointers. +/// \tparam BufferSizeItType type of iterator to sizes. +/// +/// \param [in] temporary_storage pointer to device-accessible temporary storage. +/// When a null pointer is passed, the required allocation size in bytes is written to +/// `storage_size` and the function returns without performing the copy. +/// \param [in, out] storage_size reference to the size in bytes of `temporary_storage`. +/// \param [in] sources iterator of source pointers. +/// \param [in] destinations iterator of destination pointers. +/// \param [in] sizes iterator of range sizes to copy. +/// \param [in] num_copies number of ranges to copy +/// \param [in] stream [optional] HIP stream object to enqueue the copy on. Default is `hipStreamDefault`. +/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel +/// launch is forced in order to check for errors. The default value is `false`. +/// +/// Performs multiple device to device memory copies as a single batched operation. 
+/// Roughly equivalent to +/// \code{.cpp} +/// for (auto i = 0; i < num_copies; ++i) { +/// char* src = sources[i]; +/// char* dst = destinations[i]; +/// auto size = sizes[i]; +/// hipMemcpyAsync(dst, src, size, hipMemcpyDeviceToDevice, stream); +/// } +/// \endcode +/// except executed on the device in parallel. +/// Note that sources and destinations do not have to be part of the same array. I.e. you can copy +/// from both array A and B to array C and D with a single call to this function. +/// Source ranges are allowed to overlap, +/// however, destinations overlapping with either other destinations or with sources is not allowed, +/// and will result in undefined behaviour. +/// +/// \par Example +/// \parblock +/// In this example multiple sections of data are copied from \p a to \p b . +/// +/// \code{.cpp} +/// #include +ROCPRIM_INLINE static hipError_t batch_memcpy(void* temporary_storage, + size_t& storage_size, + InputBufferItType sources, + OutputBufferItType destinations, + BufferSizeItType sizes, + uint32_t num_copies, + hipStream_t stream = hipStreamDefault, + bool debug_synchronous = false) +{ + using Config = detail::default_or_custom_config>; + + static_assert(Config::wlev_size_threshold < Config::blev_size_threshold, + "wlev_size_threshold should be smaller than blev_size_threshold"); + + using BufferOffsetType = unsigned int; + using BlockOffsetType = unsigned int; + + hipError_t error = hipSuccess; + + using batch_memcpy_impl_type = detail:: + batch_memcpy_impl; + + static constexpr uint32_t non_blev_block_size = Config::non_blev_block_size; + static constexpr uint32_t non_blev_buffers_per_thread = Config::non_blev_buffers_per_thread; + static constexpr uint32_t blev_block_size = Config::blev_block_size; + + constexpr uint32_t buffers_per_block = non_blev_block_size * non_blev_buffers_per_thread; + const uint32_t num_blocks = rocprim::detail::ceiling_div(num_copies, buffers_per_block); + + using scan_state_buffer_type = 
rocprim::detail::lookback_scan_state; + using scan_state_block_type = rocprim::detail::lookback_scan_state; + + // Pack buffers + typename batch_memcpy_impl_type::copyable_buffers const buffers{ + sources, + destinations, + sizes, + }; + + detail::temp_storage::layout scan_state_buffer_layout{}; + error = scan_state_buffer_type::get_temp_storage_layout(num_blocks, + stream, + scan_state_buffer_layout); + if(error != hipSuccess) + { + return error; + } + + detail::temp_storage::layout blev_block_scan_state_layout{}; + error = scan_state_block_type::get_temp_storage_layout(num_blocks, + stream, + blev_block_scan_state_layout); + if(error != hipSuccess) + { + return error; + } + + uint8_t* blev_buffer_scan_data; + uint8_t* blev_block_scan_state_data; + + // The non-blev kernel will prepare blev copy. Communication between the two + // kernels is done via `blev_buffers`. + typename batch_memcpy_impl_type::copyable_blev_buffers blev_buffers{}; + + // Partition `d_temp_storage`. + // If `d_temp_storage` is null, calculate the allocation size instead. + error = detail::temp_storage::partition( + temporary_storage, + storage_size, + detail::temp_storage::make_linear_partition( + detail::temp_storage::ptr_aligned_array(&blev_buffers.srcs, num_copies), + detail::temp_storage::ptr_aligned_array(&blev_buffers.dsts, num_copies), + detail::temp_storage::ptr_aligned_array(&blev_buffers.sizes, num_copies), + detail::temp_storage::ptr_aligned_array(&blev_buffers.offsets, num_copies), + detail::temp_storage::make_partition(&blev_buffer_scan_data, scan_state_buffer_layout), + detail::temp_storage::make_partition(&blev_block_scan_state_data, + blev_block_scan_state_layout))); + + // If allocation failed, return error. + if(error != hipSuccess) + { + return error; + } + + // Return the storage size. + if(temporary_storage == nullptr) + { + return hipSuccess; + } + + // Compute launch parameters. 
+ + int device_id = hipGetStreamDeviceId(stream); + + // Get the number of multiprocessors + int multiprocessor_count{}; + error = hipDeviceGetAttribute(&multiprocessor_count, + hipDeviceAttributeMultiprocessorCount, + device_id); + if(error != hipSuccess) + { + return error; + } + + // `hipOccupancyMaxActiveBlocksPerMultiprocessor` uses the default device. + // We need to perserve the current default device id while we change it temporarily + // to get the max occupancy on this stream. + int previous_device; + error = hipGetDevice(&previous_device); + if(error != hipSuccess) + { + return error; + } + + error = hipSetDevice(device_id); + if(error != hipSuccess) + { + return error; + } + + int blev_occupancy{}; + error = hipOccupancyMaxActiveBlocksPerMultiprocessor(&blev_occupancy, + batch_memcpy_impl_type::blev_memcpy_kernel, + blev_block_size, + 0 /* dynSharedMemPerBlk */); + if(error != hipSuccess) + { + return error; + } + + // Restore the default device id to initial state + error = hipSetDevice(previous_device); + if(error != hipSuccess) + { + return error; + } + + constexpr BlockOffsetType init_kernel_threads = 128; + const BlockOffsetType init_kernel_grid_size + = rocprim::detail::ceiling_div(num_blocks, init_kernel_threads); + + auto batch_memcpy_blev_grid_size + = multiprocessor_count * blev_occupancy * 1 /* subscription factor */; + + BlockOffsetType batch_memcpy_grid_size = num_blocks; + + // Prepare init_scan_states_kernel. + scan_state_buffer_type scan_state_buffer{}; + error = scan_state_buffer_type::create(scan_state_buffer, + blev_buffer_scan_data, + num_blocks, + stream); + if(error != hipSuccess) + { + return error; + } + + scan_state_block_type scan_state_block{}; + error = scan_state_block_type::create(scan_state_block, + blev_block_scan_state_data, + num_blocks, + stream); + if(error != hipSuccess) + { + return error; + } + + // Launch init_scan_states_kernel. 
+ batch_memcpy_impl_type:: + init_tile_state_kernel<<>>( + scan_state_buffer, + scan_state_block, + num_blocks); + error = hipGetLastError(); + if(error != hipSuccess) + { + return error; + } + if(debug_synchronous) + { + hipStreamSynchronize(stream); + } + + // Launch batch_memcpy_non_blev_kernel. + batch_memcpy_impl_type:: + non_blev_memcpy_kernel<<>>( + buffers, + num_copies, + blev_buffers, + scan_state_buffer, + scan_state_block); + error = hipGetLastError(); + if(error != hipSuccess) + { + return error; + } + if(debug_synchronous) + { + hipStreamSynchronize(stream); + } + + // Launch batch_memcpy_blev_kernel. + batch_memcpy_impl_type:: + blev_memcpy_kernel<<>>( + blev_buffers, + scan_state_buffer, + batch_memcpy_grid_size - 1); + error = hipGetLastError(); + if(error != hipSuccess) + { + return error; + } + if(debug_synchronous) + { + hipStreamSynchronize(stream); + } + + return hipSuccess; +} + +END_ROCPRIM_NAMESPACE + +#endif diff --git a/rocprim/include/rocprim/device/device_memcpy_config.hpp b/rocprim/include/rocprim/device/device_memcpy_config.hpp new file mode 100644 index 000000000..272b134fc --- /dev/null +++ b/rocprim/include/rocprim/device/device_memcpy_config.hpp @@ -0,0 +1,81 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DEVICE_MEMCPY_CONFIG_HPP_ +#define ROCPRIM_DEVICE_DEVICE_MEMCPY_CONFIG_HPP_ + +#include "config_types.hpp" +#include "detail/device_config_helper.hpp" + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +/// \brief +/// +/// \tparam NonBlevBlockSize - number of threads per block for thread- and warp-level copy. +/// \tparam NonBlevBuffersPerThreaed - number of buffers processed per thread. +/// \tparam TlevBytesPerThread - number of bytes per thread for thread-level copy. +/// \tparam BlevBlockSize - number of thread per block for block-level copy. +/// \tparam BlevBytesPerThread - number of bytes per thread for block-level copy. +/// \tparam WlevSizeThreshold - minimum size to use warp-level copy instead of thread-level. +/// \tparam BlevSizeThreshold - minimum size to use block-level copy instead of warp-level. 
+template +struct batch_memcpy_config +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS + + /// \brief Number of threads per block for thread- and warp-level copy + static constexpr unsigned int non_blev_block_size = NonBlevBlockSize; + + /// \brief Number of buffers processed per thread + static constexpr unsigned int non_blev_buffers_per_thread = NonBlevBuffersPerThreaed; + + /// \brief Number of bytes per thread for thread-level copy + static constexpr unsigned int tlev_bytes_per_thread = TlevBytesPerThread; + + /// \brief Number of thread per block for block-level copy + static constexpr unsigned int blev_block_size = BlevBlockSize; + + /// \brief Number of bytes per thread for block-level copy + static constexpr unsigned int blev_bytes_per_thread = BlevBytesPerThread; + + /// \brief Minimum size to use warp-level copy instead of thread-level + static constexpr unsigned int wlev_size_threshold = WlevSizeThreshold; + + /// \brief Minimum size to use block-level copy instead of warp-level + static constexpr unsigned int blev_size_threshold = BlevSizeThreshold; + +#endif +}; + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 7a25d3cc9..ba192af87 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -24,6 +24,8 @@ #include "../config.hpp" #include "../types.hpp" +#include + BEGIN_ROCPRIM_NAMESPACE /// \addtogroup intrinsicsmodule @@ -142,11 +144,25 @@ ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label, bool ROCPRIM_UNROLL for(unsigned int bit = 0; bit < LabelBits; ++bit) { - const auto bit_set = label & (1u << bit); - // Create mask of threads which have the same bit set or unset. - const auto same_mask = ballot(bit_set); - // Remove bits which do not match from the peer mask. - peer_mask &= (bit_set ? 
same_mask : ~same_mask); + static constexpr int lane_width = std::numeric_limits::digits; + using lane_mask_type_s = std::make_signed_t; + const auto label_signed = static_cast(label); + + // Get all zeros or all ones depending on label's i-th bit. + // Moves the bit into the sign position by left shifting, then shifts it into all the bits + // by (arithmetic) right shift which does sign-extension. + const lane_mask_type_s bit_set + = (label_signed << (lane_width - 1 - bit)) >> (lane_width - 1); + + // Remove all lanes from the mask with a bit that differs from ours + // - if we have the bit set we keep the lanes that do too so we mask with the result + // of the ballot + // - if we don't have it, then we keep the lanes that also don't, so we flip all bits + // in the mask before and-ing. + // since bit_set is all ones if we have the bit and all zeros if not, the flipping is + // the same as xor-ing with its inverse + const lane_mask_type bit_set_mask = ballot(bit_set); + peer_mask &= bit_set_mask ^ ~bit_set; } return -lane_mask_type{valid} & peer_mask; diff --git a/rocprim/include/rocprim/rocprim.hpp b/rocprim/include/rocprim/rocprim.hpp index 6a2ecabf8..5b3dd0e1d 100644 --- a/rocprim/include/rocprim/rocprim.hpp +++ b/rocprim/include/rocprim/rocprim.hpp @@ -53,15 +53,16 @@ #include "device/device_adjacent_difference.hpp" #include "device/device_binary_search.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" -#include "device/device_reduce_by_key.hpp" #include "device/device_reduce.hpp" +#include "device/device_reduce_by_key.hpp" #include "device/device_run_length_encode.hpp" -#include "device/device_scan_by_key.hpp" #include "device/device_scan.hpp" +#include "device/device_scan_by_key.hpp" #include "device/device_segmented_radix_sort.hpp" #include 
"device/device_segmented_reduce.hpp" #include "device/device_segmented_scan.hpp" diff --git a/test/extra/test_rocprim_package.cpp b/test/extra/test_rocprim_package.cpp index d0a2da3f8..6b1fa0703 100644 --- a/test/extra/test_rocprim_package.cpp +++ b/test/extra/test_rocprim_package.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -26,14 +26,19 @@ #include #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << error << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + do \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "Error " << hipGetErrorName(error) << '(' << error << ')' << ": " \ + << hipGetErrorString(error) << " in " << __func__ << " at " << __FILE__ \ + << ':' << __LINE__ << '\n'; \ + exit(error); \ + } \ + } \ + while(false) int main(int, char**) { diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 8db2d5c5b..3fbc675ce 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -241,6 +241,7 @@ add_rocprim_test("rocprim.block_sort_bitonic" test_block_sort_bitonic.cpp) add_rocprim_test("rocprim.config_dispatch" test_config_dispatch.cpp) add_rocprim_test("rocprim.constant_iterator" test_constant_iterator.cpp) add_rocprim_test("rocprim.counting_iterator" test_counting_iterator.cpp) +add_rocprim_test("rocprim.device_batch_memcpy" test_device_batch_memcpy.cpp) add_rocprim_test("rocprim.device_binary_search" test_device_binary_search.cpp) add_rocprim_test("rocprim.device_adjacent_difference" test_device_adjacent_difference.cpp) add_rocprim_test("rocprim.device_histogram" test_device_histogram.cpp) 
diff --git a/test/rocprim/test_device_batch_memcpy.cpp b/test/rocprim/test_device_batch_memcpy.cpp new file mode 100644 index 000000000..64986a796 --- /dev/null +++ b/test/rocprim/test_device_batch_memcpy.cpp @@ -0,0 +1,329 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "common_test_header.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "rocprim/detail/various.hpp" +#include "rocprim/device/device_memcpy.hpp" +#include "rocprim/intrinsics/thread.hpp" + +#include +#include + +#include +#include +#include +#include + +#include + +#include + +template +struct DeviceBatchMemcpyParams +{ + using value_type = ValueType; + using size_type = SizeType; + static constexpr bool shuffled = Shuffled; + static constexpr uint32_t num_buffers = NumBuffers; + static constexpr uint32_t max_size = MaxSize; +}; + +template +struct DeviceBatchMemcpyTests : public ::testing::Test +{ + using value_type = typename Params::value_type; + using size_type = typename Params::size_type; + static constexpr bool shuffled = Params::shuffled; + static constexpr uint32_t num_buffers = Params::num_buffers; + static constexpr uint32_t max_size = Params::max_size; +}; + +typedef ::testing::Types< + // Ignore copy/move + DeviceBatchMemcpyParams, uint32_t, false>, + DeviceBatchMemcpyParams, uint32_t, false>, + DeviceBatchMemcpyParams, uint32_t, false>, + + // Unshuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // weird amount of buffers + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // Shuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams> + DeviceBatchMemcpyTestsParams; + +TYPED_TEST_SUITE(DeviceBatchMemcpyTests, DeviceBatchMemcpyTestsParams); + +// Used for generating 
offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const size_t n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + T sum = 0; + for(size_t i = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) +{ + using value_type = typename TestFixture::value_type; + using buffer_size_type = typename TestFixture::size_type; + using buffer_offset_type = uint32_t; + using byte_offset_type = size_t; + + constexpr int32_t num_buffers = TestFixture::num_buffers; + constexpr int32_t max_size = TestFixture::max_size; + constexpr bool shuffled = TestFixture::shuffled; + + constexpr int32_t wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; + constexpr int32_t blev_min_size = 
rocprim::batch_memcpy_config<>::blev_size_threshold; + + constexpr int32_t wlev_min_elems + = rocprim::detail::ceiling_div(wlev_min_size, sizeof(value_type)); + constexpr int32_t blev_min_elems + = rocprim::detail::ceiling_div(blev_min_size, sizeof(value_type)); + constexpr int32_t max_elems = max_size / sizeof(value_type); + + constexpr int32_t enabled_size_categories + = (blev_min_elems <= max_elems) + (wlev_min_elems <= max_elems) + 1; + + constexpr int32_t num_blev + = blev_min_elems <= max_elems ? num_buffers / enabled_size_categories : 0; + constexpr int32_t num_wlev + = wlev_min_elems <= max_elems ? num_buffers / enabled_size_categories : 0; + constexpr int32_t num_tlev = num_buffers - num_blev - num_wlev; + + // Get random buffer sizes + uint32_t seed = 0; + SCOPED_TRACE(testing::Message() << "with seed= " << seed); + std::mt19937_64 rng{seed}; + + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = test_utils::generate_random_data_n(iter, num_tlev, 1, wlev_min_elems - 1, rng); + iter = test_utils::generate_random_data_n(iter, + num_wlev, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = test_utils::generate_random_data_n(iter, num_blev, blev_min_elems, max_elems, rng); + + const byte_offset_type total_num_elements = std::accumulate(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + byte_offset_type{0}); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(value_type); + } + + // And the total byte size + const byte_offset_type total_num_bytes = total_num_elements * sizeof(value_type); + + // Device pointers + value_type* d_input = nullptr; + value_type* d_output = nullptr; + value_type** d_buffer_srcs = 
nullptr; + value_type** d_buffer_dsts = nullptr; + buffer_size_type* d_buffer_sizes = nullptr; + + // Calculate temporary storage + + size_t temp_storage_bytes = 0; + + HIP_CHECK(rocprim::batch_memcpy(nullptr, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + + // Allocate memory. + HIP_CHECK(hipMalloc(&d_input, total_num_bytes)); + HIP_CHECK(hipMalloc(&d_output, total_num_bytes)); + + HIP_CHECK(hipMalloc(&d_buffer_srcs, num_buffers * sizeof(*d_buffer_srcs))); + HIP_CHECK(hipMalloc(&d_buffer_dsts, num_buffers * sizeof(*d_buffer_dsts))); + HIP_CHECK(hipMalloc(&d_buffer_sizes, num_buffers * sizeof(*d_buffer_sizes))); + + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints = rocprim::detail::ceiling_div(total_num_bytes, sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + // generate_n for uninitialized memory, pragmatically use placement-new, since there are no + // uint64_t objects alive yet in the storage. + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffled) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } + else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! 
+ std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(int32_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = d_input + src_offsets[i]; + h_buffer_dsts[i] = d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK(hipMemcpy(d_input, h_input.get(), total_num_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(*d_buffer_srcs), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(*d_buffer_dsts), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(*d_buffer_sizes), + hipMemcpyHostToDevice)); + + // Run batched memcpy. + HIP_CHECK(rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers, + hipStreamDefault)); + // Verify results. 
+ auto h_output = std::make_unique(total_num_bytes); + HIP_CHECK(hipMemcpy(h_output.get(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); + + for(int32_t i = 0; i < num_buffers; ++i) + { + ASSERT_EQ(std::memcmp(h_input.get() + src_offsets[i] * sizeof(value_type), + h_output.get() + dst_offsets[i] * sizeof(value_type), + h_buffer_num_bytes[i]), + 0) + << "with index = " << i; + } + + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); +} diff --git a/test/rocprim/test_utils_assertions.hpp b/test/rocprim/test_utils_assertions.hpp index 3e9ff4e93..45b65583e 100644 --- a/test/rocprim/test_utils_assertions.hpp +++ b/test/rocprim/test_utils_assertions.hpp @@ -21,18 +21,24 @@ #ifndef ROCPRIM_TEST_UTILS_ASSERTIONS_HPP #define ROCPRIM_TEST_UTILS_ASSERTIONS_HPP -// Std::memcpy and std::memcmp -#include - #include "test_utils_half.hpp" #include "test_utils_bfloat16.hpp" #include "test_utils_custom_test_types.hpp" +#include + +#include + +// Std::memcpy and std::memcmp +#include +#include + namespace test_utils { // begin assert_eq template -bool inline bit_equal(T a, T b){ +bool inline bit_equal(const T& a, const T& b) +{ return std::memcmp(&a, &b, sizeof(T))==0; } diff --git a/test/rocprim/test_utils_bfloat16.hpp b/test/rocprim/test_utils_bfloat16.hpp index 9c3f68a5e..ad3b298f2 100644 --- a/test/rocprim/test_utils_bfloat16.hpp +++ b/test/rocprim/test_utils_bfloat16.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -21,6 +21,8 @@ #ifndef ROCPRIM_TEST_UTILS_BFLOAT16_HPP #define ROCPRIM_TEST_UTILS_BFLOAT16_HPP +#include + namespace test_utils { using bfloat16 = rocprim::bfloat16; diff --git a/test/rocprim/test_utils_custom_test_types.hpp b/test/rocprim/test_utils_custom_test_types.hpp index 5a9810e55..431547271 100644 --- a/test/rocprim/test_utils_custom_test_types.hpp +++ b/test/rocprim/test_utils_custom_test_types.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,9 @@ #include "test_utils_half.hpp" #include "test_utils_bfloat16.hpp" +#include "rocprim/functional.hpp" +#include <type_traits> + namespace test_utils { template<class T> @@ -123,6 +126,40 @@ struct custom_test_type } }; +template<class T> +struct custom_non_copyable_type +{ + T x; + + custom_non_copyable_type() = default; + custom_non_copyable_type(const custom_non_copyable_type&) = delete; + custom_non_copyable_type(custom_non_copyable_type&&) = default; + ~custom_non_copyable_type() = default; + custom_non_copyable_type& operator=(const custom_non_copyable_type&) = delete; + custom_non_copyable_type& operator=(custom_non_copyable_type&&) = default; +}; + +template<class T> +struct custom_non_moveable_type +{ + T x; + + custom_non_moveable_type() = default; + custom_non_moveable_type(const custom_non_moveable_type&) = delete; + custom_non_moveable_type(custom_non_moveable_type&&) = delete; + ~custom_non_moveable_type() = default; + custom_non_moveable_type& operator=(const custom_non_moveable_type&) = delete; + custom_non_moveable_type& operator=(custom_non_moveable_type&&) = delete; +}; + 
+template<class T> +struct custom_non_default_type +{ + T x; + + custom_non_default_type() = delete; +}; + // Custom type used in tests // Loops are prevented from being unrolled due to a compiler bug in ROCm 5.2 for device code template @@ -306,5 +343,35 @@ struct inner_type> { using type = T; }; + +template<class T> +struct inner_type<custom_non_copyable_type<T>> +{ + using type = T; +}; + +template<class T> +struct inner_type<custom_non_moveable_type<T>> +{ + using type = T; +}; + +template<class T> +struct inner_type<custom_non_default_type<T>> +{ + using type = T; +}; + +template<class T> +struct is_custom_test_type<custom_non_copyable_type<T>> : std::true_type +{}; + +template<class T> +struct is_custom_test_type<custom_non_moveable_type<T>> : std::true_type +{}; + +template<class T> +struct is_custom_test_type<custom_non_default_type<T>> : std::true_type +{}; } #endif //ROCPRIM_TEST_UTILS_CUSTOM_TEST_TYPES_HPP diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp index 74ea08440..315e92d23 100644 --- a/test/rocprim/test_utils_data_generation.hpp +++ b/test/rocprim/test_utils_data_generation.hpp @@ -21,18 +21,20 @@ #ifndef ROCPRIM_TEST_UTILS_DATA_GENERATION_HPP #define ROCPRIM_TEST_UTILS_DATA_GENERATION_HPP -// Std::memcpy and std::memcmp -#include <cstring> -#include +#include "common_test_header.hpp" +#include "test_utils_bfloat16.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_half.hpp" #include #include #include -#include "common_test_header.hpp" -#include "test_utils_bfloat16.hpp" -#include "test_utils_custom_test_types.hpp" -#include "test_utils_half.hpp" +// Std::memcpy and std::memcmp +#include <cstring> +#include +#include +#include namespace test_utils { @@ -182,95 +184,74 @@ void add_special_values(std::vector& source, seed_type seed_value) } } -template<class T, class U, class V> -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> - typename std::enable_if<std::is_same<T, __int128_t>::value, std::vector<T>>::type +template<class Iterator> +using it_value_t = typename std::iterator_traits<Iterator>::value_type; + +template<class OutputIter, class Generator> +inline OutputIter segmented_generate_n(OutputIter it, size_t size, Generator gen) { - engine_type gen{seed_value}; + const size_t segment_size = size / 
random_data_generation_segments; + if(segment_size == 0) + { + return std::generate_n(it, size, std::move(gen)); + } + + for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; + segment_index++) + { + if(segment_index % random_data_generation_repeat_strides == 0) + { + const auto repeated_value = gen(); + std::fill(it + segment_size * segment_index, + it + segment_size * (segment_index + 1), + repeated_value); + } + else + { + std::generate_n(it + segment_size * segment_index, segment_size, gen); + } + } + return it + size; +} + +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t, __int128_t>::value, OutputIter> +{ + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type>::type; std::uniform_int_distribution distribution(static_cast(min), static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; - segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - } - else - { - std::generate(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), - data.end(), - [&]() { return static_cast(distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> - typename std::enable_if::value, 
std::vector>::type +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t, __uint128_t>::value, OutputIter> { - engine_type gen{seed_value}; + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type>::type; std::uniform_int_distribution distribution(static_cast(min), static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; - segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - } - else - { - std::generate(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), - data.end(), - [&]() { return static_cast(distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t>::value, OutputIter> { - engine_type gen{seed_value}; + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, @@ -278,195 +259,121 @@ inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) int, unsigned int>::type >::type; - std::uniform_int_distribution distribution(static_cast(min), 
static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); - } - return data; + std::uniform_int_distribution distribution(static_cast(min), + static_cast(max)); + + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t>::value + && !is_custom_test_type>::value, + OutputIter> { - engine_type gen{seed_value}; + using T = it_value_t; + // Generate floats when T is half or bfloat16 using dis_type = typename std::conditional::value || std::is_same::value, float, T>::type; - std::uniform_real_distribution distribution(static_cast(min), static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill( - data.begin() + segment_size * 
segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); + std::uniform_real_distribution distribution(static_cast(min), + static_cast(max)); - } - return data; + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) - -> typename std::enable_if< - is_custom_test_type::value && std::is_integral::value, - std::vector - >::type +template +inline auto generate_random_data_n(OutputIter it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen) + -> std::enable_if_t>::value + && std::is_integral::value_type>::value, + OutputIter> { - engine_type gen(seed_value); + using T = it_value_t; + std::uniform_int_distribution distribution(min.x, max.x); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = T(distribution(gen), distribution(gen)); - std::fill( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return T(distribution(gen), distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, + size, + [&]() { 
return T(distribution(gen), distribution(gen)); }); } -template -inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) - -> typename std::enable_if< - is_custom_test_type::value && std::is_floating_point::value, - std::vector - >::type +template +inline auto generate_random_data_n(OutputIter it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen) + -> std::enable_if_t< + is_custom_test_type>::value + && std::is_floating_point::value_type>::value, + OutputIter> { - engine_type gen(seed_value); + using T = typename std::iterator_traits::value_type; + std::uniform_real_distribution distribution(min.x, max.x); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = T(distribution(gen), distribution(gen)); - std::fill( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return T(distribution(gen), distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, + size, + [&]() { return T(distribution(gen), distribution(gen)); }); } -template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, seed_type seed_value) - -> typename std::enable_if< - is_custom_test_array_type::value && std::is_integral::value, - std::vector - >::type +template +inline auto generate_random_data_n(OutputIter it, + size_t size, + typename it_value_t::value_type min, + typename it_value_t::value_type max, + 
Generator& gen) + -> std::enable_if_t>::value + && std::is_integral::value_type>::value, + OutputIter> { - engine_type gen(seed_value); + using T = typename std::iterator_traits::value_type; + std::uniform_int_distribution distribution(min, max); + return std::generate_n(it, + size, + [&]() + { + T result; + for(size_t i = 0; i < T::size; i++) + { + result.values[i] = distribution(gen); + } + return result; + }); +} + +template +inline std::vector get_random_data(size_t size, U min, V max, seed_type seed_value) +{ std::vector data(size); - std::generate( - data.begin(), data.end(), - [&]() - { - T result; - for(size_t i = 0; i < T::size; i++) - { - result.values[i] = distribution(gen); - } - return result; - } - ); + engine_type gen(seed_value); + generate_random_data_n(data.begin(), size, min, max, gen); return data; } template inline auto get_random_value(U min, V max, seed_type seed_value) - -> typename std::enable_if::value, T>::type + -> std::enable_if_t::value, T> { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + T result; + engine_type gen(seed_value); + generate_random_data_n(&result, 1, min, max, gen); + return result; } template -inline auto get_random_value(typename T::value_type min, typename T::value_type max, seed_type seed_value) - -> typename std::enable_if::value || is_custom_test_array_type::value, T>::type +inline auto + get_random_value(typename T::value_type min, typename T::value_type max, seed_type seed_value) + -> std::enable_if_t::value || is_custom_test_array_type::value, T> { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + typename T::value_type result; + engine_type gen(seed_value); + generate_random_data_n(&result, 1, min, max, gen); + return T{result}; } template diff --git a/test/rocprim/test_utils_half.hpp b/test/rocprim/test_utils_half.hpp index d2f2918a3..b8eb05435 100644 --- a/test/rocprim/test_utils_half.hpp +++ b/test/rocprim/test_utils_half.hpp @@ -1,4 
+1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -21,6 +21,8 @@ #ifndef ROCPRIM_TEST_UTILS_HALF_HPP #define ROCPRIM_TEST_UTILS_HALF_HPP +#include + namespace test_utils { using half = rocprim::half;