From 8118351195e3e92d8e70fda9fefc58b0bf3a4aea Mon Sep 17 00:00:00 2001 From: Nara Date: Wed, 22 Nov 2023 00:06:32 +0100 Subject: [PATCH] StreamHPC 2023-11-17 (batch memcpy) (#485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Implemented batch memcpy algorithm and relevant tests and benchmarks * Optimize match_any by using arithmetic shifts The compiler seems to see through these much better than the conditional, generating bit-field extract instructions, and recognizing that the loop is a reduction. * Pedantic / consistency changes for batch memcpy * Improve interface and implementation of align_(up|down) - Use the alignment of the destination type instead of its size - Rename to emphasize that this does a form of reinterpret_cast - Use the same type as the return type and template parameter, to match the interface of built-in casts - Pedantic: use uintptr_t instead of size_t for the numerical value of a pointer - Use clang's __builtin_align_(up|down) when available * Take parameters as explicit const-ref in test_utils::bit_equal Because these are templates this already works for non-copyable types, (as `T` will be deduced to `Type&`) but it's confusing, and wouldn't work for r-values. Because we are comparing object representations taking a copy isn't okay as that only guarantees that the value representation is copied. (I.e. padding bytes are not required to be copied when taking a parameter by copy) * Actually make custom_non(copyable|moveable)_type non (copy|move)-able * Allow passing rocprim::default_config to batch_memcpy As all the other device functions do too. * Fix typo in cast_align_down documentation * Fixup accidentally deleted constructor of custom_non_moveable_type This was accidentally deleted, it was meant to be defaulted. Currently no test calls this as batch-memcpy tests only use this type at the device side. 
* Improve error message of test_rocprim_package The error message of the package test wasn't very nice, improve it for easier debugging in the future. Before: ```console ❯ ./a.out 98 ``` After: ```console ❯ ./a.out Error hipErrorInvalidDeviceFunction(98): invalid device function in main at test_rocprim_package.cpp:90 ``` * Refactor test_utils::get_random_data into generate_random_data_n - Writes the output into an output iterator instead of creating & returning a vector. This allows greater flexibility for users i.e. writing random values with differing options into the same container. - Accepts a generator instead of a seed. This is more efficient, because creating an instance of an rng engine might be costly. It's also more consistent with how the standard library operates. - The naming and interface tries to mirror the stl (i.e. `std::generate_n`) - Backwards compatibility is maintained by adding test_utils::get_random_data that uses `generate_random_data_n` internally. * Refactor get_random_data into generate_random_data_n in benchmark_utils This mirrors the test changes in the previous commit * Unify segmented generation from test generate_random_data_n overloads * Add missing include for iterator traits to benchmark_utils * ci: use build instead of rocm-build tag This allows the build job to be performed by any runner configured for building, instead of the ROCm-specialized builder. As the target architectures are specified ahead of time, the GPU is not needed during the build process, and may be performed by any builder. * fix: Fixed doxygen warning in device_memcpy_config.hpp * Speed up / Improve data-generation in test_device_batch_memcpy Do bulk data-generation instead of individual calls, especially of individual bytes for the data to copy. Also changes the verification to do bulk memcmp instead of item-wise test_utils::bit_equals for each buffer. Overall this reduces the time it takes to run the test to ~1s from around 3s. 
* Refactor & Speedup benchmark_device_batch_memcpy - Share the data generation between the naive and uut benchmarks - Make the data-generation be bulk using a fast random number engine (mt19937) to significantly speed it up. The overall runtime of the benchmark decreased from 14 minutes (!) to around 2 minutes. * Fix explanation comment in batch_memcpy test/benchmark * fix include order in benchmark_device_batch_memcpy * doc: add batch memcpy to changelog --------- Co-authored-by: Gergely Meszaros Co-authored-by: Robin Voetter --- .gitlab-ci.yml | 14 +- CHANGELOG.md | 2 + benchmark/CMakeLists.txt | 1 + benchmark/benchmark_device_batch_memcpy.cpp | 522 ++++++++++ benchmark/benchmark_utils.hpp | 121 ++- rocprim/include/rocprim/detail/various.hpp | 56 +- .../device/detail/device_batch_memcpy.hpp | 947 ++++++++++++++++++ .../device/detail/lookback_scan_state.hpp | 28 + .../include/rocprim/device/device_memcpy.hpp | 346 +++++++ .../rocprim/device/device_memcpy_config.hpp | 81 ++ rocprim/include/rocprim/intrinsics/warp.hpp | 26 +- rocprim/include/rocprim/rocprim.hpp | 5 +- test/extra/test_rocprim_package.cpp | 23 +- test/rocprim/CMakeLists.txt | 1 + test/rocprim/test_device_batch_memcpy.cpp | 329 ++++++ test/rocprim/test_utils_assertions.hpp | 14 +- test/rocprim/test_utils_bfloat16.hpp | 4 +- test/rocprim/test_utils_custom_test_types.hpp | 69 +- test/rocprim/test_utils_data_generation.hpp | 381 +++---- test/rocprim/test_utils_half.hpp | 4 +- 20 files changed, 2666 insertions(+), 308 deletions(-) create mode 100644 benchmark/benchmark_device_batch_memcpy.cpp create mode 100644 rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp create mode 100644 rocprim/include/rocprim/device/device_memcpy.hpp create mode 100644 rocprim/include/rocprim/device/device_memcpy_config.hpp create mode 100644 test/rocprim/test_device_batch_memcpy.cpp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0229df12c..66c0092e6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -49,7 +49,7 
@@ clang-format: stage: lint needs: [] tags: - - rocm-build + - build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" @@ -66,7 +66,7 @@ copyright-date: stage: lint needs: [] tags: - - rocm-build + - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: @@ -101,7 +101,7 @@ copyright-date: .build:vcpkg-apt: stage: build tags: - - rocm-build + - build extends: - .gpus:rocm-gpus - .rules:build @@ -157,7 +157,7 @@ build:cmake-minimum-apt: .build:common: stage: build tags: - - rocm-build + - build extends: - .gpus:rocm-gpus - .rules:build @@ -207,7 +207,7 @@ build:package: stage: build needs: [] tags: - - rocm-build + - build extends: - .cmake-minimum - .gpus:rocm-gpus @@ -232,7 +232,7 @@ build:benchmark: stage: build needs: [] tags: - - rocm-build + - build extends: - .cmake-minimum - .gpus:rocm-gpus @@ -264,7 +264,7 @@ autotune:build: stage: autotune needs: [] tags: - - rocm-build + - build extends: - .cmake-minimum - .gpus:rocm-gpus diff --git a/CHANGELOG.md b/CHANGELOG.md index 693593bc7..bca4c4f82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,14 @@ Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/e ## [Unreleased rocPRIM-3.0.0 for ROCm 6.1.0] ### Added - Added new primitive: `block_run_length_decode`. + - Added new primitive: `batch_memcpy`. ### Changed - Removed deprecated functionality: `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config` and `radix_sort_config`. - Renamed `scan_config_v2` to `scan_config`, `scan_by_key_config_v2` to `scan_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`, `reduce_by_key_config_v2` to `reduce_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`. - Removed support for custom config types for device algorithms. 
- `host_warp_size()` was moved into `rocprim/device/config_types.hpp`, and now uses either a `device_id` or a `stream` parameter to query the proper device and a `device_id` out parameter. The return type is `hipError_t`. - Added support for __int128_t in `device_radix_sort` and `block_radix_sort`. +- Improved the performance of `match_any` and `block_histogram` that uses it ### Fixed - Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8087b43ff..499291a39 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -128,6 +128,7 @@ add_rocprim_benchmark(benchmark_block_scan.cpp) add_rocprim_benchmark(benchmark_block_sort.cpp) add_rocprim_benchmark(benchmark_config_dispatch.cpp) add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp) +add_rocprim_benchmark(benchmark_device_batch_memcpy.cpp) add_rocprim_benchmark(benchmark_device_binary_search.cpp) add_rocprim_benchmark(benchmark_device_histogram.cpp) add_rocprim_benchmark(benchmark_device_merge.cpp) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp new file mode 100644 index 000000000..fde851cdd --- /dev/null +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -0,0 +1,522 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark/benchmark.h" +#include "benchmark_utils.hpp" +#include "cmdparser.hpp" + +#include + +#include + +#include +#include +#include +#include +#include +#include + +constexpr uint32_t warmup_size = 5; +constexpr int32_t max_size = 1024 * 1024; +constexpr int32_t wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; +constexpr int32_t blev_min_size = rocprim::batch_memcpy_config<>::blev_size_threshold; + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) 
+// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +using offset_type = size_t; + +template +struct BatchMemcpyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData&) = delete; + + BatchMemcpyData(BatchMemcpyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchMemcpyData& operator=(BatchMemcpyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + 
d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; + +template +BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = rocprim::detail::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = rocprim::detail::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. 
+ std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = generate_random_data_n(iter, num_tlev_buffers, 1, wlev_min_elems - 1, rng); + iter = generate_random_data_n(iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); + iter = generate_random_data_n(iter, num_blev_buffers, blev_min_elems, max_elems, rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = rocprim::detail::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. 
+ std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } + else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; +} + +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(rocprim::batch_memcpy(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, 
+ data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); +} + +// Naive implementation used for comparison +#define BENCHMARK_BATCH_MEMCPY_NAIVE +#ifdef BENCHMARK_BATCH_MEMCPY_NAIVE + +template +__launch_bounds__(BlockSize) __global__ + void naive_kernel(void** in_ptr, void** out_ptr, const OffsetType* sizes) +{ + using underlying_type = unsigned char; + constexpr int32_t items_per_thread = 4; + constexpr int32_t tile_size = items_per_thread * BlockSize; + + const int32_t buffer_id = rocprim::flat_block_id(); + auto in = reinterpret_cast(in_ptr[buffer_id]); + auto out = reinterpret_cast(out_ptr[buffer_id]); + + const auto size = sizes[buffer_id]; + const auto size_in_elements = size / 
sizeof(underlying_type); + const auto tiles = size_in_elements / tile_size; + + auto num_items_to_copy = size; + + for(size_t i = 0; i < tiles; ++i) + { + underlying_type data[items_per_thread]; + rocprim::block_load_direct_blocked(rocprim::flat_block_thread_id(), + in, + data, + num_items_to_copy); + rocprim::block_store_direct_blocked(rocprim::flat_block_thread_id(), + out, + data, + num_items_to_copy); + + in += tile_size; + out += tile_size; + num_items_to_copy -= tile_size; + } +} + +template +void run_naive_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + const auto data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + naive_kernel + <<>>((void**)data.d_buffer_srcs, + (void**)data.d_buffer_dsts, + data.d_buffer_sizes); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + naive_kernel + <<>>((void**)data.d_buffer_srcs, + (void**)data.d_buffer_dsts, + data.d_buffer_sizes); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); +} + + #define CREATE_NAIVE_BENCHMARK(item_size, \ + item_alignment, \ + size_type, \ + num_tlev, \ + num_wlev, \ + num_blev) \ + 
benchmark::RegisterBenchmark( \ + bench_naming::format_name( \ + "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ + ",size_type:" #size_type ",algo:naive_memcpy,num_tlev:" #num_tlev \ + ",num_wlev:" #num_wlev ",num_blev:" #num_blev ",cfg:default_config}") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_naive_benchmark, size_type>( \ + state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ + }) + +#endif + +#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + bench_naming::format_name("{lvl:device,item_size:" #item_size \ + ",item_alignment:" #item_alignment ",size_type:" #size_type \ + ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ + ",num_blev:" #num_blev ",cfg:default_config}") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_benchmark, size_type>(state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ + }) + +#ifndef BENCHMARK_BATCH_MEMCPY_NAIVE + #define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) +#else + #define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000), \ + CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_NAIVE_BENCHMARK(item_size, 
item_alignment, uint32_t, 1000, 1000, 1000) +#endif + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + bench_naming::set_format(parser.get("name_format")); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + add_common_benchmark_info(); + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 2d0c578f4..fe6b9a077 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -115,15 +116,20 @@ struct is_valid_for_int_distribution : std::is_same::value > {}; +template +using it_value_t = typename std::iterator_traits::value_type; + using engine_type = std::default_random_engine; -// get_random_data() generates only part of sequence and replicates it, +// generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random 
sequence. -template -inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n( + OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) + -> typename std::enable_if_t>::value, OutputIter> { - engine_type gen{std::random_device{}()}; + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, @@ -132,36 +138,35 @@ inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = unsigned int>::type >::type; std::uniform_int_distribution distribution((T)min, (T)max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(it, std::min(size - i, max_random_size), it + i); } - return data; + return it + size; } -template -inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, + OutputIterator> { - engine_type gen{std::random_device{}()}; + using T = typename std::iterator_traits::value_type; + // Generate floats when T is half - using dis_type = typename std::conditional::value, float, T>::type; + using dis_type = std::conditional_t::value, float, T>; std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + 
std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(it, std::min(size - i, max_random_size), it + i); } - return data; + return it + size; } template @@ -184,7 +189,10 @@ inline std::vector get_random_data01(size_t size, float p, size_t max_random_ template inline T get_random_value(T min, T max) { - return get_random_data(1, min, max)[0]; + T result; + engine_type gen{std::random_device{}()}; + generate_random_data_n(&result, 1, min, max, gen); + return result; } template @@ -231,33 +239,62 @@ struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, OutputIterator> { + using T = it_value_t; + using first_type = typename T::first_type; using second_type = typename T::second_type; - std::vector data(size); - auto fdata = get_random_data(size, min.x, max.x, max_random_size); - auto sdata = get_random_data(size, min.y, max.y, max_random_size); + + std::vector fdata(size); + std::vector sdata(size); + generate_random_data_n(fdata.begin(), size, min.x, max.x, gen, max_random_size); + generate_random_data_n(sdata.begin(), size, min.y, max.y, gen, max_random_size); + for(size_t i = 0; i < size; i++) { - data[i] = T(fdata[i], sdata[i]); + it[i] = T(fdata[i], sdata[i]); } - return data; + return it + size; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 
1024 * 1024) - -> typename std::enable_if::value && !std::is_same::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value + && !std::is_same::value, + OutputIterator> { + using T = it_value_t; + using field_type = decltype(max.x); - std::vector data(size); - auto field_data = get_random_data(size, min.x, max.x, max_random_size); + std::vector field_data(size); + generate_random_data_n(field_data.begin(), size, min.x, max.x, gen, max_random_size); for(size_t i = 0; i < size; i++) { - data[i] = T(field_data[i]); + it[i] = T(field_data[i]); } + return it + size; +} + +template +inline std::vector + get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) +{ + std::vector data(size); + engine_type gen{std::random_device{}()}; + generate_random_data_n(data.begin(), size, min, max, gen, max_random_size); return data; } @@ -803,4 +840,10 @@ inline const char* get_block_scan_method_name(rocprim::block_scan_algorithm alg) return "unknown_algorithm"; } +template +struct alignas(Alignment) custom_aligned_type +{ + unsigned char data[Size]; +}; + #endif // ROCPRIM_BENCHMARK_UTILS_HPP_ diff --git a/rocprim/include/rocprim/detail/various.hpp b/rocprim/include/rocprim/detail/various.hpp index d82e2bce0..48203e672 100644 --- a/rocprim/include/rocprim/detail/various.hpp +++ b/rocprim/include/rocprim/detail/various.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,8 +24,9 @@ #include #include "../config.hpp" -#include "../types.hpp" +#include "../functional.hpp" #include "../type_traits.hpp" +#include "../types.hpp" #include @@ -34,6 +35,11 @@ #define __has_include(x) 0 #endif +// Check for builtins (clang-extension) and fallback +#ifndef __has_builtin + #define __has_builtin(X) 0 +#endif + #if __has_include() // version is only mandated in c++20 #include #if __cpp_lib_as_const >= 201510L @@ -338,6 +344,52 @@ ROCPRIM_HOST_DEVICE inline void for_each_in_tuple(::rocprim::tuple& t, for_each_in_tuple_impl(t, f, ::rocprim::index_sequence_for()); } +/// \brief Reinterprets the pointer as another type and increments it to match the alignment of +/// the new type. +/// +/// \tparam DstPtr Destination Type to align to +/// \tparam Src Type of source pointer +/// \param pointer The pointer to align +/// \return Aligned pointer +template +ROCPRIM_HOST_DEVICE ROCPRIM_INLINE DstPtr cast_align_up(Src* pointer) +{ + static_assert(std::is_pointer::value, "DstPtr must be a pointer type"); + using Dst = std::remove_pointer_t; +#if __has_builtin(__builtin_align_up) + return reinterpret_cast(__builtin_align_up(pointer, alignof(Dst))); +#else + // https://github.com/KabukiStarship/KabukiToolkit/wiki/Fastest-Method-to-Align-Pointers + constexpr size_t mask = alignof(Dst) - 1; + auto value = reinterpret_cast(pointer); + value += (-value) & mask; + return reinterpret_cast(value); +#endif +} + +/// \brief Reinterprets the pointer as another type and decrements it to match the alignment of +/// the new type. 
+/// +/// \tparam DstPtr Destination Type to align to +/// \tparam Src Type of source pointer +/// \param pointer The pointer to align +/// \return Aligned pointer +template +ROCPRIM_HOST_DEVICE ROCPRIM_INLINE DstPtr cast_align_down(Src* pointer) +{ + static_assert(std::is_pointer::value, "DstPtr must be a pointer type"); + using Dst = std::remove_pointer_t; +#if __has_builtin(__builtin_align_down) + return reinterpret_cast(__builtin_align_down(pointer, alignof(Dst))); +#else + // https://github.com/KabukiStarship/KabukiToolkit/wiki/Fastest-Method-to-Align-Pointers + constexpr size_t mask = ~(alignof(Dst) - 1); + auto value = reinterpret_cast(pointer); + value &= mask; + return reinterpret_cast(value); +#endif +} + } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp b/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp new file mode 100644 index 000000000..9c73c9c12 --- /dev/null +++ b/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp @@ -0,0 +1,947 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_BATCH_MEMCPY_HPP_ +#define ROCPRIM_DEVICE_DETAIL_DEVICE_BATCH_MEMCPY_HPP_ + +#include "rocprim/device/config_types.hpp" +#include "rocprim/device/detail/device_scan_common.hpp" +#include "rocprim/device/detail/lookback_scan_state.hpp" +#include "rocprim/device/device_scan.hpp" + +#include "rocprim/block/block_exchange.hpp" +#include "rocprim/block/block_load.hpp" +#include "rocprim/block/block_load_func.hpp" +#include "rocprim/block/block_run_length_decode.hpp" +#include "rocprim/block/block_scan.hpp" +#include "rocprim/block/block_store.hpp" +#include "rocprim/block/block_store_func.hpp" + +#include "rocprim/thread/thread_load.hpp" +#include "rocprim/thread/thread_search.hpp" +#include "rocprim/thread/thread_store.hpp" + +#include "rocprim/detail/temp_storage.hpp" +#include "rocprim/detail/various.hpp" +#include "rocprim/functional.hpp" +#include "rocprim/intrinsics.hpp" 
+#include "rocprim/intrinsics/thread.hpp" + +#include "rocprim/config.hpp" + +#include + +#include + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ +namespace batch_memcpy +{ +enum class size_class +{ + tlev = 0, + wlev = 1, + blev = 2, + num_size_classes, +}; + +template +struct counter +{ +private: + static constexpr int32_t num_items = static_cast(size_class::num_size_classes); + BackingUnitType data[num_items]; + +public: + ROCPRIM_DEVICE ROCPRIM_INLINE uint32_t get(size_class index) const + { + return data[static_cast(index)]; + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void add(size_class index, uint32_t value) + { + data[static_cast(index)] += value; + } + + ROCPRIM_DEVICE counter operator+(const counter& other) const + { + counter result{}; + +#pragma unroll + for(uint32_t i = 0; i < num_items; ++i) + { + result.data[i] = data[i] + other.data[i]; + } + + return result; + } +}; + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static uint8_t read_byte(void* buffer_src, Offset offset) +{ + return rocprim::thread_load( + reinterpret_cast(buffer_src) + offset); +} + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void + write_byte(void* buffer_dst, Offset offset, uint8_t value) +{ + rocprim::thread_store( + reinterpret_cast(buffer_dst) + offset, + value); +} + +template +struct aligned_ranges +{ + VectorType* out_begin; + VectorType* out_end; + + const uint8_t* in_begin; + const uint8_t* in_end; +}; + +/// \brief Gives a pair of ranges (in_* and out_*) that are contained in in_begin and +/// out_begin of a given size such that the returned out range aligns with the given vector +/// type.
+template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static aligned_ranges + get_aligned_ranges(const void* in_begin, void* out_begin, size_t num_bytes) +{ + uint8_t* out_ptr = static_cast(out_begin); + const uint8_t* in_ptr = static_cast(in_begin); + + auto* out_aligned_begin = detail::cast_align_up(out_ptr); + auto* out_aligned_end = detail::cast_align_down(out_ptr + num_bytes); + + auto begin_offset = reinterpret_cast(out_aligned_begin) - out_ptr; + auto end_offset = reinterpret_cast(out_aligned_end) - out_ptr; + const uint8_t* in_aligned_begin = in_ptr + begin_offset; + const uint8_t* in_aligned_end = in_ptr + end_offset; + + return aligned_ranges{out_aligned_begin, + out_aligned_end, + in_aligned_begin, + in_aligned_end}; +} + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static T funnel_shift_r(T lo, T hi, S shift) +{ + constexpr uint32_t bit_size = sizeof(T) * 8; + return (hi << (bit_size - shift)) | lo >> shift; +} + +template +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void vectorized_copy_bytes(const void* input_buffer, + void* output_buffer, + Offset num_bytes, + Offset offset = 0) +{ + using vector_type = uint4; + constexpr uint32_t ints_in_vector_type = sizeof(uint4) / sizeof(uint32_t); + + constexpr auto warp_size = rocprim::device_warp_size(); + const auto rank = rocprim::detail::block_thread_id<0>() % warp_size; + + const uint8_t* src = reinterpret_cast(input_buffer) + offset; + uint8_t* dst = reinterpret_cast(output_buffer) + offset; + + const uint8_t* in_ptr = src; + uint8_t* out_ptr = dst; + + const auto aligned = batch_memcpy::get_aligned_ranges(src, dst, num_bytes); + + // If no aligned range, copy byte-by-byte and early exit + if(aligned.out_end <= aligned.out_begin) + { + for(uint32_t i = rank; i < num_bytes; i += warp_size) + { + out_ptr[i] = in_ptr[i]; + } + return; + } + + out_ptr += rank; + in_ptr += rank; + + // Ensure that all pointers are in aligned range + while(out_ptr < reinterpret_cast(aligned.out_begin)) + { + *out_ptr = *in_ptr; + 
out_ptr += warp_size; + in_ptr += warp_size; + } + + // This can be outside the while block since 'warp_size % ints_in_vector_type' always is '0' + static_assert(warp_size % ints_in_vector_type == 0, "Warp size is not a multiple of 4"); + + in_ptr = aligned.in_begin + rank * sizeof(vector_type); + const uint32_t in_offset = (reinterpret_cast(in_ptr) % ints_in_vector_type); + vector_type* aligned_out_ptr = aligned.out_begin + rank; + const uint32_t* aligned_in_ptr = reinterpret_cast(in_ptr - in_offset); + + // Copy elements in aligned range + if(in_offset == 0) + { + // No offset, can do cacheline-aligned to cacheline-aligned copy + while(aligned_out_ptr < aligned.out_end) + { + vector_type data = vector_type{aligned_in_ptr[0], + aligned_in_ptr[1], + aligned_in_ptr[2], + aligned_in_ptr[3]}; + *aligned_out_ptr = data; + aligned_in_ptr += warp_size * sizeof(vector_type) / sizeof(uint32_t); + aligned_out_ptr += warp_size; + } + } + else + { + while(aligned_out_ptr < aligned.out_end) + { + union + { + vector_type result; + uint32_t bytes[5]; + } data; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < 5; ++i) + { + data.bytes[i] = aligned_in_ptr[i]; + } + + // Reads are offset to our cache aligned writes so we need to shift bytes over. + // AMD has no intrinsic for funnel shift, hence the manual implementation. + // Perhaps a better cacheline-aligned to byte-aligned copy method can be used here.
+ const uint32_t shift = in_offset * 8 /* bits per byte */; + data.result.x = funnel_shift_r(data.bytes[0], data.bytes[1], shift); + data.result.y = funnel_shift_r(data.bytes[1], data.bytes[2], shift); + data.result.z = funnel_shift_r(data.bytes[2], data.bytes[3], shift); + data.result.w = funnel_shift_r(data.bytes[3], data.bytes[4], shift); + + *aligned_out_ptr = data.result; + aligned_in_ptr += warp_size * sizeof(vector_type) / sizeof(uint32_t); + aligned_out_ptr += warp_size; + } + } + + out_ptr = reinterpret_cast(aligned.out_end) + rank; + in_ptr = aligned.in_end + rank; + + // Copy non-aligned tail + while(out_ptr < dst + num_bytes) + { + *out_ptr = *in_ptr; + out_ptr += warp_size; + in_ptr += warp_size; + } +} +} // namespace batch_memcpy + +template +struct batch_memcpy_impl +{ + using input_buffer_type = typename std::iterator_traits::value_type; + using output_buffer_type = typename std::iterator_traits::value_type; + using buffer_size_type = typename std::iterator_traits::value_type; + + using input_type = typename std::iterator_traits::value_type; + + // top level policy + static constexpr uint32_t block_size = Config::non_blev_block_size; + static constexpr uint32_t buffers_per_thread = Config::non_blev_buffers_per_thread; + static constexpr uint32_t tlev_bytes_per_thread = Config::tlev_bytes_per_thread; + + static constexpr uint32_t blev_block_size = Config::blev_block_size; + static constexpr uint32_t blev_bytes_per_thread = Config::blev_bytes_per_thread; + + static constexpr uint32_t wlev_size_threshold = Config::wlev_size_threshold; + static constexpr uint32_t blev_size_threshold = Config::blev_size_threshold; + + static constexpr uint32_t tlev_buffers_per_thread = buffers_per_thread; + static constexpr uint32_t blev_buffers_per_thread = buffers_per_thread; + + static constexpr uint32_t buffers_per_block = buffers_per_thread * block_size; + + // Offset over buffers. + using buffer_offset_type = uint32_t; + + // Offset over tiles. 
+ using tile_offset_type = uint32_t; + + // The byte offset within a thread-level buffer. Must fit at least `wlev_size_threshold`. + static_assert(wlev_size_threshold < std::numeric_limits::max(), + "wlev_size_threshhold too large (should fit in 16 bits)"); + using tlev_byte_offset_type = + typename std::conditional<(wlev_size_threshold < 256), uint8_t, uint16_t>::type; + + struct copyable_buffers + { + InputBufferItType srcs; + OutputBufferItType dsts; + BufferSizeItType sizes; + }; + + struct copyable_blev_buffers + { + InputBufferItType srcs; + OutputBufferItType dsts; + BufferSizeItType sizes; + tile_offset_type* offsets; + }; + +private: + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static batch_memcpy::size_class + get_size_class(buffer_size_type size) + { + auto size_class = batch_memcpy::size_class::tlev; + size_class = size > wlev_size_threshold ? batch_memcpy::size_class::wlev : size_class; + size_class = size > blev_size_threshold ? batch_memcpy::size_class::blev : size_class; + return size_class; + } + + struct zipped_tlev_byte_assignment + { + buffer_offset_type tile_buffer_id; + tlev_byte_offset_type buffer_byte_offset; + }; + + struct buffer_tuple + { + tlev_byte_offset_type size; + buffer_offset_type buffer_id; + }; + + using size_class_counter = batch_memcpy::counter; + + using blev_buffer_scan_state_type = rocprim::detail::lookback_scan_state; + using blev_block_scan_state_type = rocprim::detail::lookback_scan_state; + + using block_size_scan_type = rocprim::block_scan; + using block_blev_tile_count_scan_type = rocprim::block_scan; + + using block_run_length_decode_type = rocprim::block_run_length_decode; + + using block_exchange_tlev_type + = rocprim::block_exchange; + + using buffer_load_type = rocprim::block_load; + + using blev_buffer_scan_prefix_callback_type + = rocprim::detail::offset_lookback_scan_prefix_op>; + + using blev_block_scan_prefix_callback_type + = rocprim::detail::offset_lookback_scan_prefix_op>; + + struct non_blev_memcpy + { + struct 
storage + { + buffer_tuple buffers_by_size_class[buffers_per_block]; + + // This value is passed from analysis to prepare_blev. + buffer_offset_type blev_buffer_offset; + + union shared_t + { + union analysis_t + { + typename buffer_load_type::storage_type load_storage; + typename block_size_scan_type::storage_type size_scan_storage; + typename blev_buffer_scan_prefix_callback_type::storage_type + buffer_scan_callback; + } analysis; + + struct prepare_blev_t + { + typename blev_block_scan_prefix_callback_type::storage_type block_scan_callback; + typename block_blev_tile_count_scan_type::storage_type block_scan_storage; + } prepare_blev; + + struct copy_tlev_t + { + typename block_run_length_decode_type::storage_type rld_storage; + typename block_exchange_tlev_type::storage_type block_exchange_storage; + } copy_tlev; + } shared; + }; + + using storage_type = rocprim::detail::raw_storage; + + ROCPRIM_DEVICE ROCPRIM_INLINE non_blev_memcpy() {} + + ROCPRIM_DEVICE ROCPRIM_INLINE static size_class_counter get_buffer_size_class_histogram( + const buffer_size_type (&buffer_sizes)[buffers_per_thread]) + { + size_class_counter counters{}; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < buffers_per_thread; ++i) + { + auto size_class = get_size_class(buffer_sizes[i]); + counters.add(size_class, buffer_sizes[i] > 0 ? 
1 : 0); + } + return counters; + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void + partition_buffer_by_size(const buffer_size_type (&buffer_sizes)[buffers_per_thread], + size_class_counter counters, + buffer_tuple (&buffers_by_size_class)[buffers_per_block]) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + buffer_offset_type buffer_id = flat_block_thread_id; + constexpr buffer_offset_type buffer_stride = block_size; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < buffers_per_thread; ++i, buffer_id += buffer_stride) + { + if(buffer_sizes[i] <= 0) + { + continue; + } + + const auto size_class = get_size_class(buffer_sizes[i]); + const uint32_t write_offset = counters.get(size_class); + buffers_by_size_class[write_offset] + = buffer_tuple{static_cast(buffer_sizes[i]), buffer_id}; + + counters.add(size_class, 1); + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void + prepare_blev_buffers(typename storage::shared_t::prepare_blev_t& blev_storage, + buffer_tuple* buffer_by_size_class, + copyable_buffers buffers, + buffer_offset_type num_blev_buffers, + copyable_blev_buffers blev_buffers, + buffer_offset_type tile_buffer_offset, + blev_block_scan_state_type blev_block_scan_state, + buffer_offset_type tile_id) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + tile_offset_type tile_offsets[blev_buffers_per_thread]; + auto blev_buffer_offset = flat_block_thread_id * blev_buffers_per_thread; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < blev_buffers_per_thread; ++i) + { + if(blev_buffer_offset < num_blev_buffers) + { + auto tile_buffer_id = buffer_by_size_class[blev_buffer_offset].buffer_id; + tile_offsets[i] + = rocprim::detail::ceiling_div(buffers.sizes[tile_buffer_id], + blev_block_size * blev_bytes_per_thread); + } + else + { + tile_offsets[i] = 0; + } + ++blev_buffer_offset; + } + + // Convert tile counts into tile offsets. 
+ if(tile_id == 0) + { + tile_offset_type tile_aggregate{}; + block_blev_tile_count_scan_type{}.exclusive_scan(tile_offsets, + tile_offsets, + tile_aggregate, + tile_aggregate, + blev_storage.block_scan_storage, + rocprim::plus{}); + if(flat_block_thread_id == 0) + { + blev_block_scan_state.set_complete(0, tile_aggregate); + } + } + else + { + blev_block_scan_prefix_callback_type blev_tile_prefix_op{ + tile_id, + blev_block_scan_state, + blev_storage.block_scan_callback, + rocprim::plus{}}; + block_blev_tile_count_scan_type{}.exclusive_scan(tile_offsets, + tile_offsets, + blev_storage.block_scan_storage, + blev_tile_prefix_op, + rocprim::plus{}); + } + rocprim::syncthreads(); + + blev_buffer_offset = flat_block_thread_id * blev_buffers_per_thread; + + // For each buffer this thread processes... + ROCPRIM_UNROLL + for(uint32_t i = 0; i < blev_buffers_per_thread; ++i, ++blev_buffer_offset) + { + if(blev_buffer_offset >= num_blev_buffers) + { + continue; + } + + // If this thread has any blev buffers to process... 
+ const auto tile_buffer_id = buffer_by_size_class[blev_buffer_offset].buffer_id; + const auto blev_index = tile_buffer_offset + blev_buffer_offset; + + blev_buffers.srcs[blev_index] = buffers.srcs[tile_buffer_id]; + blev_buffers.dsts[blev_index] = buffers.dsts[tile_buffer_id]; + blev_buffers.sizes[blev_index] = buffers.sizes[tile_buffer_id]; + blev_buffers.offsets[blev_index] = tile_offsets[i]; + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void copy_wlev_buffers(buffer_tuple* buffers_by_size_class, + copyable_buffers tile_buffers, + buffer_offset_type num_wlev_buffers) + { + const uint32_t warp_id = rocprim::warp_id(); + const uint32_t warps_per_block + = rocprim::flat_block_size() / rocprim::device_warp_size(); + + for(buffer_offset_type buffer_offset = warp_id; buffer_offset < num_wlev_buffers; + buffer_offset += warps_per_block) + { + const auto buffer_id = buffers_by_size_class[buffer_offset].buffer_id; + + batch_memcpy::vectorized_copy_bytes(tile_buffers.srcs[buffer_id], + tile_buffers.dsts[buffer_id], + tile_buffers.sizes[buffer_id]); + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void + copy_tlev_buffers(typename storage::shared_t::copy_tlev_t& tlev_storage, + buffer_tuple* buffers_by_size_class, + copyable_buffers tile_buffers, + buffer_offset_type num_tlev_buffers) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + buffer_offset_type tlev_buffer_ids[tlev_buffers_per_thread]; + tlev_byte_offset_type tlev_buffer_sizes[tlev_buffers_per_thread]; + + static_assert( + tlev_buffers_per_thread >= buffers_per_thread, + "Unsupported configuration: The number of 'thread-level buffers' must be at " + "least as large as the number of overall buffers being processed by each " + "thread."); + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < tlev_buffers_per_thread; ++i) + { + tlev_buffer_sizes[i] = 0; + } + + uint32_t tlev_buffer_offset = flat_block_thread_id * tlev_buffers_per_thread; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < 
tlev_buffers_per_thread; ++i) + { + if(tlev_buffer_offset < num_tlev_buffers) + { + const auto buffer_info = buffers_by_size_class[tlev_buffer_offset]; + + tlev_buffer_ids[i] = buffer_info.buffer_id; + tlev_buffer_sizes[i] = buffer_info.size; + } + ++tlev_buffer_offset; + } + + // Total number of bytes in this block. + uint32_t num_total_tlev_bytes = 0; + + block_run_length_decode_type block_run_length_decode{tlev_storage.rld_storage, + tlev_buffer_ids, + tlev_buffer_sizes, + num_total_tlev_bytes}; + + // Run-length decode the buffers' sizes into a window buffer of limited size. This is repeated + // until we were able to cover all the bytes of TLEV buffers + uint32_t decoded_window_offset = 0; + while(decoded_window_offset < num_total_tlev_bytes) + { + buffer_offset_type buffer_id[tlev_bytes_per_thread]; + tlev_byte_offset_type buffer_byte_offset[tlev_bytes_per_thread]; + + // Now we have a balanced assignment: buffer_id[i] will hold the tile's buffer id and + // buffer_byte_offset[i] that buffer's byte that this thread supposed to copy + block_run_length_decode.run_length_decode(buffer_id, + buffer_byte_offset, + decoded_window_offset); + + // Zip from SoA to AoS + zipped_tlev_byte_assignment zipped_byte_assignment[tlev_bytes_per_thread]; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < tlev_bytes_per_thread; ++i) + { + zipped_byte_assignment[i] + = zipped_tlev_byte_assignment{buffer_id[i], buffer_byte_offset[i]}; + } + + // Exchange from blocked to striped arrangement for coalesced memory reads and writes + block_exchange_tlev_type{}.blocked_to_striped(zipped_byte_assignment, + zipped_byte_assignment, + tlev_storage.block_exchange_storage); + + // Read in the bytes that this thread is assigned to + constexpr auto window_size = tlev_bytes_per_thread * block_size; + + const bool is_full_window + = decoded_window_offset + window_size < num_total_tlev_bytes; + + if(is_full_window) + { + uint8_t src_byte[tlev_bytes_per_thread]; + + ROCPRIM_UNROLL + for(uint32_t i = 0; i 
< tlev_bytes_per_thread; ++i) + { + src_byte[i] = batch_memcpy::read_byte( + tile_buffers.srcs[zipped_byte_assignment[i].tile_buffer_id], + zipped_byte_assignment[i].buffer_byte_offset); + } + + ROCPRIM_UNROLL + for(uint32_t i = 0; i < tlev_bytes_per_thread; ++i) + { + batch_memcpy::write_byte( + tile_buffers.dsts[zipped_byte_assignment[i].tile_buffer_id], + zipped_byte_assignment[i].buffer_byte_offset, + src_byte[i]); + } + } + else + { + // Read in the bytes that this thread is assigned to + uint32_t absolute_tlev_byte_offset + = decoded_window_offset + flat_block_thread_id; + for(uint32_t i = 0; i < tlev_bytes_per_thread; ++i) + { + if(absolute_tlev_byte_offset < num_total_tlev_bytes) + { + const auto buffer_id = zipped_byte_assignment[i].tile_buffer_id; + const auto buffer_offset = zipped_byte_assignment[i].buffer_byte_offset; + + const auto src_byte + = batch_memcpy::read_byte(tile_buffers.srcs[buffer_id], + buffer_offset); + batch_memcpy::write_byte(tile_buffers.dsts[buffer_id], + buffer_offset, + src_byte); + } + absolute_tlev_byte_offset += block_size; + } + } + + decoded_window_offset += window_size; + + // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused + // with next iteration + rocprim::syncthreads(); + } + } + + ROCPRIM_DEVICE ROCPRIM_INLINE void copy(storage& temp_storage, + copyable_buffers buffers, + uint32_t num_buffers, + copyable_blev_buffers blev_buffers, + blev_buffer_scan_state_type blev_buffer_scan_state, + blev_block_scan_state_type blev_block_scan_state, + const buffer_offset_type tile_id) + { + const auto flat_block_thread_id = ::rocprim::detail::block_thread_id<0>(); + + // Offset into this tile's buffers + const buffer_offset_type buffer_offset = tile_id * buffers_per_block; + + // Indicates whether all of this tile's items are within bounds + bool is_full_tile = buffer_offset + buffers_per_block < num_buffers; + + // Load the buffer sizes of this tile's buffers + auto tile_buffer_sizes = 
buffers.sizes + buffer_offset; + + // Sizes of the buffers this thread should work on. + buffer_size_type buffer_sizes[buffers_per_thread]; + if(is_full_tile) + { + buffer_load_type{}.load(tile_buffer_sizes, + buffer_sizes, + temp_storage.shared.analysis.load_storage); + } + else + { + buffer_load_type{}.load(tile_buffer_sizes, + buffer_sizes, + num_buffers - buffer_offset, + 0, + temp_storage.shared.analysis.load_storage); + } + + // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids + rocprim::syncthreads(); + + // Count how many buffers fall into each size-class + auto size_class_histogram = get_buffer_size_class_histogram(buffer_sizes); + + // Prefix sum over size_class_histogram. + size_class_counter size_class_agg{}; + block_size_scan_type{}.exclusive_scan(size_class_histogram /* input */, + size_class_histogram /* output */, + size_class_counter{} /* initial */, + size_class_agg /* aggregate */, + temp_storage.shared.analysis.size_scan_storage, + rocprim::plus{}); + + rocprim::syncthreads(); + + uint32_t buffer_count = 0; + + // Factor in the per-size-class counts / offsets + // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset + // has to be offset by the TLEV+WLEV buffer count + for(const auto size_class : {batch_memcpy::size_class::tlev, + batch_memcpy::size_class::wlev, + batch_memcpy::size_class::blev}) + { + size_class_histogram.add(size_class, buffer_count); + buffer_count += size_class_agg.get(size_class); + } + + // Signal the number of BLEV buffers we're planning to write out + // Aggregate the count of blev buffers across threads. 
+ buffer_offset_type buffer_exclusive_prefix{}; + if(tile_id == 0) + { + if(flat_block_thread_id == 0) + { + blev_buffer_scan_state.set_complete( + tile_id, + size_class_agg.get(batch_memcpy::size_class::blev)); + } + buffer_exclusive_prefix = 0; + } + else + { + blev_buffer_scan_prefix_callback_type blev_buffer_prefix_op{ + tile_id, + blev_buffer_scan_state, + temp_storage.shared.analysis.buffer_scan_callback, + rocprim::plus{}}; + + buffer_exclusive_prefix + = blev_buffer_prefix_op(size_class_agg.get(batch_memcpy::size_class::blev)); + } + if(flat_block_thread_id == 0) + { + temp_storage.blev_buffer_offset = buffer_exclusive_prefix; + } + + rocprim::syncthreads(); + + // Write partitions to shared memory. + partition_buffer_by_size(buffer_sizes, + size_class_histogram, + temp_storage.buffers_by_size_class); + rocprim::syncthreads(); + + // Get buffers for this tile. + copyable_buffers tile_buffer = copyable_buffers{ + buffers.srcs + buffer_offset, + buffers.dsts + buffer_offset, + buffers.sizes + buffer_offset, + }; + + auto num_blev_buffers = size_class_agg.get(batch_memcpy::size_class::blev); + auto num_wlev_buffers = size_class_agg.get(batch_memcpy::size_class::wlev); + auto num_tlev_buffers = size_class_agg.get(batch_memcpy::size_class::tlev); + + // BLEV buffers are copied in a separate kernel. We need to prepare global memory + // to pass what needs to be copied to that kernel.
+ prepare_blev_buffers( + temp_storage.shared.prepare_blev, + &temp_storage + .buffers_by_size_class[size_class_agg.get(batch_memcpy::size_class::tlev) + + size_class_agg.get(batch_memcpy::size_class::wlev)], + tile_buffer, + num_blev_buffers, + blev_buffers, + temp_storage.blev_buffer_offset, + blev_block_scan_state, + tile_id); + + rocprim::syncthreads(); + + copy_wlev_buffers( + &temp_storage + .buffers_by_size_class[size_class_agg.get(batch_memcpy::size_class::tlev)], + tile_buffer, + num_wlev_buffers); + + copy_tlev_buffers(temp_storage.shared.copy_tlev, + temp_storage.buffers_by_size_class, + tile_buffer, + num_tlev_buffers); + } + }; + +public: + __global__ static void init_tile_state_kernel(blev_buffer_scan_state_type buffer_scan_state, + blev_block_scan_state_type block_scan_state, + tile_offset_type num_tiles) + { + const uint32_t block_id = rocprim::detail::block_id<0>(); + const uint32_t block_size = rocprim::detail::block_size<0>(); + const uint32_t block_thread_id = rocprim::detail::block_thread_id<0>(); + const uint32_t flat_thread_id = (block_id * block_size) + block_thread_id; + + buffer_scan_state.initialize_prefix(flat_thread_id, num_tiles); + + block_scan_state.initialize_prefix(flat_thread_id, num_tiles); + } + + __global__ static void + non_blev_memcpy_kernel(copyable_buffers buffers, + buffer_offset_type num_buffers, + copyable_blev_buffers blev_buffers, + blev_buffer_scan_state_type blev_buffer_scan_state, + blev_block_scan_state_type blev_block_scan_state) + { + ROCPRIM_SHARED_MEMORY typename non_blev_memcpy::storage_type temp_storage; + non_blev_memcpy{}.copy(temp_storage.get(), + buffers, + num_buffers, + blev_buffers, + blev_buffer_scan_state, + blev_block_scan_state, + rocprim::flat_block_id()); + } + + __global__ static void blev_memcpy_kernel(copyable_blev_buffers blev_buffers, + blev_buffer_scan_state_type buffer_offset_tile, + tile_offset_type last_tile_offset) + { + const auto flat_block_thread_id = 
::rocprim::detail::block_thread_id<0>(); + const auto flat_block_id = ::rocprim::detail::block_id<0>(); + const auto flat_grid_size = ::rocprim::detail::grid_size<0>(); + + constexpr auto blev_tile_size = blev_block_size * blev_bytes_per_thread; + const auto num_blev_buffers = buffer_offset_tile.get_complete_value(last_tile_offset); + + if(num_blev_buffers == 0) + { + return; + } + + uint32_t tile_id = flat_block_id; + while(true) + { + __shared__ buffer_offset_type shared_buffer_id; + + rocprim::syncthreads(); + + if(flat_block_thread_id == 0) + { + shared_buffer_id + = rocprim::upper_bound(blev_buffers.offsets, num_blev_buffers, tile_id) - 1; + } + + rocprim::syncthreads(); + + const buffer_offset_type buffer_id = shared_buffer_id; + + // The relative offset of this tile within the buffer it's assigned to + const buffer_size_type tile_offset_within_buffer + = (tile_id - blev_buffers.offsets[buffer_id]) * blev_tile_size; + + // If the tile has already reached beyond the work of the end of the last buffer + if(buffer_id >= num_blev_buffers - 1 + && tile_offset_within_buffer > blev_buffers.sizes[buffer_id]) + { + return; + } + + // Tiny remainders are copied without vectorizing loads + if(blev_buffers.sizes[buffer_id] - tile_offset_within_buffer <= 32) + { + buffer_size_type thread_offset = tile_offset_within_buffer + flat_block_thread_id; + for(uint32_t i = 0; i < blev_buffers_per_thread; + ++i, thread_offset += blev_block_size) + { + if(thread_offset < blev_buffers.sizes[buffer_id]) + { + uint8_t item + = batch_memcpy::read_byte(blev_buffers.srcs[buffer_id], thread_offset); + batch_memcpy::write_byte(blev_buffers.dsts[buffer_id], thread_offset, item); + } + } + tile_id += flat_grid_size; + continue; + } + + const buffer_size_type items_to_copy + = rocprim::min(static_cast(blev_buffers.sizes[buffer_id] + - tile_offset_within_buffer), + static_cast(blev_tile_size)); + + batch_memcpy::vectorized_copy_bytes(blev_buffers.srcs[buffer_id], + blev_buffers.dsts[buffer_id], 
+ items_to_copy, + tile_offset_within_buffer); + + tile_id += flat_grid_size; + } + } +}; + +} // namespace detail + +END_ROCPRIM_NAMESPACE + +#endif diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index 9287e5b21..3bb37fda4 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -36,6 +36,7 @@ #include "../../detail/various.hpp" #include "../config_types.hpp" +#include "rocprim/config.hpp" extern "C" { @@ -221,6 +222,23 @@ struct lookback_scan_state value = prefix.value; } + /// \brief Gets the prefix value for a block. Should only be called after all + /// blocks/prefixes are completed. + ROCPRIM_DEVICE ROCPRIM_INLINE T get_complete_value(const unsigned int block_id) + { + constexpr unsigned int padding = ::rocprim::device_warp_size(); + + auto p = prefixes[padding + block_id]; + prefix_type prefix{}; +#ifndef __HIP_CPU_RT__ + __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); +#else + std::memcpy(&prefix, &p, sizeof(prefix_type)); +#endif + assert(prefix.flag == PREFIX_COMPLETE); + return prefix.value; + } + private: ROCPRIM_DEVICE ROCPRIM_INLINE void set(const unsigned int block_id, const flag_type flag, const T value) @@ -368,6 +386,16 @@ struct lookback_scan_state value = prefixes_complete_values[padding + block_id]; } + /// \brief Gets the prefix value for a block. Should only be called after all + /// blocks/prefixes are completed. 
+ ROCPRIM_DEVICE ROCPRIM_INLINE T get_complete_value(const unsigned int block_id) + { + constexpr unsigned int padding = ::rocprim::device_warp_size(); + + assert(prefixes_flags[padding + block_id] == PREFIX_COMPLETE); + return prefixes_complete_values[padding + block_id]; + } + private: flag_type * prefixes_flags; // We need to separate arrays for partial and final prefixes, because diff --git a/rocprim/include/rocprim/device/device_memcpy.hpp b/rocprim/include/rocprim/device/device_memcpy.hpp new file mode 100644 index 000000000..e76a11e54 --- /dev/null +++ b/rocprim/include/rocprim/device/device_memcpy.hpp @@ -0,0 +1,346 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ +#define ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ + +#include "../config.hpp" +#include "../functional.hpp" + +#include "config_types.hpp" + +#include "detail/device_batch_memcpy.hpp" +#include "device_memcpy_config.hpp" +#include "rocprim/device/detail/device_config_helper.hpp" + +BEGIN_ROCPRIM_NAMESPACE + +/// \brief Copy `sizes[i]` bytes from `sources[i]` to `destinations[i]` for all `i` in the range [0, `num_copies`]. +/// +/// \tparam Config [optional] configuration of the primitive. It has to be \p batch_memcpy_config . +/// \tparam InputBufferItType type of iterator to source pointers. +/// \tparam OutputBufferItType type of iterator to desetination pointers. +/// \tparam BufferSizeItType type of iterator to sizes. +/// +/// \param [in] temporary_storage pointer to device-accessible temporary storage. +/// When a null pointer is passed, the required allocation size in bytes is written to +/// `storage_size` and the function returns without performing the copy. +/// \param [in, out] storage_size reference to the size in bytes of `temporary_storage`. +/// \param [in] sources iterator of source pointers. +/// \param [in] destinations iterator of destination pointers. +/// \param [in] sizes iterator of range sizes to copy. +/// \param [in] num_copies number of ranges to copy +/// \param [in] stream [optional] HIP stream object to enqueue the copy on. Default is `hipStreamDefault`. +/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel +/// launch is forced in order to check for errors. The default value is `false`. +/// +/// Performs multiple device to device memory copies as a single batched operation. 
+/// Roughly equivalent to +/// \code{.cpp} +/// for (auto i = 0; i < num_copies; ++i) { +/// char* src = sources[i]; +/// char* dst = destinations[i]; +/// auto size = sizes[i]; +/// hipMemcpyAsync(dst, src, size, hipMemcpyDeviceToDevice, stream); +/// } +/// \endcode +/// except executed on the device in parallel. +/// Note that sources and destinations do not have to be part of the same array. I.e. you can copy +/// from both array A and B to array C and D with a single call to this function. +/// Source ranges are allowed to overlap, +/// however, destinations overlapping with either other destinations or with sources is not allowed, +/// and will result in undefined behaviour. +/// +/// \par Example +/// \parblock +/// In this example multiple sections of data are copied from \p a to \p b . +/// +/// \code{.cpp} +/// #include +ROCPRIM_INLINE static hipError_t batch_memcpy(void* temporary_storage, + size_t& storage_size, + InputBufferItType sources, + OutputBufferItType destinations, + BufferSizeItType sizes, + uint32_t num_copies, + hipStream_t stream = hipStreamDefault, + bool debug_synchronous = false) +{ + using Config = detail::default_or_custom_config>; + + static_assert(Config::wlev_size_threshold < Config::blev_size_threshold, + "wlev_size_threshold should be smaller than blev_size_threshold"); + + using BufferOffsetType = unsigned int; + using BlockOffsetType = unsigned int; + + hipError_t error = hipSuccess; + + using batch_memcpy_impl_type = detail:: + batch_memcpy_impl; + + static constexpr uint32_t non_blev_block_size = Config::non_blev_block_size; + static constexpr uint32_t non_blev_buffers_per_thread = Config::non_blev_buffers_per_thread; + static constexpr uint32_t blev_block_size = Config::blev_block_size; + + constexpr uint32_t buffers_per_block = non_blev_block_size * non_blev_buffers_per_thread; + const uint32_t num_blocks = rocprim::detail::ceiling_div(num_copies, buffers_per_block); + + using scan_state_buffer_type = 
rocprim::detail::lookback_scan_state; + using scan_state_block_type = rocprim::detail::lookback_scan_state; + + // Pack buffers + typename batch_memcpy_impl_type::copyable_buffers const buffers{ + sources, + destinations, + sizes, + }; + + detail::temp_storage::layout scan_state_buffer_layout{}; + error = scan_state_buffer_type::get_temp_storage_layout(num_blocks, + stream, + scan_state_buffer_layout); + if(error != hipSuccess) + { + return error; + } + + detail::temp_storage::layout blev_block_scan_state_layout{}; + error = scan_state_block_type::get_temp_storage_layout(num_blocks, + stream, + blev_block_scan_state_layout); + if(error != hipSuccess) + { + return error; + } + + uint8_t* blev_buffer_scan_data; + uint8_t* blev_block_scan_state_data; + + // The non-blev kernel will prepare blev copy. Communication between the two + // kernels is done via `blev_buffers`. + typename batch_memcpy_impl_type::copyable_blev_buffers blev_buffers{}; + + // Partition `d_temp_storage`. + // If `d_temp_storage` is null, calculate the allocation size instead. + error = detail::temp_storage::partition( + temporary_storage, + storage_size, + detail::temp_storage::make_linear_partition( + detail::temp_storage::ptr_aligned_array(&blev_buffers.srcs, num_copies), + detail::temp_storage::ptr_aligned_array(&blev_buffers.dsts, num_copies), + detail::temp_storage::ptr_aligned_array(&blev_buffers.sizes, num_copies), + detail::temp_storage::ptr_aligned_array(&blev_buffers.offsets, num_copies), + detail::temp_storage::make_partition(&blev_buffer_scan_data, scan_state_buffer_layout), + detail::temp_storage::make_partition(&blev_block_scan_state_data, + blev_block_scan_state_layout))); + + // If allocation failed, return error. + if(error != hipSuccess) + { + return error; + } + + // Return the storage size. + if(temporary_storage == nullptr) + { + return hipSuccess; + } + + // Compute launch parameters. 
+ + int device_id = hipGetStreamDeviceId(stream); + + // Get the number of multiprocessors + int multiprocessor_count{}; + error = hipDeviceGetAttribute(&multiprocessor_count, + hipDeviceAttributeMultiprocessorCount, + device_id); + if(error != hipSuccess) + { + return error; + } + + // `hipOccupancyMaxActiveBlocksPerMultiprocessor` uses the default device. + // We need to perserve the current default device id while we change it temporarily + // to get the max occupancy on this stream. + int previous_device; + error = hipGetDevice(&previous_device); + if(error != hipSuccess) + { + return error; + } + + error = hipSetDevice(device_id); + if(error != hipSuccess) + { + return error; + } + + int blev_occupancy{}; + error = hipOccupancyMaxActiveBlocksPerMultiprocessor(&blev_occupancy, + batch_memcpy_impl_type::blev_memcpy_kernel, + blev_block_size, + 0 /* dynSharedMemPerBlk */); + if(error != hipSuccess) + { + return error; + } + + // Restore the default device id to initial state + error = hipSetDevice(previous_device); + if(error != hipSuccess) + { + return error; + } + + constexpr BlockOffsetType init_kernel_threads = 128; + const BlockOffsetType init_kernel_grid_size + = rocprim::detail::ceiling_div(num_blocks, init_kernel_threads); + + auto batch_memcpy_blev_grid_size + = multiprocessor_count * blev_occupancy * 1 /* subscription factor */; + + BlockOffsetType batch_memcpy_grid_size = num_blocks; + + // Prepare init_scan_states_kernel. + scan_state_buffer_type scan_state_buffer{}; + error = scan_state_buffer_type::create(scan_state_buffer, + blev_buffer_scan_data, + num_blocks, + stream); + if(error != hipSuccess) + { + return error; + } + + scan_state_block_type scan_state_block{}; + error = scan_state_block_type::create(scan_state_block, + blev_block_scan_state_data, + num_blocks, + stream); + if(error != hipSuccess) + { + return error; + } + + // Launch init_scan_states_kernel. 
+ batch_memcpy_impl_type:: + init_tile_state_kernel<<>>( + scan_state_buffer, + scan_state_block, + num_blocks); + error = hipGetLastError(); + if(error != hipSuccess) + { + return error; + } + if(debug_synchronous) + { + hipStreamSynchronize(stream); + } + + // Launch batch_memcpy_non_blev_kernel. + batch_memcpy_impl_type:: + non_blev_memcpy_kernel<<>>( + buffers, + num_copies, + blev_buffers, + scan_state_buffer, + scan_state_block); + error = hipGetLastError(); + if(error != hipSuccess) + { + return error; + } + if(debug_synchronous) + { + hipStreamSynchronize(stream); + } + + // Launch batch_memcpy_blev_kernel. + batch_memcpy_impl_type:: + blev_memcpy_kernel<<>>( + blev_buffers, + scan_state_buffer, + batch_memcpy_grid_size - 1); + error = hipGetLastError(); + if(error != hipSuccess) + { + return error; + } + if(debug_synchronous) + { + hipStreamSynchronize(stream); + } + + return hipSuccess; +} + +END_ROCPRIM_NAMESPACE + +#endif diff --git a/rocprim/include/rocprim/device/device_memcpy_config.hpp b/rocprim/include/rocprim/device/device_memcpy_config.hpp new file mode 100644 index 000000000..272b134fc --- /dev/null +++ b/rocprim/include/rocprim/device/device_memcpy_config.hpp @@ -0,0 +1,81 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DEVICE_MEMCPY_CONFIG_HPP_ +#define ROCPRIM_DEVICE_DEVICE_MEMCPY_CONFIG_HPP_ + +#include "config_types.hpp" +#include "detail/device_config_helper.hpp" + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +/// \brief +/// +/// \tparam NonBlevBlockSize - number of threads per block for thread- and warp-level copy. +/// \tparam NonBlevBuffersPerThreaed - number of buffers processed per thread. +/// \tparam TlevBytesPerThread - number of bytes per thread for thread-level copy. +/// \tparam BlevBlockSize - number of thread per block for block-level copy. +/// \tparam BlevBytesPerThread - number of bytes per thread for block-level copy. +/// \tparam WlevSizeThreshold - minimum size to use warp-level copy instead of thread-level. +/// \tparam BlevSizeThreshold - minimum size to use block-level copy instead of warp-level. 
+template +struct batch_memcpy_config +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS + + /// \brief Number of threads per block for thread- and warp-level copy + static constexpr unsigned int non_blev_block_size = NonBlevBlockSize; + + /// \brief Number of buffers processed per thread + static constexpr unsigned int non_blev_buffers_per_thread = NonBlevBuffersPerThreaed; + + /// \brief Number of bytes per thread for thread-level copy + static constexpr unsigned int tlev_bytes_per_thread = TlevBytesPerThread; + + /// \brief Number of thread per block for block-level copy + static constexpr unsigned int blev_block_size = BlevBlockSize; + + /// \brief Number of bytes per thread for block-level copy + static constexpr unsigned int blev_bytes_per_thread = BlevBytesPerThread; + + /// \brief Minimum size to use warp-level copy instead of thread-level + static constexpr unsigned int wlev_size_threshold = WlevSizeThreshold; + + /// \brief Minimum size to use block-level copy instead of warp-level + static constexpr unsigned int blev_size_threshold = BlevSizeThreshold; + +#endif +}; + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 7a25d3cc9..ba192af87 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -24,6 +24,8 @@ #include "../config.hpp" #include "../types.hpp" +#include + BEGIN_ROCPRIM_NAMESPACE /// \addtogroup intrinsicsmodule @@ -142,11 +144,25 @@ ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label, bool ROCPRIM_UNROLL for(unsigned int bit = 0; bit < LabelBits; ++bit) { - const auto bit_set = label & (1u << bit); - // Create mask of threads which have the same bit set or unset. - const auto same_mask = ballot(bit_set); - // Remove bits which do not match from the peer mask. - peer_mask &= (bit_set ? 
same_mask : ~same_mask); + static constexpr int lane_width = std::numeric_limits::digits; + using lane_mask_type_s = std::make_signed_t; + const auto label_signed = static_cast(label); + + // Get all zeros or all ones depending on label's i-th bit. + // Moves the bit into the sign position by left shifting, then shifts it into all the bits + // by (arithmetic) right shift which does sign-extension. + const lane_mask_type_s bit_set + = (label_signed << (lane_width - 1 - bit)) >> (lane_width - 1); + + // Remove all lanes from the mask with a bit that differs from ours + // - if we have the bit set we keep the lanes that do too so we mask with the result + // of the ballot + // - if we don't have it, then we keep the lanes that also don't, so we flip all bits + // in the mask before and-ing. + // since bit_set is all ones if we have the bit and all zeros if not, the flipping is + // the same as xor-ing with its inverse + const lane_mask_type bit_set_mask = ballot(bit_set); + peer_mask &= bit_set_mask ^ ~bit_set; } return -lane_mask_type{valid} & peer_mask; diff --git a/rocprim/include/rocprim/rocprim.hpp b/rocprim/include/rocprim/rocprim.hpp index 6a2ecabf8..5b3dd0e1d 100644 --- a/rocprim/include/rocprim/rocprim.hpp +++ b/rocprim/include/rocprim/rocprim.hpp @@ -53,15 +53,16 @@ #include "device/device_adjacent_difference.hpp" #include "device/device_binary_search.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" -#include "device/device_reduce_by_key.hpp" #include "device/device_reduce.hpp" +#include "device/device_reduce_by_key.hpp" #include "device/device_run_length_encode.hpp" -#include "device/device_scan_by_key.hpp" #include "device/device_scan.hpp" +#include "device/device_scan_by_key.hpp" #include "device/device_segmented_radix_sort.hpp" #include 
"device/device_segmented_reduce.hpp" #include "device/device_segmented_scan.hpp" diff --git a/test/extra/test_rocprim_package.cpp b/test/extra/test_rocprim_package.cpp index d0a2da3f8..6b1fa0703 100644 --- a/test/extra/test_rocprim_package.cpp +++ b/test/extra/test_rocprim_package.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -26,14 +26,19 @@ #include #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << error << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + do \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "Error " << hipGetErrorName(error) << '(' << error << ')' << ": " \ + << hipGetErrorString(error) << " in " << __func__ << " at " << __FILE__ \ + << ':' << __LINE__ << '\n'; \ + exit(error); \ + } \ + } \ + while(false) int main(int, char**) { diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 8db2d5c5b..3fbc675ce 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -241,6 +241,7 @@ add_rocprim_test("rocprim.block_sort_bitonic" test_block_sort_bitonic.cpp) add_rocprim_test("rocprim.config_dispatch" test_config_dispatch.cpp) add_rocprim_test("rocprim.constant_iterator" test_constant_iterator.cpp) add_rocprim_test("rocprim.counting_iterator" test_counting_iterator.cpp) +add_rocprim_test("rocprim.device_batch_memcpy" test_device_batch_memcpy.cpp) add_rocprim_test("rocprim.device_binary_search" test_device_binary_search.cpp) add_rocprim_test("rocprim.device_adjacent_difference" test_device_adjacent_difference.cpp) add_rocprim_test("rocprim.device_histogram" test_device_histogram.cpp) 
diff --git a/test/rocprim/test_device_batch_memcpy.cpp b/test/rocprim/test_device_batch_memcpy.cpp new file mode 100644 index 000000000..64986a796 --- /dev/null +++ b/test/rocprim/test_device_batch_memcpy.cpp @@ -0,0 +1,329 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "common_test_header.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "rocprim/detail/various.hpp" +#include "rocprim/device/device_memcpy.hpp" +#include "rocprim/intrinsics/thread.hpp" + +#include +#include + +#include +#include +#include +#include + +#include + +#include + +template +struct DeviceBatchMemcpyParams +{ + using value_type = ValueType; + using size_type = SizeType; + static constexpr bool shuffled = Shuffled; + static constexpr uint32_t num_buffers = NumBuffers; + static constexpr uint32_t max_size = MaxSize; +}; + +template +struct DeviceBatchMemcpyTests : public ::testing::Test +{ + using value_type = typename Params::value_type; + using size_type = typename Params::size_type; + static constexpr bool shuffled = Params::shuffled; + static constexpr uint32_t num_buffers = Params::num_buffers; + static constexpr uint32_t max_size = Params::max_size; +}; + +typedef ::testing::Types< + // Ignore copy/move + DeviceBatchMemcpyParams, uint32_t, false>, + DeviceBatchMemcpyParams, uint32_t, false>, + DeviceBatchMemcpyParams, uint32_t, false>, + + // Unshuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // weird amount of buffers + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // Shuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams> + DeviceBatchMemcpyTestsParams; + +TYPED_TEST_SUITE(DeviceBatchMemcpyTests, DeviceBatchMemcpyTestsParams); + +// Used for generating 
offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const size_t n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + T sum = 0; + for(size_t i = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) +{ + using value_type = typename TestFixture::value_type; + using buffer_size_type = typename TestFixture::size_type; + using buffer_offset_type = uint32_t; + using byte_offset_type = size_t; + + constexpr int32_t num_buffers = TestFixture::num_buffers; + constexpr int32_t max_size = TestFixture::max_size; + constexpr bool shuffled = TestFixture::shuffled; + + constexpr int32_t wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; + constexpr int32_t blev_min_size = 
rocprim::batch_memcpy_config<>::blev_size_threshold; + + constexpr int32_t wlev_min_elems + = rocprim::detail::ceiling_div(wlev_min_size, sizeof(value_type)); + constexpr int32_t blev_min_elems + = rocprim::detail::ceiling_div(blev_min_size, sizeof(value_type)); + constexpr int32_t max_elems = max_size / sizeof(value_type); + + constexpr int32_t enabled_size_categories + = (blev_min_elems <= max_elems) + (wlev_min_elems <= max_elems) + 1; + + constexpr int32_t num_blev + = blev_min_elems <= max_elems ? num_buffers / enabled_size_categories : 0; + constexpr int32_t num_wlev + = wlev_min_elems <= max_elems ? num_buffers / enabled_size_categories : 0; + constexpr int32_t num_tlev = num_buffers - num_blev - num_wlev; + + // Get random buffer sizes + uint32_t seed = 0; + SCOPED_TRACE(testing::Message() << "with seed= " << seed); + std::mt19937_64 rng{seed}; + + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = test_utils::generate_random_data_n(iter, num_tlev, 1, wlev_min_elems - 1, rng); + iter = test_utils::generate_random_data_n(iter, + num_wlev, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = test_utils::generate_random_data_n(iter, num_blev, blev_min_elems, max_elems, rng); + + const byte_offset_type total_num_elements = std::accumulate(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + byte_offset_type{0}); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(value_type); + } + + // And the total byte size + const byte_offset_type total_num_bytes = total_num_elements * sizeof(value_type); + + // Device pointers + value_type* d_input = nullptr; + value_type* d_output = nullptr; + value_type** d_buffer_srcs = 
nullptr; + value_type** d_buffer_dsts = nullptr; + buffer_size_type* d_buffer_sizes = nullptr; + + // Calculate temporary storage + + size_t temp_storage_bytes = 0; + + HIP_CHECK(rocprim::batch_memcpy(nullptr, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + + // Allocate memory. + HIP_CHECK(hipMalloc(&d_input, total_num_bytes)); + HIP_CHECK(hipMalloc(&d_output, total_num_bytes)); + + HIP_CHECK(hipMalloc(&d_buffer_srcs, num_buffers * sizeof(*d_buffer_srcs))); + HIP_CHECK(hipMalloc(&d_buffer_dsts, num_buffers * sizeof(*d_buffer_dsts))); + HIP_CHECK(hipMalloc(&d_buffer_sizes, num_buffers * sizeof(*d_buffer_sizes))); + + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints = rocprim::detail::ceiling_div(total_num_bytes, sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + // generate_n for uninitialized memory, pragmatically use placement-new, since there are no + // uint64_t objects alive yet in the storage. + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffled) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } + else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! 
+ std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(int32_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = d_input + src_offsets[i]; + h_buffer_dsts[i] = d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK(hipMemcpy(d_input, h_input.get(), total_num_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(*d_buffer_srcs), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(*d_buffer_dsts), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(*d_buffer_sizes), + hipMemcpyHostToDevice)); + + // Run batched memcpy. + HIP_CHECK(rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers, + hipStreamDefault)); + // Verify results. 
+ auto h_output = std::make_unique(total_num_bytes); + HIP_CHECK(hipMemcpy(h_output.get(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); + + for(int32_t i = 0; i < num_buffers; ++i) + { + ASSERT_EQ(std::memcmp(h_input.get() + src_offsets[i] * sizeof(value_type), + h_output.get() + dst_offsets[i] * sizeof(value_type), + h_buffer_num_bytes[i]), + 0) + << "with index = " << i; + } + + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); +} diff --git a/test/rocprim/test_utils_assertions.hpp b/test/rocprim/test_utils_assertions.hpp index 3e9ff4e93..45b65583e 100644 --- a/test/rocprim/test_utils_assertions.hpp +++ b/test/rocprim/test_utils_assertions.hpp @@ -21,18 +21,24 @@ #ifndef ROCPRIM_TEST_UTILS_ASSERTIONS_HPP #define ROCPRIM_TEST_UTILS_ASSERTIONS_HPP -// Std::memcpy and std::memcmp -#include - #include "test_utils_half.hpp" #include "test_utils_bfloat16.hpp" #include "test_utils_custom_test_types.hpp" +#include + +#include + +// Std::memcpy and std::memcmp +#include +#include + namespace test_utils { // begin assert_eq template -bool inline bit_equal(T a, T b){ +bool inline bit_equal(const T& a, const T& b) +{ return std::memcmp(&a, &b, sizeof(T))==0; } diff --git a/test/rocprim/test_utils_bfloat16.hpp b/test/rocprim/test_utils_bfloat16.hpp index 9c3f68a5e..ad3b298f2 100644 --- a/test/rocprim/test_utils_bfloat16.hpp +++ b/test/rocprim/test_utils_bfloat16.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -21,6 +21,8 @@ #ifndef ROCPRIM_TEST_UTILS_BFLOAT16_HPP #define ROCPRIM_TEST_UTILS_BFLOAT16_HPP +#include + namespace test_utils { using bfloat16 = rocprim::bfloat16; diff --git a/test/rocprim/test_utils_custom_test_types.hpp b/test/rocprim/test_utils_custom_test_types.hpp index 5a9810e55..431547271 100644 --- a/test/rocprim/test_utils_custom_test_types.hpp +++ b/test/rocprim/test_utils_custom_test_types.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,9 @@ #include "test_utils_half.hpp" #include "test_utils_bfloat16.hpp" +#include "rocprim/functional.hpp" +#include <type_traits> + namespace test_utils { template<class T> @@ -123,6 +126,40 @@ struct custom_test_type } }; +template<class T> +struct custom_non_copyable_type +{ + T x; + + custom_non_copyable_type() = default; + custom_non_copyable_type(const custom_non_copyable_type&) = delete; + custom_non_copyable_type(custom_non_copyable_type&&) = default; + ~custom_non_copyable_type() = default; + custom_non_copyable_type& operator=(const custom_non_copyable_type&) = delete; + custom_non_copyable_type& operator=(custom_non_copyable_type&&) = default; +}; + +template<class T> +struct custom_non_moveable_type +{ + T x; + + custom_non_moveable_type() = default; + custom_non_moveable_type(const custom_non_moveable_type&) = delete; + custom_non_moveable_type(custom_non_moveable_type&&) = delete; + ~custom_non_moveable_type() = default; + custom_non_moveable_type& operator=(const custom_non_moveable_type&) = delete; + custom_non_moveable_type& operator=(custom_non_moveable_type&&) = delete; +}; + 
+template<class T> +struct custom_non_default_type +{ + T x; + + custom_non_default_type() = delete; +}; + // Custom type used in tests // Loops are prevented from being unrolled due to a compiler bug in ROCm 5.2 for device code template @@ -306,5 +343,35 @@ struct inner_type> { using type = T; }; + +template<class T> +struct inner_type<custom_non_copyable_type<T>> +{ + using type = T; +}; + +template<class T> +struct inner_type<custom_non_moveable_type<T>> +{ + using type = T; +}; + +template<class T> +struct inner_type<custom_non_default_type<T>> +{ + using type = T; +}; + +template<class T> +struct is_custom_test_type<custom_non_copyable_type<T>> : std::true_type +{}; + +template<class T> +struct is_custom_test_type<custom_non_moveable_type<T>> : std::true_type +{}; + +template<class T> +struct is_custom_test_type<custom_non_default_type<T>> : std::true_type +{}; } #endif //ROCPRIM_TEST_UTILS_CUSTOM_TEST_TYPES_HPP diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp index 74ea08440..315e92d23 100644 --- a/test/rocprim/test_utils_data_generation.hpp +++ b/test/rocprim/test_utils_data_generation.hpp @@ -21,18 +21,20 @@ #ifndef ROCPRIM_TEST_UTILS_DATA_GENERATION_HPP #define ROCPRIM_TEST_UTILS_DATA_GENERATION_HPP -// Std::memcpy and std::memcmp -#include <cstring> -#include +#include "common_test_header.hpp" +#include "test_utils_bfloat16.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_half.hpp" #include #include #include -#include "common_test_header.hpp" -#include "test_utils_bfloat16.hpp" -#include "test_utils_custom_test_types.hpp" -#include "test_utils_half.hpp" +// Std::memcpy and std::memcmp +#include <cstring> +#include +#include +#include namespace test_utils { @@ -182,95 +184,74 @@ void add_special_values(std::vector& source, seed_type seed_value) } } -template<class T, class U, class V> -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> - typename std::enable_if<std::is_same<T, __int128_t>::value, std::vector<T>>::type +template<class Iterator> +using it_value_t = typename std::iterator_traits<Iterator>::value_type; + +template<class OutputIter, class Generator> +inline OutputIter segmented_generate_n(OutputIter it, size_t size, Generator gen) { - engine_type gen{seed_value}; + const size_t segment_size = size / 
random_data_generation_segments; + if(segment_size == 0) + { + return std::generate_n(it, size, std::move(gen)); + } + + for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; + segment_index++) + { + if(segment_index % random_data_generation_repeat_strides == 0) + { + const auto repeated_value = gen(); + std::fill(it + segment_size * segment_index, + it + segment_size * (segment_index + 1), + repeated_value); + } + else + { + std::generate_n(it + segment_size * segment_index, segment_size, gen); + } + } + return it + size; +} + +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t, __int128_t>::value, OutputIter> +{ + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type>::type; std::uniform_int_distribution distribution(static_cast(min), static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; - segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - } - else - { - std::generate(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), - data.end(), - [&]() { return static_cast(distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> - typename std::enable_if::value, 
std::vector>::type +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t, __uint128_t>::value, OutputIter> { - engine_type gen{seed_value}; + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type>::type; std::uniform_int_distribution distribution(static_cast(min), static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; - segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - } - else - { - std::generate(data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), - data.end(), - [&]() { return static_cast(distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t>::value, OutputIter> { - engine_type gen{seed_value}; + using T = it_value_t; + using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, @@ -278,195 +259,121 @@ inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) int, unsigned int>::type >::type; - std::uniform_int_distribution distribution(static_cast(min), 
static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); - } - return data; + std::uniform_int_distribution distribution(static_cast(min), + static_cast(max)); + + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) - -> typename std::enable_if::value, std::vector>::type +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, Generator& gen) + -> std::enable_if_t>::value + && !is_custom_test_type>::value, + OutputIter> { - engine_type gen{seed_value}; + using T = it_value_t; + // Generate floats when T is half or bfloat16 using dis_type = typename std::conditional::value || std::is_same::value, float, T>::type; - std::uniform_real_distribution distribution(static_cast(min), static_cast(max)); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = static_cast(distribution(gen)); - std::fill( - data.begin() + segment_size * 
segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return static_cast(distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); + std::uniform_real_distribution distribution(static_cast(min), + static_cast(max)); - } - return data; + return segmented_generate_n(it, size, [&]() { return static_cast(distribution(gen)); }); } -template -inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) - -> typename std::enable_if< - is_custom_test_type::value && std::is_integral::value, - std::vector - >::type +template +inline auto generate_random_data_n(OutputIter it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen) + -> std::enable_if_t>::value + && std::is_integral::value_type>::value, + OutputIter> { - engine_type gen(seed_value); + using T = it_value_t; + std::uniform_int_distribution distribution(min.x, max.x); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = T(distribution(gen), distribution(gen)); - std::fill( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return T(distribution(gen), distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, + size, + [&]() { 
return T(distribution(gen), distribution(gen)); }); } -template -inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) - -> typename std::enable_if< - is_custom_test_type::value && std::is_floating_point::value, - std::vector - >::type +template +inline auto generate_random_data_n(OutputIter it, + size_t size, + it_value_t min, + it_value_t max, + Generator& gen) + -> std::enable_if_t< + is_custom_test_type>::value + && std::is_floating_point::value_type>::value, + OutputIter> { - engine_type gen(seed_value); + using T = typename std::iterator_traits::value_type; + std::uniform_real_distribution distribution(min.x, max.x); - std::vector data(size); - size_t segment_size = size / random_data_generation_segments; - if(segment_size != 0) - { - for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) - { - if(segment_index % random_data_generation_repeat_strides == 0) - { - T repeated_value = T(distribution(gen), distribution(gen)); - std::fill( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - repeated_value); - - } - else - { - std::generate( - data.begin() + segment_size * segment_index, - data.begin() + segment_size * (segment_index + 1), - [&]() { return T(distribution(gen), distribution(gen)); }); - } - } - } - else - { - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); - } - return data; + + return segmented_generate_n(it, + size, + [&]() { return T(distribution(gen), distribution(gen)); }); } -template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, seed_type seed_value) - -> typename std::enable_if< - is_custom_test_array_type::value && std::is_integral::value, - std::vector - >::type +template +inline auto generate_random_data_n(OutputIter it, + size_t size, + typename it_value_t::value_type min, + typename it_value_t::value_type max, + 
Generator& gen) + -> std::enable_if_t>::value + && std::is_integral::value_type>::value, + OutputIter> { - engine_type gen(seed_value); + using T = typename std::iterator_traits::value_type; + std::uniform_int_distribution distribution(min, max); + return std::generate_n(it, + size, + [&]() + { + T result; + for(size_t i = 0; i < T::size; i++) + { + result.values[i] = distribution(gen); + } + return result; + }); +} + +template +inline std::vector get_random_data(size_t size, U min, V max, seed_type seed_value) +{ std::vector data(size); - std::generate( - data.begin(), data.end(), - [&]() - { - T result; - for(size_t i = 0; i < T::size; i++) - { - result.values[i] = distribution(gen); - } - return result; - } - ); + engine_type gen(seed_value); + generate_random_data_n(data.begin(), size, min, max, gen); return data; } template inline auto get_random_value(U min, V max, seed_type seed_value) - -> typename std::enable_if::value, T>::type + -> std::enable_if_t::value, T> { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + T result; + engine_type gen(seed_value); + generate_random_data_n(&result, 1, min, max, gen); + return result; } template -inline auto get_random_value(typename T::value_type min, typename T::value_type max, seed_type seed_value) - -> typename std::enable_if::value || is_custom_test_array_type::value, T>::type +inline auto + get_random_value(typename T::value_type min, typename T::value_type max, seed_type seed_value) + -> std::enable_if_t::value || is_custom_test_array_type::value, T> { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + typename T::value_type result; + engine_type gen(seed_value); + generate_random_data_n(&result, 1, min, max, gen); + return T{result}; } template diff --git a/test/rocprim/test_utils_half.hpp b/test/rocprim/test_utils_half.hpp index d2f2918a3..b8eb05435 100644 --- a/test/rocprim/test_utils_half.hpp +++ b/test/rocprim/test_utils_half.hpp @@ -1,4 
+1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -21,6 +21,8 @@ #ifndef ROCPRIM_TEST_UTILS_HALF_HPP #define ROCPRIM_TEST_UTILS_HALF_HPP +#include + namespace test_utils { using half = rocprim::half;