Skip to content

Commit

Permalink
[CUDAX] Add copy_bytes and fill_bytes overloads for mdspan (#2932)
Browse files Browse the repository at this point in the history
* Implement copy_bytes for mdspan

* Add final conversion to mdspan and more tests

* mdspan fill_bytes

* Add docs

* Fix issues after rebase

* Help old GCC figure out the types

* Move runtime extents check to a function

* Fix clang and more old GCC fixes
  • Loading branch information
pciolkosz authored Nov 25, 2024
1 parent dc920c9 commit db47d38
Show file tree
Hide file tree
Showing 6 changed files with 253 additions and 13 deletions.
24 changes: 22 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@

#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__type_traits/is_convertible.h>
#include <cuda/std/mdspan>
#include <cuda/std/span>

#include <cuda/experimental/__launch/launch_transform.cuh>

namespace cuda::experimental
{

#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
template <typename _Tp>
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;

#else
template <typename _Tp, typename = int>
Expand All @@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span<
int>> = true;

template <typename _Tp>
inline constexpr bool __valid_copy_fill_argument =
inline constexpr bool __valid_1d_copy_fill_argument =
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;

#endif

// Maps a type to the `cuda::std::mdspan` instantiation described by its nested
// `value_type`, `extents_type`, `layout_type` and `accessor_type` member
// aliases. If any alias is missing, substitution fails, which makes the
// partial specialization of `__convertible_to_mdspan` below drop out (SFINAE).
template <typename _Tp, typename _Decayed = _CUDA_VSTD::decay_t<_Tp>>
using __as_mdspan_t =
  _CUDA_VSTD::mdspan<typename _Decayed::value_type,
                     typename _Decayed::extents_type,
                     typename _Decayed::layout_type,
                     typename _Decayed::accessor_type>;

// Trait: true iff _Tp is implicitly convertible to the mdspan type derived
// from its own member aliases. The primary template covers types for which
// `__as_mdspan_t<_Tp>` is ill-formed.
template <typename _Tp, typename = int>
inline constexpr bool __convertible_to_mdspan = false;

template <typename _Tp>
inline constexpr bool
  __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
    true;

// Constraint used by the multidimensional copy_bytes/fill_bytes overloads:
// the argument must, after launch transform, convert to a cuda::std::mdspan.
template <typename _Tp>
inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;

} // namespace cuda::experimental
#endif //__CUDAX_ALGORITHM_COMMON
86 changes: 84 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
//! Both source and destination needs to either be a `contiguous_range` or launch transform to one.
//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain `value_type` member alias.
//! Both source and destination type is required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pagable host memory.
Expand All @@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
__copy_bytes_impl(
Expand All @@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
}

// Compile-time compatibility check between two extents types.
// Primary template: anything that is not a pair of cuda::std::extents is
// considered incompatible.
template <typename _Extents, typename _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents = false;

// Two extents types are compatible when they have the same rank and every
// pair of static extents can represent the same value; dynamic extents are
// deferred to the run-time check in __copy_bytes_runtime_extents_match.
// Reuses the same internal helper that mdspan's converting constructor uses.
template <typename _IndexType,
          _CUDA_VSTD::size_t... _Extents,
          typename _OtherIndexType,
          _CUDA_VSTD::size_t... _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;

// Returns true when every run-time extent of __src_exts equals the
// corresponding extent of __dst_exts. Ranks are assumed equal — the static
// check in __copy_bytes_compatible_extents has already enforced that.
template <typename _SrcExtents, typename _DstExtents>
_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
{
  using __src_rank_t  = typename _SrcExtents::rank_type;
  using __src_index_t = typename _SrcExtents::index_type;
  using __dst_rank_t  = typename _DstExtents::rank_type;

  for (__src_rank_t __rank = 0; __rank < __src_exts.rank(); ++__rank)
  {
    // Compare in the source's index type; the destination may use a
    // different (but compatible) index type.
    const auto __dst_extent = __dst_exts.extent(static_cast<__dst_rank_t>(__rank));
    if (__src_exts.extent(__rank) != static_cast<__src_index_t>(__dst_extent))
    {
      return false;
    }
  }
  return true;
}

// Copies the bytes backing __src into the memory backing __dst on __stream.
// Extent compatibility is enforced statically where possible; the remaining
// dynamic extents are compared at run time. Layout types must match exactly
// so that elements occupy the same relative positions on both sides.
template <typename _SrcElem,
          typename _SrcExtents,
          typename _SrcLayout,
          typename _SrcAccessor,
          typename _DstElem,
          typename _DstExtents,
          typename _DstLayout,
          typename _DstAccessor>
void __nd_copy_bytes_impl(stream_ref __stream,
                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
{
  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
                "Multidimensional copy requires both source and destination extents to be compatible");
  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
                "Multidimensional copy requires both source and destination layouts to match");

  if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents()))
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
  }

  // Flatten each mdspan into a 1D span over its full mapped range and defer
  // to the contiguous implementation.
  // NOTE(review): required_span_size() includes padding elements for
  // non-exhaustive mappings, so any padding bytes are copied too — confirm
  // this is the intended behavior for strided layouts.
  __copy_bytes_impl(__stream,
                    _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()),
                    _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size()));
}

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch transform to
//! one. They can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both source and destination are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
  // Run launch transform first, then materialize the transformed objects as
  // the copy-argument type. Both intermediates are bound to named variables
  // so they stay alive while the mdspans constructed below are in use.
  decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  decltype(auto) __src_as_arg = static_cast<detail::__as_copy_arg_t<_SrcTy>>(__src_transformed);
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  __nd_copy_bytes_impl(
    __stream, __as_mdspan_t<decltype(__src_as_arg)>(__src_as_arg), __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg));
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_COPY
32 changes: 29 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
//! into one. It can't reside in pagable host memory.
//! Destination needs to either be a `contiguous_range` or launch transform
//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain `value_type` member alias.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pagable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
__fill_bytes_impl(__stream,
Expand All @@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
__value);
}

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform
//! into one. It can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
  // Transform the argument, then view it as an mdspan. The named variables
  // keep the transformed object alive while the mdspan refers to it.
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  auto __dst_mdspan = __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg);

  // Fill the full mapped range (required_span_size elements) bytewise.
  __fill_bytes_impl(
    __stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value);
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_FILL
29 changes: 24 additions & 5 deletions cudax/test/algorithm/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p
}
}

// Allocates a pinned host buffer of ints large enough to back an mdspan with
// the given extents and layout, with every byte preset to `value`.
template <typename Layout = cuda::std::layout_right, typename Extents>
auto make_buffer_for_mdspan(Extents extents, char value = 0)
{
  using mapping_t = typename Layout::template mapping<decltype(extents)>;
  const mapping_t layout_mapping{extents};

  cuda::mr::pinned_memory_resource resource;
  cudax::uninitialized_buffer<int, cuda::mr::host_accessible> storage(resource, layout_mapping.required_span_size());
  memset(storage.data(), value, storage.size_bytes());
  return storage;
}

namespace cuda::experimental
{

// Need a type that goes through all launch_transform steps, but is not a contiguous_range
template <typename AsKernelArg = cuda::std::span<int>>
struct weird_buffer
{
const cuda::mr::pinned_memory_resource& resource;
Expand All @@ -57,7 +71,9 @@ struct weird_buffer
: resource(res)
, data((int*) res.allocate(s * sizeof(int)))
, size(s)
{}
{
memset(data, 0, size);
}

~weird_buffer()
{
Expand All @@ -72,22 +88,25 @@ struct weird_buffer
int* data;
std::size_t size;

using __as_kernel_arg = cuda::std::span<int>;
using __as_kernel_arg = AsKernelArg;

operator cuda::std::span<int>()
{
return {data, size};
}

template <typename Extents>
operator cuda::std::mdspan<int, Extents>()
{
return cuda::std::mdspan<int, Extents>{data};
}
};

_CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept
{
return {self.data, self.size};
}
};

static_assert(std::is_same_v<cudax::as_kernel_arg_t<cudax::weird_buffer>, cuda::std::span<int>>);

} // namespace cuda::experimental

#endif // __ALGORITHM_COMMON__
66 changes: 65 additions & 1 deletion cudax/test/algorithm/copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

#include "common.cuh"

TEST_CASE("Copy", "[data_manipulation]")
TEST_CASE("1d Copy", "[data_manipulation]")
{
cudax::stream _stream;

Expand Down Expand Up @@ -103,3 +103,67 @@ TEST_CASE("Copy", "[data_manipulation]")
CUDAX_REQUIRE(vec[1] == 0xbeef);
}
}

// Exercises the mdspan copy_bytes overload for a given pair of layouts and
// extents: tags the first row of the source with its column index, copies,
// and verifies the tags arrived in the destination.
template <typename SrcLayout = cuda::std::layout_right,
          typename DstLayout = SrcLayout,
          typename SrcExtents,
          typename DstExtents>
void test_mdspan_copy_bytes(
  cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents())
{
  // Source bytes start at 0x01 and destination bytes at 0x00 so a missing
  // copy is detectable.
  auto source_storage      = make_buffer_for_mdspan<SrcLayout>(src_extents, 1);
  auto destination_storage = make_buffer_for_mdspan<DstLayout>(dst_extents, 0);

  cuda::std::mdspan<int, SrcExtents, SrcLayout> source(source_storage.data(), src_extents);
  cuda::std::mdspan<int, DstExtents, DstLayout> destination(destination_storage.data(), dst_extents);

  const int source_columns = static_cast<int>(source.extent(1));
  for (int column = 0; column != source_columns; ++column)
  {
    source(0, column) = column;
  }

  cudax::copy_bytes(stream, std::move(source), destination);
  stream.wait();

  const int destination_columns = static_cast<int>(destination.extent(1));
  for (int column = 0; column != destination_columns; ++column)
  {
    CUDAX_CHECK(destination(0, column) == column);
  }
}

// Covers copy_bytes between mdspans with static, dynamic and mixed extents,
// plus the launch-transform path via weird_buffer.
TEST_CASE("Mdspan copy", "[data_manipulation]")
{
  cudax::stream stream;

  SECTION("Different extents")
  {
    // Fully static 3x4 extents, default (layout_right) and layout_left.
    auto static_extents = cuda::std::extents<size_t, 3, 4>();
    test_mdspan_copy_bytes(stream, static_extents, static_extents);
    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, static_extents);

    // Fully dynamic extents with the same run-time shape, and mixes of
    // static and dynamic on either side of the copy.
    auto dynamic_extents = cuda::std::dextents<size_t, 2>(3, 4);
    test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents);
    test_mdspan_copy_bytes(stream, static_extents, dynamic_extents);
    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, dynamic_extents);

    // Mixed extents also use a different index type (int vs size_t).
    auto mixed_extents = cuda::std::extents<int, cuda::std::dynamic_extent, 4>(3);
    test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents);
    test_mdspan_copy_bytes(stream, mixed_extents, static_extents);
    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, mixed_extents, static_extents);
  }

  SECTION("Launch transform")
  {
    // Source: a real mdspan over mixed (static + dynamic) 4D extents.
    // Destination: a weird_buffer whose launch transform yields an mdspan
    // with the equivalent fully-static extents.
    auto mixed_extents =
      cuda::std::extents<size_t, 1024, cuda::std::dynamic_extent, 2, cuda::std::dynamic_extent>(1024, 2);
    [[maybe_unused]] auto static_extents = cuda::std::extents<size_t, 1024, 1024, 2, 2>();
    auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1);
    cuda::std::mdspan<int, decltype(mixed_extents)> mdspan(mdspan_buffer.data(), mixed_extents);
    cudax::weird_buffer<cuda::std::mdspan<int, decltype(static_extents)>> buffer{
      cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()};

    cudax::copy_bytes(stream, mdspan, buffer);
    stream.wait();
    // Bytewise comparison of the entire backing allocations.
    CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size()));
  }
}
29 changes: 29 additions & 0 deletions cudax/test/algorithm/fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]")
check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
}
}

// Covers fill_bytes for mdspans with dynamic, mixed and static extents,
// including the launch-transform path via weird_buffer.
TEST_CASE("Mdspan Fill", "[data_manipulation]")
{
  cudax::stream stream;
  {
    // Fully dynamic 1x2x3 extents.
    cuda::std::dextents<size_t, 3> dynamic_extents{1, 2, 3};
    auto buffer = make_buffer_for_mdspan(dynamic_extents, 0);
    cuda::std::mdspan<int, decltype(dynamic_extents)> dynamic_mdspan(buffer.data(), dynamic_extents);

    cudax::fill_bytes(stream, dynamic_mdspan, fill_byte);
    check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
  }
  {
    // Mixed static/dynamic extents, passed as an rvalue.
    cuda::std::extents<size_t, 2, cuda::std::dynamic_extent, 4> mixed_extents{1};
    auto buffer = make_buffer_for_mdspan(mixed_extents, 0);
    cuda::std::mdspan<int, decltype(mixed_extents)> mixed_mdspan(buffer.data(), mixed_extents);

    cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
    check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
  }
  {
    // Launch-transform path: weird_buffer converts to a static-extents mdspan.
    using static_extents = cuda::std::extents<size_t, 2, 3, 4>;
    auto size = cuda::std::layout_left::mapping<static_extents>().required_span_size();
    cudax::weird_buffer<cuda::std::mdspan<int, static_extents>> buffer(cuda::mr::pinned_memory_resource{}, size);

    cudax::fill_bytes(stream, buffer, fill_byte);
    check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size));
  }
}

0 comments on commit db47d38

Please sign in to comment.