Skip to content

Commit db47d38

Browse files
authored
[CUDAX] Add copy_bytes and fill_bytes overloads for mdspan (#2932)
* Implement copy_bytes for mdspan * Add final conversion to mdspan and more tests * mdspan fill_bytes * Add docs * Fix issues after rebase * Help old GCC figure out the types * Move runtime extents check to a function * Fix clang and more old GCC fixes
1 parent dc920c9 commit db47d38

File tree

6 files changed

+253
-13
lines changed

6 files changed

+253
-13
lines changed

cudax/include/cuda/experimental/__algorithm/common.cuh

+22-2
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,17 @@
2323

2424
#include <cuda/std/__ranges/concepts.h>
2525
#include <cuda/std/__type_traits/is_convertible.h>
26+
#include <cuda/std/mdspan>
2627
#include <cuda/std/span>
2728

2829
#include <cuda/experimental/__launch/launch_transform.cuh>
2930

3031
namespace cuda::experimental
3132
{
33+
3234
#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
3335
template <typename _Tp>
34-
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
36+
concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
3537

3638
#else
3739
template <typename _Tp, typename = int>
@@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span<
4547
int>> = true;
4648

4749
template <typename _Tp>
48-
inline constexpr bool __valid_copy_fill_argument =
50+
inline constexpr bool __valid_1d_copy_fill_argument =
4951
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;
5052

5153
#endif
5254

55+
template <typename _Tp, typename _Decayed = _CUDA_VSTD::decay_t<_Tp>>
56+
using __as_mdspan_t =
57+
_CUDA_VSTD::mdspan<typename _Decayed::value_type,
58+
typename _Decayed::extents_type,
59+
typename _Decayed::layout_type,
60+
typename _Decayed::accessor_type>;
61+
62+
template <typename _Tp, typename = int>
63+
inline constexpr bool __convertible_to_mdspan = false;
64+
65+
template <typename _Tp>
66+
inline constexpr bool
67+
__convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
68+
true;
69+
70+
template <typename _Tp>
71+
inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;
72+
5373
} // namespace cuda::experimental
5474
#endif //__CUDAX_ALGORITHM_COMMON

cudax/include/cuda/experimental/__algorithm/copy.cuh

+84-2
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
5353

5454
//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
5555
//!
56-
//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
56+
//! Both source and destination need to either be a `contiguous_range` or launch transform to one.
57+
//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain a `value_type` member alias.
5758
//! Both source and destination types are required to be trivially copyable.
5859
//!
5960
//! This call might be synchronous if either source or destination is pageable host memory.
@@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
6364
//! @param __src Source to copy from
6465
//! @param __dst Destination to copy into
6566
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
66-
_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
67+
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
6768
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
6869
{
6970
__copy_bytes_impl(
@@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
7475
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
7576
}
7677

78+
template <typename _Extents, typename _OtherExtents>
79+
inline constexpr bool __copy_bytes_compatible_extents = false;
80+
81+
template <typename _IndexType,
82+
_CUDA_VSTD::size_t... _Extents,
83+
typename _OtherIndexType,
84+
_CUDA_VSTD::size_t... _OtherExtents>
85+
inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
86+
_CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
87+
decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
88+
_CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
89+
_CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
90+
_CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;
91+
92+
template <typename _SrcExtents, typename _DstExtents>
93+
_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
94+
{
95+
for (typename _SrcExtents::rank_type __i = 0; __i < __src_exts.rank(); __i++)
96+
{
97+
if (__src_exts.extent(__i)
98+
!= static_cast<typename _SrcExtents::index_type>(
99+
__dst_exts.extent((static_cast<typename _DstExtents::rank_type>(__i)))))
100+
{
101+
return false;
102+
}
103+
}
104+
return true;
105+
}
106+
107+
template <typename _SrcElem,
108+
typename _SrcExtents,
109+
typename _SrcLayout,
110+
typename _SrcAccessor,
111+
typename _DstElem,
112+
typename _DstExtents,
113+
typename _DstLayout,
114+
typename _DstAccessor>
115+
void __nd_copy_bytes_impl(stream_ref __stream,
116+
_CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
117+
_CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
118+
{
119+
static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
120+
"Multidimensional copy requires both source and destination extents to be compatible");
121+
static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
122+
"Multidimensional copy requires both source and destination layouts to match");
123+
124+
if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents()))
125+
{
126+
_CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
127+
}
128+
129+
__copy_bytes_impl(__stream,
130+
_CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()),
131+
_CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size()));
132+
}
133+
134+
//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
135+
//!
136+
//! Both source and destination needs to either be an instance of `cuda::std::mdspan` or launch transform to
137+
//! one. They can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain `mdspan` template
138+
//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Both source and
139+
//! destination types are required to be trivially copyable.
140+
//!
141+
//! This call might be synchronous if either source or destination is pageable host memory.
142+
//! It will be synchronous if both destination and copy are located in host memory.
143+
//!
144+
//! @param __stream Stream that the copy should be inserted into
145+
//! @param __src Source to copy from
146+
//! @param __dst Destination to copy into
147+
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
148+
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
149+
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
150+
{
151+
decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
152+
decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
153+
decltype(auto) __src_as_arg = static_cast<detail::__as_copy_arg_t<_SrcTy>>(__src_transformed);
154+
decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
155+
__nd_copy_bytes_impl(
156+
__stream, __as_mdspan_t<decltype(__src_as_arg)>(__src_as_arg), __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg));
157+
}
158+
77159
} // namespace cuda::experimental
78160
#endif // __CUDAX_ALGORITHM_COPY

cudax/include/cuda/experimental/__algorithm/fill.cuh

+29-3
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _
4242

4343
//! @brief Launches an operation to bytewise fill the memory into the provided stream.
4444
//!
45-
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
46-
//! into one. It can't reside in pagable host memory.
45+
//! Destination needs to either be a `contiguous_range` or launch transform
46+
//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain a `value_type` member alias.
4747
//! Destination type is required to be trivially copyable.
4848
//!
49+
//! Destination can't reside in pageable host memory.
50+
//!
4951
//! @param __stream Stream that the copy should be inserted into
5052
//! @param __dst Destination memory to fill
5153
//! @param __value Value to fill into every byte in the destination
5254
_CCCL_TEMPLATE(typename _DstTy)
53-
_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
55+
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
5456
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
5557
{
5658
__fill_bytes_impl(__stream,
@@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
5961
__value);
6062
}
6163

64+
//! @brief Launches an operation to bytewise fill the memory into the provided stream.
65+
//!
66+
//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform
67+
//! into one. It can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain `mdspan` template
68+
//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Destination
69+
//! type is required to be trivially copyable.
70+
//!
71+
//! Destination can't reside in pageable host memory.
72+
//!
73+
//! @param __stream Stream that the copy should be inserted into
74+
//! @param __dst Destination memory to fill
75+
//! @param __value Value to fill into every byte in the destination
76+
_CCCL_TEMPLATE(typename _DstTy)
77+
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
78+
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
79+
{
80+
decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
81+
decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
82+
auto __dst_mdspan = __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg);
83+
84+
__fill_bytes_impl(
85+
__stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value);
86+
}
87+
6288
} // namespace cuda::experimental
6389
#endif // __CUDAX_ALGORITHM_FILL

cudax/test/algorithm/common.cuh

+24-5
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p
4343
}
4444
}
4545

46+
template <typename Layout = cuda::std::layout_right, typename Extents>
47+
auto make_buffer_for_mdspan(Extents extents, char value = 0)
48+
{
49+
cuda::mr::pinned_memory_resource host_resource;
50+
auto mapping = typename Layout::template mapping<decltype(extents)>{extents};
51+
52+
cudax::uninitialized_buffer<int, cuda::mr::host_accessible> buffer(host_resource, mapping.required_span_size());
53+
54+
memset(buffer.data(), value, buffer.size_bytes());
55+
56+
return buffer;
57+
}
58+
4659
namespace cuda::experimental
4760
{
4861

4962
// Need a type that goes through all launch_transform steps, but is not a contiguous_range
63+
template <typename AsKernelArg = cuda::std::span<int>>
5064
struct weird_buffer
5165
{
5266
const cuda::mr::pinned_memory_resource& resource;
@@ -57,7 +71,9 @@ struct weird_buffer
5771
: resource(res)
5872
, data((int*) res.allocate(s * sizeof(int)))
5973
, size(s)
60-
{}
74+
{
75+
memset(data, 0, size);
76+
}
6177

6278
~weird_buffer()
6379
{
@@ -72,22 +88,25 @@ struct weird_buffer
7288
int* data;
7389
std::size_t size;
7490

75-
using __as_kernel_arg = cuda::std::span<int>;
91+
using __as_kernel_arg = AsKernelArg;
7692

7793
operator cuda::std::span<int>()
7894
{
7995
return {data, size};
8096
}
97+
98+
template <typename Extents>
99+
operator cuda::std::mdspan<int, Extents>()
100+
{
101+
return cuda::std::mdspan<int, Extents>{data};
102+
}
81103
};
82104

83105
_CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept
84106
{
85107
return {self.data, self.size};
86108
}
87109
};
88-
89-
static_assert(std::is_same_v<cudax::as_kernel_arg_t<cudax::weird_buffer>, cuda::std::span<int>>);
90-
91110
} // namespace cuda::experimental
92111

93112
#endif // __ALGORITHM_COMMON__

cudax/test/algorithm/copy.cu

+65-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
#include "common.cuh"
1212

13-
TEST_CASE("Copy", "[data_manipulation]")
13+
TEST_CASE("1d Copy", "[data_manipulation]")
1414
{
1515
cudax::stream _stream;
1616

@@ -103,3 +103,67 @@ TEST_CASE("Copy", "[data_manipulation]")
103103
CUDAX_REQUIRE(vec[1] == 0xbeef);
104104
}
105105
}
106+
107+
template <typename SrcLayout = cuda::std::layout_right,
108+
typename DstLayout = SrcLayout,
109+
typename SrcExtents,
110+
typename DstExtents>
111+
void test_mdspan_copy_bytes(
112+
cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents())
113+
{
114+
auto src_buffer = make_buffer_for_mdspan<SrcLayout>(src_extents, 1);
115+
auto dst_buffer = make_buffer_for_mdspan<DstLayout>(dst_extents, 0);
116+
117+
cuda::std::mdspan<int, SrcExtents, SrcLayout> src(src_buffer.data(), src_extents);
118+
cuda::std::mdspan<int, DstExtents, DstLayout> dst(dst_buffer.data(), dst_extents);
119+
120+
for (int i = 0; i < static_cast<int>(src.extent(1)); i++)
121+
{
122+
src(0, i) = i;
123+
}
124+
125+
cudax::copy_bytes(stream, std::move(src), dst);
126+
stream.wait();
127+
128+
for (int i = 0; i < static_cast<int>(dst.extent(1)); i++)
129+
{
130+
CUDAX_CHECK(dst(0, i) == i);
131+
}
132+
}
133+
134+
TEST_CASE("Mdspan copy", "[data_manipulation]")
135+
{
136+
cudax::stream stream;
137+
138+
SECTION("Different extents")
139+
{
140+
auto static_extents = cuda::std::extents<size_t, 3, 4>();
141+
test_mdspan_copy_bytes(stream, static_extents, static_extents);
142+
test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, static_extents);
143+
144+
auto dynamic_extents = cuda::std::dextents<size_t, 2>(3, 4);
145+
test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents);
146+
test_mdspan_copy_bytes(stream, static_extents, dynamic_extents);
147+
test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, dynamic_extents);
148+
149+
auto mixed_extents = cuda::std::extents<int, cuda::std::dynamic_extent, 4>(3);
150+
test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents);
151+
test_mdspan_copy_bytes(stream, mixed_extents, static_extents);
152+
test_mdspan_copy_bytes<cuda::std::layout_left>(stream, mixed_extents, static_extents);
153+
}
154+
155+
SECTION("Launch transform")
156+
{
157+
auto mixed_extents =
158+
cuda::std::extents<size_t, 1024, cuda::std::dynamic_extent, 2, cuda::std::dynamic_extent>(1024, 2);
159+
[[maybe_unused]] auto static_extents = cuda::std::extents<size_t, 1024, 1024, 2, 2>();
160+
auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1);
161+
cuda::std::mdspan<int, decltype(mixed_extents)> mdspan(mdspan_buffer.data(), mixed_extents);
162+
cudax::weird_buffer<cuda::std::mdspan<int, decltype(static_extents)>> buffer{
163+
cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()};
164+
165+
cudax::copy_bytes(stream, mdspan, buffer);
166+
stream.wait();
167+
CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size()));
168+
}
169+
}

cudax/test/algorithm/fill.cu

+29
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]")
4444
check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
4545
}
4646
}
47+
48+
TEST_CASE("Mdspan Fill", "[data_manipulation]")
49+
{
50+
cudax::stream stream;
51+
{
52+
cuda::std::dextents<size_t, 3> dynamic_extents{1, 2, 3};
53+
auto buffer = make_buffer_for_mdspan(dynamic_extents, 0);
54+
cuda::std::mdspan<int, decltype(dynamic_extents)> dynamic_mdspan(buffer.data(), dynamic_extents);
55+
56+
cudax::fill_bytes(stream, dynamic_mdspan, fill_byte);
57+
check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
58+
}
59+
{
60+
cuda::std::extents<size_t, 2, cuda::std::dynamic_extent, 4> mixed_extents{1};
61+
auto buffer = make_buffer_for_mdspan(mixed_extents, 0);
62+
cuda::std::mdspan<int, decltype(mixed_extents)> mixed_mdspan(buffer.data(), mixed_extents);
63+
64+
cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
65+
check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
66+
}
67+
{
68+
using static_extents = cuda::std::extents<size_t, 2, 3, 4>;
69+
auto size = cuda::std::layout_left::mapping<static_extents>().required_span_size();
70+
cudax::weird_buffer<cuda::std::mdspan<int, static_extents>> buffer(cuda::mr::pinned_memory_resource{}, size);
71+
72+
cudax::fill_bytes(stream, buffer, fill_byte);
73+
check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size));
74+
}
75+
}

0 commit comments

Comments
 (0)