@@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
53
53
54
54
// ! @brief Launches a bytewise memory copy from source to destination into the provided stream.
55
55
// !
56
- // ! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
56
+ // ! Both source and destination need to either be a `contiguous_range` or launch transform to one.
57
+ // ! They can also implicitly convert to `cuda::std::span`, but the type needs to contain a `value_type` member alias.
57
58
// ! Both source and destination types are required to be trivially copyable.
58
59
// !
59
60
// ! This call might be synchronous if either source or destination is pageable host memory.
@@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
63
64
// ! @param __src Source to copy from
64
65
// ! @param __dst Destination to copy into
65
66
_CCCL_TEMPLATE (typename _SrcTy, typename _DstTy)
66
- _CCCL_REQUIRES (__valid_copy_fill_argument <_SrcTy> _CCCL_AND __valid_copy_fill_argument <_DstTy>)
67
+ _CCCL_REQUIRES (__valid_1d_copy_fill_argument <_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument <_DstTy>)
67
68
void copy_bytes (stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
68
69
{
69
70
__copy_bytes_impl (
@@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
74
75
detail::__launch_transform (__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
75
76
}
76
77
78
//! Primary template: two extents types are considered incompatible unless the
//! partial specialization below matches (both must be cuda::std::extents
//! instantiations).
template <typename _Extents, typename _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents = false;

//! Two cuda::std::extents instantiations are compatible when they have the
//! same rank and their static extents agree, as decided by libcu++'s internal
//! __check_compatible_extents helper (presumably the same check mdspan uses
//! for converting between extents types — confirm against libcu++).
template <typename _IndexType,
          _CUDA_VSTD::size_t... _Extents,
          typename _OtherIndexType,
          _CUDA_VSTD::size_t... _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
    // Ranks must match before the per-extent check is meaningful.
    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;
92
+ template <typename _SrcExtents, typename _DstExtents>
93
+ _CCCL_NODISCARD bool __copy_bytes_runtime_extents_match (_SrcExtents __src_exts, _DstExtents __dst_exts)
94
+ {
95
+ for (typename _SrcExtents::rank_type __i = 0 ; __i < __src_exts.rank (); __i++)
96
+ {
97
+ if (__src_exts.extent (__i)
98
+ != static_cast <typename _SrcExtents::index_type>(
99
+ __dst_exts.extent ((static_cast <typename _DstExtents::rank_type>(__i)))))
100
+ {
101
+ return false ;
102
+ }
103
+ }
104
+ return true ;
105
+ }
106
+
107
//! Copies one mdspan onto another as raw bytes by flattening each view to a
//! span over its mapping's required_span_size() and delegating to the 1-D
//! byte-copy implementation.
//!
//! Compile-time requirements: the extents types must be compatible and the
//! layout types must be identical. Runtime requirement: every extent of
//! __src must equal the matching extent of __dst, otherwise
//! invalid_argument is thrown.
//!
//! NOTE(review): only the layout *types* are compared, not the mapping
//! state — for a layout with runtime state two views with equal extents
//! could presumably still map differently; confirm callers cannot reach
//! this with such layouts.
template <typename _SrcElem,
          typename _SrcExtents,
          typename _SrcLayout,
          typename _SrcAccessor,
          typename _DstElem,
          typename _DstExtents,
          typename _DstLayout,
          typename _DstAccessor>
void __nd_copy_bytes_impl(stream_ref __stream,
                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
{
  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
                "Multidimensional copy requires both source and destination extents to be compatible");
  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
                "Multidimensional copy requires both source and destination layouts to match");

  if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents()))
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
  }

  // Flatten each view over the full mapped element range and copy bytewise.
  __copy_bytes_impl(__stream,
                    _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()),
                    _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size()));
}
134
//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch transform to
//! one. They can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain the `mdspan` template
//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Both source and
//! destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both source and destination are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
  // NOTE(review): decltype(auto) keeps whatever value category
  // __launch_transform returns — presumably so a returned proxy object stays
  // alive as a named local for the duration of the call; confirm against
  // __launch_transform's contract.
  decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  // Normalize the transformed arguments to their canonical copy-argument
  // types before adapting them to mdspan.
  decltype(auto) __src_as_arg = static_cast<detail::__as_copy_arg_t<_SrcTy>>(__src_transformed);
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  __nd_copy_bytes_impl(
    __stream, __as_mdspan_t<decltype(__src_as_arg)>(__src_as_arg), __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg));
}
+
77
159
} // namespace cuda::experimental
78
160
#endif // __CUDAX_ALGORITHM_COPY
0 commit comments