Skip to content

Commit

Permalink
[CUDAX] Add initial bits of copy_bytes and fill_bytes (#2608)
Browse files Browse the repository at this point in the history
  • Loading branch information
pciolkosz authored Nov 1, 2024
1 parent 0b02ae7 commit b6323ce
Show file tree
Hide file tree
Showing 11 changed files with 496 additions and 20 deletions.
54 changes: 54 additions & 0 deletions cudax/include/cuda/experimental/__algorithm/common.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM_COMMON
#define __CUDAX_ALGORITHM_COMMON

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__type_traits/is_convertible.h>
#include <cuda/std/span>

#include <cuda/experimental/__launch/launch_transform.cuh>

namespace cuda::experimental
{
// Shared constraint for copy_bytes/fill_bytes arguments: after being passed through
// __launch_transform, the argument must behave like a contiguous range of elements.
#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
// C++20 path: state the requirement directly as a concept.
template <typename _Tp>
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;

#else
// Pre-C++20 fallback: emulate the concept with variable templates.
// Primary template: by default a type is not considered convertible to a span.
template <typename _Tp, typename = int>
inline constexpr bool __convertible_to_span = false;

// SFINAE specialization selected when _Tp converts to
// span<typename decay_t<_Tp>::value_type>.
template <typename _Tp>
inline constexpr bool __convertible_to_span<
_Tp,
_CUDA_VSTD::enable_if_t<
_CUDA_VSTD::is_convertible_v<_Tp, _CUDA_VSTD::span<typename _CUDA_VSTD::decay_t<_Tp>::value_type>>,
int>> = true;

// Accept either a contiguous range (after transform) or anything span-convertible.
template <typename _Tp>
inline constexpr bool __valid_copy_fill_argument =
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;

#endif

} // namespace cuda::experimental
#endif //__CUDAX_ALGORITHM_COMMON
79 changes: 79 additions & 0 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM_COPY
#define __CUDAX_ALGORITHM_COPY

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__concepts/__concept_macros.h>

#include <cuda/experimental/__algorithm/common.cuh>
#include <cuda/experimental/__stream/stream_ref.cuh>

namespace cuda::experimental
{

//! @brief Implementation of the bytewise copy: validates the destination size and
//! enqueues a `cudaMemcpyAsync` on the provided stream.
//!
//! @param __stream Stream that the copy is inserted into
//! @param __src Source span; all of its bytes are copied
//! @param __dst Destination span; must be large enough to hold the source bytes
//!
//! @throws invalid_argument if the destination is smaller than the source;
//! the CUDA error is surfaced via _CCCL_TRY_CUDA_API if the copy fails to launch
template <typename _SrcTy, typename _DstTy>
void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUDA_VSTD::span<_DstTy> __dst)
{
  static_assert(!_CUDA_VSTD::is_const_v<_DstTy>, "Copy destination can't be const");
  // Message added for consistency with the assert above (and clearer diagnostics).
  static_assert(_CUDA_VSTD::is_trivially_copyable_v<_SrcTy> && _CUDA_VSTD::is_trivially_copyable_v<_DstTy>,
                "Copy source and destination types must be trivially copyable");

  // Partial copies are not allowed: the destination must fit every source byte.
  if (__src.size_bytes() > __dst.size_bytes())
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination is too small to fit the source data");
  }

  // TODO pass copy direction hint once we have span with properties
  // cudaMemcpyDefault lets the driver infer the direction from the pointer values.
  _CCCL_TRY_CUDA_API(
    ::cudaMemcpyAsync,
    "Failed to perform a copy",
    __dst.data(),
    __src.data(),
    __src.size_bytes(),
    cudaMemcpyDefault,
    __stream.get());
}

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination need to either be a `contiguous_range` or
//! implicitly/launch transform into one.
//! Both the source and the destination type are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both destination and source are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_LIBCUDACXX_TEMPLATE(typename _SrcTy, typename _DstTy)
_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_SrcTy> _LIBCUDACXX_AND __valid_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
// Transform both arguments for the stream (e.g. unwrap buffers into spans),
// then forward the resulting spans to the implementation.
__copy_bytes_impl(
__stream,
_CUDA_VSTD::span(static_cast<detail::__as_copy_arg_t<_SrcTy>>(
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src)))),
_CUDA_VSTD::span(static_cast<detail::__as_copy_arg_t<_DstTy>>(
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_COPY
63 changes: 63 additions & 0 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM_FILL
#define __CUDAX_ALGORITHM_FILL

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__concepts/__concept_macros.h>

#include <cuda/experimental/__algorithm/common.cuh>
#include <cuda/experimental/__stream/stream_ref.cuh>

namespace cuda::experimental
{

//! @brief Implementation of the bytewise fill: enqueues a `cudaMemsetAsync` that
//! writes `__value` into every byte of `__dst` on the provided stream.
//!
//! @param __stream Stream that the fill is inserted into
//! @param __dst Destination span whose bytes are overwritten
//! @param __value Byte value written into every byte of the destination
template <typename _DstTy, ::std::size_t _DstSize>
void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> __dst, uint8_t __value)
{
  static_assert(!_CUDA_VSTD::is_const_v<_DstTy>, "Fill destination can't be const");
  // Message added for consistency with the assert above (and clearer diagnostics).
  static_assert(_CUDA_VSTD::is_trivially_copyable_v<_DstTy>, "Fill destination type must be trivially copyable");

  // TODO do a host callback if not device accessible?
  _CCCL_TRY_CUDA_API(
    ::cudaMemsetAsync, "Failed to perform a fill", __dst.data(), __value, __dst.size_bytes(), __stream.get());
}

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
//! into one. It can't reside in pageable host memory.
//! Destination type is required to be trivially copyable.
//!
//! @param __stream Stream that the fill should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_LIBCUDACXX_TEMPLATE(typename _DstTy)
_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
// Transform the destination for the stream (e.g. unwrap a buffer into a span),
// then forward the resulting span to the implementation.
__fill_bytes_impl(__stream,
_CUDA_VSTD::span(static_cast<detail::__as_copy_arg_t<_DstTy>>(
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))),
__value);
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_FILL
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,24 @@ private:

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch.
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp>
__cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
// TODO add auto synchronization
return {__self.__get_data(), __self.size()};
}

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<const _Tp>
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto
__cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
// TODO add auto synchronization
return {__self.__get_data(), __self.size()};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,21 +90,21 @@ private:

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch.
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp>
__cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
return {__self.__get_data(), __self.size()};
}

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<const _Tp>
__cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
return {__self.__get_data(), __self.size()};
}

Expand Down
20 changes: 15 additions & 5 deletions cudax/include/cuda/experimental/__launch/launch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,29 @@ template <typename _Arg>
using __launch_transform_result_t = decltype(__fn{}(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>()));

template <typename _Arg, typename _Enable = void>
struct __as_kernel_arg
struct __as_copy_arg
{
using type = _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>;
using type = __launch_transform_result_t<_Arg>;
};

// Copy needs to know if original value is a reference
template <typename _Arg>
struct __as_kernel_arg<
_Arg,
_CUDA_VSTD::void_t<typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg>>
struct __as_copy_arg<_Arg,
_CUDA_VSTD::void_t<typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg>>
{
using type = typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg;
};

template <typename _Arg>
using __as_copy_arg_t = typename detail::__as_copy_arg<_Arg>::type;

// While kernel argument can't be a reference
template <typename _Arg>
struct __as_kernel_arg
{
using type = _CUDA_VSTD::decay_t<typename __as_copy_arg<_Arg>::type>;
};

_CCCL_GLOBAL_CONSTANT __fn __launch_transform{};
} // namespace detail

Expand Down
17 changes: 17 additions & 0 deletions cudax/include/cuda/experimental/algorithm.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM__
#define __CUDAX_ALGORITHM__

// Umbrella header for the cudax algorithms: pulls in copy_bytes and fill_bytes.
#include <cuda/experimental/__algorithm/copy.cuh>
#include <cuda/experimental/__algorithm/fill.cuh>

#endif // __CUDAX_ALGORITHM__
6 changes: 6 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ foreach(cn_target IN LISTS cudax_TARGETS)
cudax_add_catch2_test(test_target green_context ${cn_target}
green_context/green_ctx_smoke.cu
)

cudax_add_catch2_test(test_target algorithm ${cn_target}
algorithm/fill.cu
algorithm/copy.cu
)

endforeach()

# FIXME: Enable MSVC
Expand Down
Loading

0 comments on commit b6323ce

Please sign in to comment.