diff --git a/cudax/include/cuda/experimental/__algorithm/common.cuh b/cudax/include/cuda/experimental/__algorithm/common.cuh new file mode 100644 index 00000000000..9dd891f7b28 --- /dev/null +++ b/cudax/include/cuda/experimental/__algorithm/common.cuh @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ALGORITHM_COMMON +#define __CUDAX_ALGORITHM_COMMON + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental +{ +#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES) +template +concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range>; + +#else +template +inline constexpr bool __convertible_to_span = false; + +template +inline constexpr bool __convertible_to_span< + _Tp, + _CUDA_VSTD::enable_if_t< + _CUDA_VSTD::is_convertible_v<_Tp, _CUDA_VSTD::span::value_type>>, + int>> = true; + +template +inline constexpr bool __valid_copy_fill_argument = + _CUDA_VRANGES::contiguous_range> || __convertible_to_span<_Tp>; + +#endif + +} // namespace cuda::experimental +#endif //__CUDAX_ALGORITHM_COMMON diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh new file mode 100644 index 00000000000..9054bf0ea5e --- /dev/null +++ 
b/cudax/include/cuda/experimental/__algorithm/copy.cuh @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ALGORITHM_COPY +#define __CUDAX_ALGORITHM_COPY + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +namespace cuda::experimental +{ + +template +void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUDA_VSTD::span<_DstTy> __dst) +{ + static_assert(!_CUDA_VSTD::is_const_v<_DstTy>, "Copy destination can't be const"); + static_assert(_CUDA_VSTD::is_trivially_copyable_v<_SrcTy> && _CUDA_VSTD::is_trivially_copyable_v<_DstTy>); + + if (__src.size_bytes() > __dst.size_bytes()) + { + _CUDA_VSTD::__throw_invalid_argument("Copy destination is too small to fit the source data"); + } + + // TODO pass copy direction hint once we have span with properties + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "Failed to perform a copy", + __dst.data(), + __src.data(), + __src.size_bytes(), + cudaMemcpyDefault, + __stream.get()); +} + +//! @brief Launches a bytewise memory copy from source to destination into the provided stream. +//! +//! Both source and destination need to either be a `contiguous_range` or +//! implicitly/launch transform to one. +//! Both source and destination types are required to be trivially copyable. +//! +//! 
This call might be synchronous if either source or destination is pageable host memory. +//! It will be synchronous if both source and destination are located in host memory. +//! +//! @param __stream Stream that the copy should be inserted into +//! @param __src Source to copy from +//! @param __dst Destination to copy into +_LIBCUDACXX_TEMPLATE(typename _SrcTy, typename _DstTy) +_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_SrcTy> _LIBCUDACXX_AND __valid_copy_fill_argument<_DstTy>) +void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) +{ + __copy_bytes_impl( + __stream, + _CUDA_VSTD::span(static_cast>( + detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src)))), + _CUDA_VSTD::span(static_cast>( + detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst))))); +} + +} // namespace cuda::experimental +#endif // __CUDAX_ALGORITHM_COPY diff --git a/cudax/include/cuda/experimental/__algorithm/fill.cuh b/cudax/include/cuda/experimental/__algorithm/fill.cuh new file mode 100644 index 00000000000..4fef2777e87 --- /dev/null +++ b/cudax/include/cuda/experimental/__algorithm/fill.cuh @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ALGORITHM_FILL +#define __CUDAX_ALGORITHM_FILL + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +namespace cuda::experimental +{ + +template +void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> __dst, uint8_t __value) +{ + static_assert(!_CUDA_VSTD::is_const_v<_DstTy>, "Fill destination can't be const"); + static_assert(_CUDA_VSTD::is_trivially_copyable_v<_DstTy>); + + // TODO do a host callback if not device accessible? + _CCCL_TRY_CUDA_API( + ::cudaMemsetAsync, "Failed to perform a fill", __dst.data(), __value, __dst.size_bytes(), __stream.get()); +} + +//! @brief Launches an operation to bytewise fill the memory into the provided stream. +//! +//! Destination needs to either be a `contiguous_range` or implicitly/launch transform +//! into one. It can't reside in pageable host memory. +//! Destination type is required to be trivially copyable. +//! +//! @param __stream Stream that the fill should be inserted into +//! @param __dst Destination memory to fill +//! 
@param __value Value to fill into every byte in the destination +_LIBCUDACXX_TEMPLATE(typename _DstTy) +_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_DstTy>) +void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) +{ + __fill_bytes_impl(__stream, + _CUDA_VSTD::span(static_cast>( + detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))), + __value); +} + +} // namespace cuda::experimental +#endif // __CUDAX_ALGORITHM_FILL diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index e5fc07b2252..d9679c41575 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -100,21 +100,24 @@ private: //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. //! @pre The buffer must have the cuda::mr::device_accessible property. - _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp> - __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept + template + _CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)( + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) { - static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>, - "The buffer must be device accessible to be passed to `launch`"); + // TODO add auto synchronization return {__self.__get_data(), __self.size()}; } //! @brief Causes the buffer to be treated as a span when passed to cudax::launch //! @pre The buffer must have the cuda::mr::device_accessible property. 
- _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span + template + _CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span)( + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) { - static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>, - "The buffer must be device accessible to be passed to `launch`"); + // TODO add auto synchronization return {__self.__get_data(), __self.size()}; } diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 221da6a6e0e..c57e4bc5eac 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -90,21 +90,21 @@ private: //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. //! @pre The buffer must have the cuda::mr::device_accessible property. - _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp> - __cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept + template + _CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)( + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) { - static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>, - "The buffer must be device accessible to be passed to `launch`"); return {__self.__get_data(), __self.size()}; } //! @brief Causes the buffer to be treated as a span when passed to cudax::launch //! @pre The buffer must have the cuda::mr::device_accessible property. 
- _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span - __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept + template + _CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept + _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span)( + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) { - static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>, - "The buffer must be device accessible to be passed to `launch`"); return {__self.__get_data(), __self.size()}; } diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh index 15ebce61169..284cde01124 100644 --- a/cudax/include/cuda/experimental/__launch/launch_transform.cuh +++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh @@ -59,19 +59,29 @@ template using __launch_transform_result_t = decltype(__fn{}(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); template -struct __as_kernel_arg +struct __as_copy_arg { - using type = _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>; + using type = __launch_transform_result_t<_Arg>; }; +// Copy needs to know if original value is a reference template -struct __as_kernel_arg< - _Arg, - _CUDA_VSTD::void_t>::__as_kernel_arg>> +struct __as_copy_arg<_Arg, + _CUDA_VSTD::void_t>::__as_kernel_arg>> { using type = typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg; }; +template +using __as_copy_arg_t = typename detail::__as_copy_arg<_Arg>::type; + +// While kernel argument can't be a reference +template +struct __as_kernel_arg +{ + using type = _CUDA_VSTD::decay_t::type>; +}; + _CCCL_GLOBAL_CONSTANT __fn __launch_transform{}; } // namespace detail diff --git a/cudax/include/cuda/experimental/algorithm.cuh b/cudax/include/cuda/experimental/algorithm.cuh new file mode 100644 index 
00000000000..0de41b72012 --- /dev/null +++ b/cudax/include/cuda/experimental/algorithm.cuh @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ALGORITHM__ +#define __CUDAX_ALGORITHM__ + +#include +#include + +#endif // __CUDAX_ALGORITHM__ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 7e71e758d52..38f826c0b4e 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -122,6 +122,12 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target green_context ${cn_target} green_context/green_ctx_smoke.cu ) + + cudax_add_catch2_test(test_target algorithm ${cn_target} + algorithm/fill.cu + algorithm/copy.cu + ) + endforeach() # FIXME: Enable MSVC diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh new file mode 100644 index 00000000000..2789a1f4802 --- /dev/null +++ b/cudax/test/algorithm/common.cuh @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __ALGORITHM_COMMON__ +#define __ALGORITHM_COMMON__ + +#include + +#include +#include +#include + +#include +#include + +inline constexpr uint8_t fill_byte = 1; +inline constexpr uint32_t buffer_size = 42; + +inline int get_expected_value(uint8_t pattern_byte) +{ + int result; + memset(&result, pattern_byte, sizeof(int)); + return result; +} + +template +void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t pattern_byte = fill_byte) +{ + int expected = get_expected_value(pattern_byte); + + stream.wait(); + for (int& i : result) + { + CUDAX_REQUIRE(i == expected); + i = 0; + } +} + +namespace cuda::experimental +{ + +// Need a type that goes through all launch_transform steps, but is not a contiguous_range +struct weird_buffer +{ + const cuda::mr::pinned_memory_resource& resource; + int* data; + std::size_t size; + + weird_buffer(const cuda::mr::pinned_memory_resource& res, std::size_t s) + : resource(res) + , data((int*) res.allocate(s * sizeof(int))) + , size(s) + {} + + ~weird_buffer() + { + resource.deallocate(data, size); + } + + weird_buffer(const weird_buffer&) = delete; + weird_buffer(weird_buffer&&) = delete; + + struct transform_result + { + int* data; + std::size_t size; + + using __as_kernel_arg = cuda::std::span; + + operator cuda::std::span() + { + return {data, size}; + } + }; + + _CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept + { + return {self.data, self.size}; + } +}; + +static_assert(std::is_same_v, cuda::std::span>); + +} // namespace cuda::experimental + +#endif // __ALGORITHM_COMMON__ diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu new file mode 100644 index 00000000000..0066f4feba9 --- /dev/null +++ b/cudax/test/algorithm/copy.cu @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// 
+// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include "common.cuh" + +TEST_CASE("Copy", "[data_manipulation]") +{ + cudax::stream _stream; + + SECTION("Device resource") + { + cudax::mr::async_memory_resource device_resource; + std::vector host_vector(buffer_size); + + { + cudax::uninitialized_async_buffer buffer(device_resource, _stream, buffer_size); + cudax::fill_bytes(_stream, buffer, fill_byte); + + cudax::copy_bytes(_stream, buffer, host_vector); + check_result_and_erase(_stream, host_vector); + + cudax::copy_bytes(_stream, std::move(buffer), host_vector); + check_result_and_erase(_stream, host_vector); + } + { + cudax::uninitialized_async_buffer not_yet_const_buffer( + device_resource, _stream, buffer_size); + cudax::fill_bytes(_stream, not_yet_const_buffer, fill_byte); + + const auto& const_buffer = not_yet_const_buffer; + + cudax::copy_bytes(_stream, const_buffer, host_vector); + check_result_and_erase(_stream, host_vector); + + cudax::copy_bytes(_stream, const_buffer, cuda::std::span(host_vector)); + check_result_and_erase(_stream, host_vector); + } + } + + SECTION("Host and managed resource") + { + cuda::mr::managed_memory_resource managed_resource; + cuda::mr::pinned_memory_resource host_resource; + + { + cudax::uninitialized_buffer host_buffer(host_resource, buffer_size); + cudax::uninitialized_buffer device_buffer(managed_resource, buffer_size); + + cudax::fill_bytes(_stream, host_buffer, fill_byte); + + cudax::copy_bytes(_stream, host_buffer, device_buffer); + check_result_and_erase(_stream, device_buffer); + + cudax::copy_bytes(_stream, cuda::std::span(host_buffer), device_buffer); + 
check_result_and_erase(_stream, device_buffer); + } + + { + cudax::uninitialized_buffer not_yet_const_host_buffer(host_resource, buffer_size); + cudax::uninitialized_buffer device_buffer(managed_resource, buffer_size); + cudax::fill_bytes(_stream, not_yet_const_host_buffer, fill_byte); + + const auto& const_host_buffer = not_yet_const_host_buffer; + + cudax::copy_bytes(_stream, const_host_buffer, device_buffer); + check_result_and_erase(_stream, device_buffer); + + cudax::copy_bytes(_stream, cuda::std::span(const_host_buffer), device_buffer); + check_result_and_erase(_stream, device_buffer); + } + } + SECTION("Launch transform") + { + cuda::mr::pinned_memory_resource host_resource; + cudax::weird_buffer input(host_resource, buffer_size); + cudax::weird_buffer output(host_resource, buffer_size); + + memset(input.data, fill_byte, input.size * sizeof(int)); + + cudax::copy_bytes(_stream, input, output); + check_result_and_erase(_stream, cuda::std::span(output.data, output.size)); + } + + SECTION("Asymetric size") + { + cuda::mr::pinned_memory_resource host_resource; + cudax::uninitialized_buffer host_buffer(host_resource, 1); + cudax::fill_bytes(_stream, host_buffer, fill_byte); + + ::std::vector vec(buffer_size, 0xbeef); + + cudax::copy_bytes(_stream, host_buffer, vec); + _stream.wait(); + + CUDAX_REQUIRE(vec[0] == get_expected_value(fill_byte)); + CUDAX_REQUIRE(vec[1] == 0xbeef); + } +} diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu new file mode 100644 index 00000000000..7111aa848f3 --- /dev/null +++ b/cudax/test/algorithm/fill.cu @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include "common.cuh" + +TEST_CASE("Fill", "[data_manipulation]") +{ + cudax::stream _stream; + SECTION("Host resource") + { + cuda::mr::pinned_memory_resource host_resource; + cudax::uninitialized_buffer buffer(host_resource, buffer_size); + + cudax::fill_bytes(_stream, buffer, fill_byte); + + check_result_and_erase(_stream, cuda::std::span(buffer)); + } + + SECTION("Device resource") + { + cuda::mr::device_memory_resource device_resource; + cudax::uninitialized_buffer buffer(device_resource, buffer_size); + cudax::fill_bytes(_stream, buffer, fill_byte); + + std::vector host_vector(42); + CUDART(cudaMemcpyAsync( + host_vector.data(), buffer.data(), buffer.size() * sizeof(int), cudaMemcpyDefault, _stream.get())); + + check_result_and_erase(_stream, host_vector); + } + SECTION("Launch transform") + { + cuda::mr::pinned_memory_resource host_resource; + cudax::weird_buffer buffer(host_resource, buffer_size); + + cudax::fill_bytes(_stream, buffer, fill_byte); + check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size)); + } +}