Skip to content

Commit

Permalink
[CUDAX] Add initial bits of copy_bytes and fill_bytes (#2608)
Browse files Browse the repository at this point in the history
  • Loading branch information
pciolkosz authored Nov 1, 2024
1 parent 0b02ae7 commit b6323ce
Show file tree
Hide file tree
Showing 11 changed files with 496 additions and 20 deletions.
54 changes: 54 additions & 0 deletions cudax/include/cuda/experimental/__algorithm/common.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM_COMMON
#define __CUDAX_ALGORITHM_COMMON

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__type_traits/is_convertible.h>
#include <cuda/std/span>

#include <cuda/experimental/__launch/launch_transform.cuh>

namespace cuda::experimental
{
// Shared constraint for copy_bytes/fill_bytes arguments: after being passed through
// __launch_transform, the argument must behave like a contiguous range of elements.
#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
// C++20 path: state the requirement directly as a concept.
template <typename _Tp>
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;

#else
// Pre-C++20 fallback: emulate the concept with variable templates.
// Primary template: by default a type is not considered convertible to a span.
template <typename _Tp, typename = int>
inline constexpr bool __convertible_to_span = false;

// SFINAE specialization selected when _Tp converts to
// span<typename decay_t<_Tp>::value_type>.
template <typename _Tp>
inline constexpr bool __convertible_to_span<
_Tp,
_CUDA_VSTD::enable_if_t<
_CUDA_VSTD::is_convertible_v<_Tp, _CUDA_VSTD::span<typename _CUDA_VSTD::decay_t<_Tp>::value_type>>,
int>> = true;

// Accept either a contiguous range (after transform) or anything span-convertible.
template <typename _Tp>
inline constexpr bool __valid_copy_fill_argument =
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;

#endif

} // namespace cuda::experimental
#endif //__CUDAX_ALGORITHM_COMMON
79 changes: 79 additions & 0 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM_COPY
#define __CUDAX_ALGORITHM_COPY

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__concepts/__concept_macros.h>

#include <cuda/experimental/__algorithm/common.cuh>
#include <cuda/experimental/__stream/stream_ref.cuh>

namespace cuda::experimental
{

//! @brief Implementation of the bytewise copy: validates the destination size and
//! enqueues a `cudaMemcpyAsync` on the provided stream.
//!
//! @param __stream Stream that the copy is inserted into
//! @param __src Source span; all of its bytes are copied
//! @param __dst Destination span; must be large enough to hold the source bytes
//!
//! @throws invalid_argument if the destination is smaller than the source;
//! the CUDA error is surfaced via _CCCL_TRY_CUDA_API if the copy fails to launch
template <typename _SrcTy, typename _DstTy>
void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUDA_VSTD::span<_DstTy> __dst)
{
  static_assert(!_CUDA_VSTD::is_const_v<_DstTy>, "Copy destination can't be const");
  // Message added for consistency with the assert above (and clearer diagnostics).
  static_assert(_CUDA_VSTD::is_trivially_copyable_v<_SrcTy> && _CUDA_VSTD::is_trivially_copyable_v<_DstTy>,
                "Copy source and destination types must be trivially copyable");

  // Partial copies are not allowed: the destination must fit every source byte.
  if (__src.size_bytes() > __dst.size_bytes())
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination is too small to fit the source data");
  }

  // TODO pass copy direction hint once we have span with properties
  // cudaMemcpyDefault lets the driver infer the direction from the pointer values.
  _CCCL_TRY_CUDA_API(
    ::cudaMemcpyAsync,
    "Failed to perform a copy",
    __dst.data(),
    __src.data(),
    __src.size_bytes(),
    cudaMemcpyDefault,
    __stream.get());
}

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination need to either be a `contiguous_range` or
//! implicitly/launch transform into one.
//! Both the source and the destination type are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both destination and source are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_LIBCUDACXX_TEMPLATE(typename _SrcTy, typename _DstTy)
_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_SrcTy> _LIBCUDACXX_AND __valid_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
// Transform both arguments for the stream (e.g. unwrap buffers into spans),
// then forward the resulting spans to the implementation.
__copy_bytes_impl(
__stream,
_CUDA_VSTD::span(static_cast<detail::__as_copy_arg_t<_SrcTy>>(
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src)))),
_CUDA_VSTD::span(static_cast<detail::__as_copy_arg_t<_DstTy>>(
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_COPY
63 changes: 63 additions & 0 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM_FILL
#define __CUDAX_ALGORITHM_FILL

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__concepts/__concept_macros.h>

#include <cuda/experimental/__algorithm/common.cuh>
#include <cuda/experimental/__stream/stream_ref.cuh>

namespace cuda::experimental
{

//! @brief Implementation of the bytewise fill: enqueues a `cudaMemsetAsync` that
//! writes `__value` into every byte of `__dst` on the provided stream.
//!
//! @param __stream Stream that the fill is inserted into
//! @param __dst Destination span whose bytes are overwritten
//! @param __value Byte value written into every byte of the destination
template <typename _DstTy, ::std::size_t _DstSize>
void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> __dst, uint8_t __value)
{
  static_assert(!_CUDA_VSTD::is_const_v<_DstTy>, "Fill destination can't be const");
  // Message added for consistency with the assert above (and clearer diagnostics).
  static_assert(_CUDA_VSTD::is_trivially_copyable_v<_DstTy>, "Fill destination type must be trivially copyable");

  // TODO do a host callback if not device accessible?
  _CCCL_TRY_CUDA_API(
    ::cudaMemsetAsync, "Failed to perform a fill", __dst.data(), __value, __dst.size_bytes(), __stream.get());
}

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
//! into one. It can't reside in pageable host memory.
//! Destination type is required to be trivially copyable.
//!
//! @param __stream Stream that the fill should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_LIBCUDACXX_TEMPLATE(typename _DstTy)
_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
// Transform the destination for the stream (e.g. unwrap a buffer into a span),
// then forward the resulting span to the implementation.
__fill_bytes_impl(__stream,
_CUDA_VSTD::span(static_cast<detail::__as_copy_arg_t<_DstTy>>(
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))),
__value);
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_FILL
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,24 @@ private:

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch.
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp>
__cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
// TODO add auto synchronization
return {__self.__get_data(), __self.size()};
}

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<const _Tp>
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto
__cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
// TODO add auto synchronization
return {__self.__get_data(), __self.size()};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,21 +90,21 @@ private:

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch.
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp>
__cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
return {__self.__get_data(), __self.size()};
}

//! @brief Causes the buffer to be treated as a span when passed to cudax::launch
//! @pre The buffer must have the cuda::mr::device_accessible property.
_CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<const _Tp>
__cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept
template <class _Tp2 = _Tp>
_CCCL_NODISCARD_FRIEND auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept
_LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
_CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
{
static_assert(_CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>,
"The buffer must be device accessible to be passed to `launch`");
return {__self.__get_data(), __self.size()};
}

Expand Down
20 changes: 15 additions & 5 deletions cudax/include/cuda/experimental/__launch/launch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,29 @@ template <typename _Arg>
using __launch_transform_result_t = decltype(__fn{}(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>()));

template <typename _Arg, typename _Enable = void>
struct __as_kernel_arg
struct __as_copy_arg
{
using type = _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>;
using type = __launch_transform_result_t<_Arg>;
};

// Copy needs to know if original value is a reference
template <typename _Arg>
struct __as_kernel_arg<
_Arg,
_CUDA_VSTD::void_t<typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg>>
struct __as_copy_arg<_Arg,
_CUDA_VSTD::void_t<typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg>>
{
using type = typename _CUDA_VSTD::decay_t<__launch_transform_result_t<_Arg>>::__as_kernel_arg;
};

template <typename _Arg>
using __as_copy_arg_t = typename detail::__as_copy_arg<_Arg>::type;

// While kernel argument can't be a reference
template <typename _Arg>
struct __as_kernel_arg
{
using type = _CUDA_VSTD::decay_t<typename __as_copy_arg<_Arg>::type>;
};

_CCCL_GLOBAL_CONSTANT __fn __launch_transform{};
} // namespace detail

Expand Down
17 changes: 17 additions & 0 deletions cudax/include/cuda/experimental/algorithm.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_ALGORITHM__
#define __CUDAX_ALGORITHM__

// Umbrella header for the cudax algorithms: pulls in copy_bytes and fill_bytes.
#include <cuda/experimental/__algorithm/copy.cuh>
#include <cuda/experimental/__algorithm/fill.cuh>

#endif // __CUDAX_ALGORITHM__
6 changes: 6 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ foreach(cn_target IN LISTS cudax_TARGETS)
cudax_add_catch2_test(test_target green_context ${cn_target}
green_context/green_ctx_smoke.cu
)

cudax_add_catch2_test(test_target algorithm ${cn_target}
algorithm/fill.cu
algorithm/copy.cu
)

endforeach()

# FIXME: Enable MSVC
Expand Down
Loading

0 comments on commit b6323ce

Please sign in to comment.