Skip to content

Commit

Permalink
Merge branch 'main' into mdspan-dims
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Nov 25, 2024
2 parents da1ffe0 + a085ba1 commit b490614
Show file tree
Hide file tree
Showing 204 changed files with 1,985 additions and 1,231 deletions.
19 changes: 18 additions & 1 deletion c2h/include/c2h/generators.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,24 @@
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
# endif // _CCCL_HAS_NVFP16

# if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
# include <cuda_fp8.h>
_CCCL_DIAG_POP
# endif // _CCCL_CUDACC_AT_LEAST(11, 8)
# endif // _CCCL_HAS_NVBF16

# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
Expand Down
46 changes: 20 additions & 26 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -106,23 +106,19 @@ template <int _BLOCK_THREADS,
int _VEC_SIZE = 4>
struct AgentHistogramPolicy
{
enum
{
/// Threads per thread block
BLOCK_THREADS = _BLOCK_THREADS,

/// Pixels per thread (per tile of input)
PIXELS_PER_THREAD = _PIXELS_PER_THREAD,
/// Threads per thread block
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
/// Pixels per thread (per tile of input)
static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD;

/// Whether to perform localized RLE to compress samples before histogramming
IS_RLE_COMPRESS = _RLE_COMPRESS,
/// Whether to perform localized RLE to compress samples before histogramming
static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS;

/// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
MEM_PREFERENCE = _MEM_PREFERENCE,
/// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE;

/// Whether to dequeue tiles from a global work queue
IS_WORK_STEALING = _WORK_STEALING,
};
/// Whether to dequeue tiles from a global work queue
static constexpr bool IS_WORK_STEALING = _WORK_STEALING;

/// Vector size for samples loading (1, 2, 4)
static constexpr int VEC_SIZE = _VEC_SIZE;
Expand Down Expand Up @@ -202,23 +198,21 @@ struct AgentHistogram
using VecT = typename CubVector<SampleT, VecSize>::Type;

/// Constants
enum
{
BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS;

PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize,
static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD;
static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS;
static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize;

TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS;
static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS;

IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS;

MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM,
static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
(PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM;

IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING,
};
static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING;

/// Cache load modifier for reading input elements
static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/detail/fast_modulo_division.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,16 @@
# pragma system_header
#endif // no system header

#include <cub/detail/type_traits.cuh> // implicit_prom_t
#include <cub/util_type.cuh> // CUB_IS_INT128_ENABLED

#include <cuda/cmath> // cuda::std::ceil_div
#include <cuda/std/bit> // std::has_single_bit
#include <cuda/std/climits> // CHAR_BIT
#include <cuda/std/cstdint> // uint64_t
#include <cuda/std/limits> // numeric_limits
#include <cuda/std/type_traits> // std::is_integral

#include "cub/detail/type_traits.cuh" // implicit_prom_t
#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED

#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
_CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ struct dispatch_histogram
privatized_decode_op, privatized_decode_op + NUM_ACTIVE_CHANNELS, privatized_decode_op_wrapper.begin());
::cuda::std::copy(output_decode_op, output_decode_op + NUM_ACTIVE_CHANNELS, output_decode_op_wrapper.begin());

auto minus_one = cuda::proclaim_return_type<int>([](int levels) {
auto minus_one = ::cuda::proclaim_return_type<int>([](int levels) {
return levels - 1;
});
::cuda::std::transform(
Expand Down
11 changes: 11 additions & 0 deletions cub/cub/thread/thread_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
#include <cuda/std/type_traits> // cuda::std::common_type
#include <cuda/std/utility> // cuda::std::forward

#if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16

#if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP
#endif // _CCCL_HAS_NVBF16

CUB_NAMESPACE_BEGIN

// TODO(bgruber): deprecate in C++17 with a note: "replace by decltype(cuda::std::not_fn(EqualityOp{}))"
Expand Down
11 changes: 11 additions & 0 deletions cub/cub/thread/thread_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@
#include <cuda/std/cstdint> // uint16_t
#include <cuda/std/functional> // cuda::std::plus

#if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16

#if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP
#endif // _CCCL_HAS_NVBF16

CUB_NAMESPACE_BEGIN

//! @rst
Expand Down
9 changes: 9 additions & 0 deletions cub/cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,16 @@
#include <cuda/std/limits>
#include <cuda/std/type_traits>

#if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16

#if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
Expand Down
24 changes: 22 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@

#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__type_traits/is_convertible.h>
#include <cuda/std/mdspan>
#include <cuda/std/span>

#include <cuda/experimental/__launch/launch_transform.cuh>

namespace cuda::experimental
{

#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
template <typename _Tp>
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;

#else
template <typename _Tp, typename = int>
Expand All @@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span<
int>> = true;

template <typename _Tp>
inline constexpr bool __valid_copy_fill_argument =
inline constexpr bool __valid_1d_copy_fill_argument =
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;

#endif

//! Maps a (possibly reference-qualified) mdspan-like type to the
//! `cuda::std::mdspan` instantiation it corresponds to. The decayed type must
//! expose `value_type`, `extents_type`, `layout_type` and `accessor_type`
//! member aliases (as `cuda::std::mdspan` itself does).
template <typename _Tp, typename _Decayed = _CUDA_VSTD::decay_t<_Tp>>
using __as_mdspan_t =
  _CUDA_VSTD::mdspan<typename _Decayed::value_type,
                     typename _Decayed::extents_type,
                     typename _Decayed::layout_type,
                     typename _Decayed::accessor_type>;

//! Detection trait: true when _Tp is implicitly convertible to the mdspan type
//! named by `__as_mdspan_t<_Tp>`. Primary template is the false fallback.
template <typename _Tp, typename = int>
inline constexpr bool __convertible_to_mdspan = false;

//! Partial specialization selected via SFINAE: if `__as_mdspan_t<_Tp>` is
//! ill-formed (missing member aliases) or the conversion does not exist,
//! substitution fails silently and the primary (false) template is used.
template <typename _Tp>
inline constexpr bool
  __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
    true;

//! Constraint for the multidimensional copy_bytes/fill_bytes overloads: the
//! argument (after launch transform, see `__as_copy_arg_t`) must convert to a
//! `cuda::std::mdspan`.
template <typename _Tp>
inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;

} // namespace cuda::experimental
#endif //__CUDAX_ALGORITHM_COMMON
86 changes: 84 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
//! Both source and destination need to either be a `contiguous_range` or launch-transform to one.
//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain `value_type` member alias.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
Expand All @@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
__copy_bytes_impl(
Expand All @@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
}

//! Compile-time compatibility check between two extents types: same rank, and
//! every pair of static extents is either equal or at least one of the two is
//! dynamic. Primary template is the false fallback for non-extents types.
template <typename _Extents, typename _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents = false;

//! Specialization for a pair of `cuda::std::extents`; defers to libcu++'s
//! internal `__check_compatible_extents` helper — the same check used by
//! mdspan's converting constructors — with the rank equality computed up
//! front. NOTE(review): relies on the `_CUDA_VSTD::__detail` internal
//! namespace, so it may need updating if libcu++ internals change.
template <typename _IndexType,
          _CUDA_VSTD::size_t... _Extents,
          typename _OtherIndexType,
          _CUDA_VSTD::size_t... _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;

//! @brief Returns true when every runtime extent of @p __src_exts equals the
//! corresponding extent of @p __dst_exts.
//!
//! Extents are compared dimension by dimension in the source's `index_type`;
//! each destination extent is converted to that type before comparing.
template <typename _SrcExtents, typename _DstExtents>
_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
{
  using __src_index_t = typename _SrcExtents::index_type;
  using __dst_rank_t  = typename _DstExtents::rank_type;

  const auto __rank = __src_exts.rank();
  for (typename _SrcExtents::rank_type __dim = 0; __dim < __rank; ++__dim)
  {
    const auto __dst_extent = __dst_exts.extent(static_cast<__dst_rank_t>(__dim));
    if (__src_exts.extent(__dim) != static_cast<__src_index_t>(__dst_extent))
    {
      return false;
    }
  }
  return true;
}

//! @brief Internal implementation of the multidimensional bytewise copy.
//!
//! Validates at compile time that the two mdspans have compatible extents and
//! identical layout types, checks the runtime extents match, and then forwards
//! the flattened storage ranges of both mdspans to the 1D `__copy_bytes_impl`.
template <typename _SrcElem,
          typename _SrcExtents,
          typename _SrcLayout,
          typename _SrcAccessor,
          typename _DstElem,
          typename _DstExtents,
          typename _DstLayout,
          typename _DstAccessor>
void __nd_copy_bytes_impl(stream_ref __stream,
                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
{
  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
                "Multidimensional copy requires both source and destination extents to be compatible");
  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
                "Multidimensional copy requires both source and destination layouts to match");

  // Extents that are dynamic on both sides can only be validated at runtime.
  if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents()))
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
  }

  // required_span_size() is the number of elements covered by the mapping, so
  // each span describes the full storage region addressed by its mdspan.
  // NOTE(review): identical layout *types* plus equal extents do not pin down
  // identical mappings for strided layouts (strides can differ) — confirm
  // callers cannot reach this with mismatched layout_stride mappings.
  __copy_bytes_impl(__stream,
                    _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()),
                    _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size()));
}

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch-transform to
//! one. They can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both source and destination are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
  // decltype(auto) preserves the value category of the launch-transform
  // results, keeping any returned references (and the transformed temporaries
  // they refer to) alive until the copy has been enqueued.
  decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  // Materialize the "copy argument" view of each transformed operand before
  // converting to mdspan, mirroring the 1D overload.
  decltype(auto) __src_as_arg = static_cast<detail::__as_copy_arg_t<_SrcTy>>(__src_transformed);
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  __nd_copy_bytes_impl(
    __stream, __as_mdspan_t<decltype(__src_as_arg)>(__src_as_arg), __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg));
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_COPY
32 changes: 29 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
//! into one. It can't reside in pagable host memory.
//! Destination needs to either be a `contiguous_range` or launch transform
//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain `value_type` member alias.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
__fill_bytes_impl(__stream,
Expand All @@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
__value);
}

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be an instance of `cuda::std::mdspan` or launch-transform
//! into one. It can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
  // decltype(auto) preserves the value category of the launch-transform
  // result, keeping any returned reference (and the transformed temporary it
  // refers to) alive until the fill has been enqueued.
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  auto __dst_mdspan = __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg);

  // required_span_size() is the number of elements covered by the mapping, so
  // the span describes the full storage region addressed by the mdspan.
  __fill_bytes_impl(
    __stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value);
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_FILL
Loading

0 comments on commit b490614

Please sign in to comment.