Skip to content

Commit

Permalink
Merge branch 'main' into mdspan-dims
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Nov 25, 2024
2 parents da1ffe0 + a085ba1 commit b490614
Show file tree
Hide file tree
Showing 204 changed files with 1,985 additions and 1,231 deletions.
19 changes: 18 additions & 1 deletion c2h/include/c2h/generators.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,24 @@
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
# endif // _CCCL_HAS_NVFP16

# if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
# include <cuda_fp8.h>
_CCCL_DIAG_POP
# endif // _CCCL_CUDACC_AT_LEAST(11, 8)
# endif // _CCCL_HAS_NVBF16

# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
Expand Down
46 changes: 20 additions & 26 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -106,23 +106,19 @@ template <int _BLOCK_THREADS,
int _VEC_SIZE = 4>
struct AgentHistogramPolicy
{
enum
{
/// Threads per thread block
BLOCK_THREADS = _BLOCK_THREADS,

/// Pixels per thread (per tile of input)
PIXELS_PER_THREAD = _PIXELS_PER_THREAD,
/// Threads per thread block
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
/// Pixels per thread (per tile of input)
static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD;

/// Whether to perform localized RLE to compress samples before histogramming
IS_RLE_COMPRESS = _RLE_COMPRESS,
/// Whether to perform localized RLE to compress samples before histogramming
static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS;

/// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
MEM_PREFERENCE = _MEM_PREFERENCE,
/// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE;

/// Whether to dequeue tiles from a global work queue
IS_WORK_STEALING = _WORK_STEALING,
};
/// Whether to dequeue tiles from a global work queue
static constexpr bool IS_WORK_STEALING = _WORK_STEALING;

/// Vector size for samples loading (1, 2, 4)
static constexpr int VEC_SIZE = _VEC_SIZE;
Expand Down Expand Up @@ -202,23 +198,21 @@ struct AgentHistogram
using VecT = typename CubVector<SampleT, VecSize>::Type;

/// Constants
enum
{
BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS;

PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize,
static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD;
static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS;
static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize;

TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS;
static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS;

IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS;

MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM,
static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
(PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM;

IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING,
};
static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING;

/// Cache load modifier for reading input elements
static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/detail/fast_modulo_division.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,16 @@
# pragma system_header
#endif // no system header

#include <cub/detail/type_traits.cuh> // implicit_prom_t
#include <cub/util_type.cuh> // CUB_IS_INT128_ENABLED

#include <cuda/cmath> // cuda::std::ceil_div
#include <cuda/std/bit> // std::has_single_bit
#include <cuda/std/climits> // CHAR_BIT
#include <cuda/std/cstdint> // uint64_t
#include <cuda/std/limits> // numeric_limits
#include <cuda/std/type_traits> // std::is_integral

#include "cub/detail/type_traits.cuh" // implicit_prom_t
#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED

#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
_CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ struct dispatch_histogram
privatized_decode_op, privatized_decode_op + NUM_ACTIVE_CHANNELS, privatized_decode_op_wrapper.begin());
::cuda::std::copy(output_decode_op, output_decode_op + NUM_ACTIVE_CHANNELS, output_decode_op_wrapper.begin());

auto minus_one = cuda::proclaim_return_type<int>([](int levels) {
auto minus_one = ::cuda::proclaim_return_type<int>([](int levels) {
return levels - 1;
});
::cuda::std::transform(
Expand Down
11 changes: 11 additions & 0 deletions cub/cub/thread/thread_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
#include <cuda/std/type_traits> // cuda::std::common_type
#include <cuda/std/utility> // cuda::std::forward

#if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16

#if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP
#endif // _CCCL_HAS_NVBF16

CUB_NAMESPACE_BEGIN

// TODO(bgruber): deprecate in C++17 with a note: "replace by decltype(cuda::std::not_fn(EqualityOp{}))"
Expand Down
11 changes: 11 additions & 0 deletions cub/cub/thread/thread_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@
#include <cuda/std/cstdint> // uint16_t
#include <cuda/std/functional> // cuda::std::plus

#if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16

#if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP
#endif // _CCCL_HAS_NVBF16

CUB_NAMESPACE_BEGIN

//! @rst
Expand Down
9 changes: 9 additions & 0 deletions cub/cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,16 @@
#include <cuda/std/limits>
#include <cuda/std/type_traits>

#if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16

#if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
Expand Down
24 changes: 22 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@

#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__type_traits/is_convertible.h>
#include <cuda/std/mdspan>
#include <cuda/std/span>

#include <cuda/experimental/__launch/launch_transform.cuh>

namespace cuda::experimental
{

#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
template <typename _Tp>
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;

#else
template <typename _Tp, typename = int>
Expand All @@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span<
int>> = true;

template <typename _Tp>
inline constexpr bool __valid_copy_fill_argument =
inline constexpr bool __valid_1d_copy_fill_argument =
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;

#endif

//! Maps a (possibly reference-qualified) mdspan-like type to the
//! `cuda::std::mdspan` instantiation it corresponds to. The decayed type must
//! expose `value_type`, `extents_type`, `layout_type` and `accessor_type`
//! member aliases (as `cuda::std::mdspan` itself does).
template <typename _Tp, typename _Decayed = _CUDA_VSTD::decay_t<_Tp>>
using __as_mdspan_t =
  _CUDA_VSTD::mdspan<typename _Decayed::value_type,
                     typename _Decayed::extents_type,
                     typename _Decayed::layout_type,
                     typename _Decayed::accessor_type>;

//! Detection trait: true when _Tp is implicitly convertible to the mdspan type
//! named by `__as_mdspan_t<_Tp>`. Primary template is the false fallback.
template <typename _Tp, typename = int>
inline constexpr bool __convertible_to_mdspan = false;

//! Partial specialization selected via SFINAE: if `__as_mdspan_t<_Tp>` is
//! ill-formed (missing member aliases) or the conversion does not exist,
//! substitution fails silently and the primary (false) template is used.
template <typename _Tp>
inline constexpr bool
  __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
    true;

//! Constraint for the multidimensional copy_bytes/fill_bytes overloads: the
//! argument (after launch transform, see `__as_copy_arg_t`) must convert to a
//! `cuda::std::mdspan`.
template <typename _Tp>
inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;

} // namespace cuda::experimental
#endif //__CUDAX_ALGORITHM_COMMON
86 changes: 84 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
//! Both source and destination need to either be a `contiguous_range` or launch-transform to one.
//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain `value_type` member alias.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
Expand All @@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
__copy_bytes_impl(
Expand All @@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
}

//! Compile-time compatibility check between two extents types: same rank, and
//! every pair of static extents is either equal or at least one of the two is
//! dynamic. Primary template is the false fallback for non-extents types.
template <typename _Extents, typename _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents = false;

//! Specialization for a pair of `cuda::std::extents`; defers to libcu++'s
//! internal `__check_compatible_extents` helper — the same check used by
//! mdspan's converting constructors — with the rank equality computed up
//! front. NOTE(review): relies on the `_CUDA_VSTD::__detail` internal
//! namespace, so it may need updating if libcu++ internals change.
template <typename _IndexType,
          _CUDA_VSTD::size_t... _Extents,
          typename _OtherIndexType,
          _CUDA_VSTD::size_t... _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;

//! @brief Returns true when every runtime extent of @p __src_exts equals the
//! corresponding extent of @p __dst_exts.
//!
//! Extents are compared dimension by dimension in the source's `index_type`;
//! each destination extent is converted to that type before comparing.
template <typename _SrcExtents, typename _DstExtents>
_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
{
  using __src_index_t = typename _SrcExtents::index_type;
  using __dst_rank_t  = typename _DstExtents::rank_type;

  const auto __rank = __src_exts.rank();
  for (typename _SrcExtents::rank_type __dim = 0; __dim < __rank; ++__dim)
  {
    const auto __dst_extent = __dst_exts.extent(static_cast<__dst_rank_t>(__dim));
    if (__src_exts.extent(__dim) != static_cast<__src_index_t>(__dst_extent))
    {
      return false;
    }
  }
  return true;
}

//! @brief Internal implementation of the multidimensional bytewise copy.
//!
//! Validates at compile time that the two mdspans have compatible extents and
//! identical layout types, checks the runtime extents match, and then forwards
//! the flattened storage ranges of both mdspans to the 1D `__copy_bytes_impl`.
template <typename _SrcElem,
          typename _SrcExtents,
          typename _SrcLayout,
          typename _SrcAccessor,
          typename _DstElem,
          typename _DstExtents,
          typename _DstLayout,
          typename _DstAccessor>
void __nd_copy_bytes_impl(stream_ref __stream,
                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
{
  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
                "Multidimensional copy requires both source and destination extents to be compatible");
  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
                "Multidimensional copy requires both source and destination layouts to match");

  // Extents that are dynamic on both sides can only be validated at runtime.
  if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents()))
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
  }

  // required_span_size() is the number of elements covered by the mapping, so
  // each span describes the full storage region addressed by its mdspan.
  // NOTE(review): identical layout *types* plus equal extents do not pin down
  // identical mappings for strided layouts (strides can differ) — confirm
  // callers cannot reach this with mismatched layout_stride mappings.
  __copy_bytes_impl(__stream,
                    _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()),
                    _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size()));
}

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch-transform to
//! one. They can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both source and destination are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
  // decltype(auto) preserves the value category of the launch-transform
  // results, keeping any returned references (and the transformed temporaries
  // they refer to) alive until the copy has been enqueued.
  decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  // Materialize the "copy argument" view of each transformed operand before
  // converting to mdspan, mirroring the 1D overload.
  decltype(auto) __src_as_arg = static_cast<detail::__as_copy_arg_t<_SrcTy>>(__src_transformed);
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  __nd_copy_bytes_impl(
    __stream, __as_mdspan_t<decltype(__src_as_arg)>(__src_as_arg), __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg));
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_COPY
32 changes: 29 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
//! into one. It can't reside in pagable host memory.
//! Destination needs to either be a `contiguous_range` or launch transform
//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain `value_type` member alias.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
__fill_bytes_impl(__stream,
Expand All @@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
__value);
}

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be an instance of `cuda::std::mdspan` or launch-transform
//! into one. It can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
  // decltype(auto) preserves the value category of the launch-transform
  // result, keeping any returned reference (and the transformed temporary it
  // refers to) alive until the fill has been enqueued.
  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
  auto __dst_mdspan = __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg);

  // required_span_size() is the number of elements covered by the mapping, so
  // the span describes the full storage region addressed by the mdspan.
  __fill_bytes_impl(
    __stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value);
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_FILL
Loading

0 comments on commit b490614

Please sign in to comment.