CUB - Enable DPX Reduction (#2286)

Enable Hopper+ DPX (SIMD) reduction for `uint16_t/int16_t` data types and `Min/Max/Sum` operators
NVIDIA · Sep 6, 2024 · 3adc92a · 3adc92a
1 parent e0dad56
commit 3adc92a
Show file tree

Hide file tree

Showing 7 changed files with 473 additions and 89 deletions.
diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+// NOTE: this benchmark is intented to cover DPX instructions on Hopper+ architectures.
+//       It specifically uses cub::Min instead of a user-defined operator.
+#define TUNE_T int16_t
+#include <nvbench_helper.cuh>
+
+// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
+// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
+// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
+
+using op_t = cub::Min;
+#include "base.cuh"
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
@@ -50,6 +50,8 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH
 _CCCL_SUPPRESS_DEPRECATED_POP
 #include <cuda/std/type_traits>
 
+#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::__enable_if_t<(__VA_ARGS__)>* = nullptr
+
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
@@ -62,5 +64,101 @@ using invoke_result_t =
   ::cuda::std::invoke_result_t<Invokable, Args...>;
 #endif
 
+template <typename T, typename... TArgs>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
+{
+  return ::cuda::std::conjunction<::cuda::std::is_same<T, TArgs>...>::value;
+}
+
+template <typename T, typename... TArgs>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
+{
+  return ::cuda::std::disjunction<::cuda::std::is_same<T, TArgs>...>::value;
+}
+
+template <typename...>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
+{
+  return false;
+}
+
+template <typename T, typename V, typename = void>
+struct has_binary_call_operator : ::cuda::std::false_type
+{};
+
+template <typename T, typename V>
+struct has_binary_call_operator<
+  T,
+  V,
+  ::cuda::std::void_t<decltype(::cuda::std::declval<T>()(::cuda::std::declval<V>(), ::cuda::std::declval<V>()))>>
+    : ::cuda::std::true_type
+{};
+
+/***********************************************************************************************************************
+ * Array like type traits
+ **********************************************************************************************************************/
+
+template <typename T, typename = void>
+struct has_subscript : ::cuda::std::false_type
+{};
+
+template <typename T>
+struct has_subscript<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>()[0])>> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_subscript_t = typename has_subscript<T>::type;
+
+template <typename T, typename = void>
+struct has_size : ::cuda::std::false_type
+{};
+
+// TODO: use ::cuda::std::size(::cuda::std::declval<T>()) when std::size will be available in libcu++
+template <typename T>
+struct has_size<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>().size())>> : ::cuda::std::true_type
+{};
+
+template <typename T, ::cuda::std::size_t N>
+struct has_size<T[N], void> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_size_t = typename has_size<T>::type;
+
+/***********************************************************************************************************************
+ * StaticSize: a type trait that returns the number of elements in an Array-like type
+ **********************************************************************************************************************/
+// StaticSize is useful where size(obj) cannot be checked at compile time
+// e.g.
+// using Array = NonTriviallyConstructible[8];
+// std::size(Array{})   // compile error
+// static_size<Array>() // ok
+
+template <typename T, typename = void>
+struct StaticSize
+{
+  static_assert(detail::always_false<T>(), "StaticSize not supported for this type");
+};
+
+template <typename T>
+struct StaticSize<T,
+                  ::cuda::std::void_t<decltype(::cuda::std::integral_constant<int, ::cuda::std::declval<T>().size()>{})>>
+{
+  static_assert(::cuda::std::is_trivially_constructible<T>::value, "T must be trivially constructible");
+  static constexpr auto value = T{}.size();
+};
+
+template <typename T, ::cuda::std::size_t N>
+struct StaticSize<T[N], void>
+{
+  static constexpr auto value = N;
+};
+
+template <typename T>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::size_t static_size()
+{
+  return StaticSize<T>::value;
+}
+
 } // namespace detail
 CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
@@ -47,14 +47,15 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/type_traits.cuh> // always_false
 #include <cub/util_cpp_dialect.cuh>
 #include <cub/util_type.cuh>
 
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-#include <cuda/std/functional>
-_CCCL_SUPPRESS_DEPRECATED_POP
-#include <cuda/std/type_traits>
-#include <cuda/std/utility>
+#include <cuda/std/functional> // cuda::std::plus
+#include <cuda/std/type_traits> // cuda::std::common_type
+#include <cuda/std/utility> // cuda::std::forward
+
+// #include <functional> // std::plus
 
 CUB_NAMESPACE_BEGIN
 
@@ -413,4 +414,121 @@ _CCCL_HOST_DEVICE BinaryFlip<BinaryOpT> MakeBinaryFlip(BinaryOpT binary_op)
   return BinaryFlip<BinaryOpT>(binary_op);
 }
 
+namespace internal
+{
+// TODO: Remove DPX specilization when nvbug 4823237 is fixed
+
+template <typename T>
+struct DpxMin
+{
+  static_assert(detail::always_false<T>(), "DpxMin is not supported for this type");
+};
+
+template <>
+struct DpxMin<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmins2(a, b);
+  }
+};
+
+template <>
+struct DpxMin<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vminu2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+struct DpxMax
+{
+  static_assert(detail::always_false<T>(), "DpxMax is not supported for this type");
+};
+
+template <>
+struct DpxMax<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmaxs2(a, b);
+  }
+};
+
+template <>
+struct DpxMax<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vmaxu2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+struct DpxSum
+{
+  static_assert(detail::always_false<T>(), "DpxSum is not supported for this type");
+};
+
+template <>
+struct DpxSum<::cuda::std::int16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vadd2(a, b);
+  }
+};
+
+template <>
+struct DpxSum<::cuda::std::uint16_t>
+{
+  _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+  {
+    return __vadd2(a, b);
+  }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename ReduceOp, typename T>
+struct CubOperatorToDpx
+{
+  static_assert(detail::always_false<T>(), "Dpx is not supported for this operator");
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Min, T>
+{
+  using type = DpxMin<T>;
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Max, T>
+{
+  using type = DpxMax<T>;
+};
+
+template <typename T>
+struct CubOperatorToDpx<cub::Sum, T>
+{
+  using type = DpxSum<T>;
+};
+
+// template <typename T>
+// struct CubOperatorToDpx<std::plus<T>, T>
+//{
+//   using type = DpxSum<T>;
+// };
+
+template <typename ReduceOp, typename T>
+using cub_operator_to_dpx_t = CubOperatorToDpx<ReduceOp, T>;
+
+} // namespace internal
+
 CUB_NAMESPACE_END