Adds support for large num_items to DeviceReduce::{ArgMin,ArgMax} #2647

Merged
merged 34 commits into main from enh/large-num-items-reduce-argminmax
Dec 20, 2024
Changes from 18 commits
34 commits
31e555d  adds benchmarks for reduce::arg{min,max} (elstehle, Oct 8, 2024)
30b88a1  preliminary streaming arg-extremum reduction (elstehle, Oct 29, 2024)
3c5f322  fixes implicit conversion (elstehle, Oct 29, 2024)
cc95d62  uses streaming dispatch class (elstehle, Oct 29, 2024)
dad724e  changes arg benches to use new streaming reduce (elstehle, Oct 29, 2024)
3f4cbf4  streaming arg-extrema reduction (elstehle, Oct 29, 2024)
1895d38  fixes style (elstehle, Oct 29, 2024)
ee341e1  fixes compilation failures (elstehle, Oct 30, 2024)
1c8a62c  cleanups (elstehle, Oct 30, 2024)
f8cca48  adds rst style comments (elstehle, Oct 30, 2024)
757672b  declare vars const and use clamp (elstehle, Oct 30, 2024)
14bb6ad  consolidates argmin argmax benchmarks (elstehle, Oct 30, 2024)
92730b6  fixes thrust usage (elstehle, Oct 30, 2024)
d1cac78  drops offset type in arg-extrema benchmarks (elstehle, Oct 31, 2024)
6600bcf  fixes clang cuda (elstehle, Oct 31, 2024)
1d6e6b3  exec space macros (elstehle, Oct 31, 2024)
39bffee  switch to signed global offset type for slightly better perf (elstehle, Oct 31, 2024)
4ffe18b  clarifies documentation (elstehle, Oct 31, 2024)
bb72eb0  Merge remote-tracking branch 'upstream/main' into enh/large-num-items… (elstehle, Nov 1, 2024)
5707234  applies minor benchmark style changes from review comments (elstehle, Nov 1, 2024)
4c0ea6c  fixes interface documentation and comments (elstehle, Nov 1, 2024)
4dcddfb  list-init accumulating output op (elstehle, Nov 1, 2024)
ee1e1dc  improves style, comments, and tests (elstehle, Nov 1, 2024)
0d07c25  cleans up aggregate init (elstehle, Nov 1, 2024)
6d559fb  renames dispatch class usage in benchmarks (elstehle, Nov 1, 2024)
326d3b7  Merge remote-tracking branch 'upstream/main' into enh/large-num-items… (elstehle, Dec 3, 2024)
cad6b8a  fixes merge conflicts (elstehle, Dec 3, 2024)
e57e453  addresses review comments (elstehle, Dec 4, 2024)
ba7e12b  addresses review comments (elstehle, Dec 6, 2024)
952ebc2  fixes assertion (elstehle, Dec 6, 2024)
7544fc2  Merge branch 'main' into enh/large-num-items-reduce-argminmax (elstehle, Dec 19, 2024)
b1afa36  removes superseded implementation (elstehle, Dec 19, 2024)
33393d0  changes large problem tests to use new interface (elstehle, Dec 19, 2024)
1effe5f  removes obsolete tests for deprecated interface (elstehle, Dec 19, 2024)
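In short, this PR widens num_items in DeviceReduce::ArgMin and DeviceReduce::ArgMax from int to ::cuda::std::int64_t and reimplements both on top of the new streaming dispatch (DispatchStreamingArgReduce), which indexes within each partition using a 32-bit offset while tracking the extremum's global position in 64 bits. A minimal caller-side sketch of the updated interface (buffer setup and error handling omitted; names are illustrative):

#include <cub/device/device_reduce.cuh>

#include <cuda/std/cstdint>

// Illustrative buffers: d_in holds more items than a 32-bit int can index
float* d_in = /* device input buffer */;
cub::KeyValuePair<::cuda::std::int64_t, float>* d_out = /* single-element device buffer */;
::cuda::std::int64_t num_items = ::cuda::std::int64_t{1} << 32;

// First call: query the required temporary storage size
void* d_temp_storage      = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Second call: run the reduction; d_out->key is the 64-bit offset of the first minimum
cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

The returned key is now a 64-bit offset, so d_out's value type is typically cub::KeyValuePair<int64_t, T>; other key widths are converted through the helper output iterator shown in the device_reduce.cuh diff below.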
114 changes: 114 additions & 0 deletions cub/benchmarks/bench/reduce/arg_extrema.cu
@@ -0,0 +1,114 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/device_reduce.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>

#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

#ifndef TUNE_BASE
# define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
#endif

#if !TUNE_BASE
template <typename AccumT, typename OffsetT>
struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
{
static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK;
static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD;
static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;

using ReducePolicy =
cub::AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_DEFAULT>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

using MaxPolicy = policy_t;
};
#endif // !TUNE_BASE
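
// Note (illustrative): chaining policy_t to itself in cub::ChainedPolicy<300, policy_t, policy_t>
// terminates the policy chain, so this single tuning policy is selected for every target
// architecture (SM 3.0 and up) and the TUNE_* macro values above always take effect.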

template <typename T, typename OpT>
void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
{
// Offset type used within the kernel and to index within one partition
using per_partition_offset_t = int;

// Offset type used to index within the total input in the range [d_in, d_in + num_items)
using global_offset_t = ::cuda::std::int64_t;

// The value type of the KeyValuePair<global_offset_t, output_value_t> returned by the ArgIndexInputIterator
using output_value_t = T;

// Iterator providing the values being reduced
using values_it_t = T*;

// Iterator providing the input items for the reduction
using input_it_t = values_it_t;

// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same<OpT, cub::ArgMin>::value ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
using dispatch_t = cub::detail::reduce::
DispatchStreamingArgReduce<input_it_t, output_tuple_t*, per_partition_offset_t, global_offset_t, OpT, T, policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::detail::reduce::
DispatchStreamingArgReduce<input_it_t, output_tuple_t*, per_partition_offset_t, global_offset_t, OpT, T>;
#endif // TUNE_BASE

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
thrust::device_vector<T> in = generate(elements);
thrust::device_vector<output_tuple_t> out(1);

values_it_t d_in = thrust::raw_pointer_cast(in.data());
output_tuple_t* d_out = thrust::raw_pointer_cast(out.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements, "Size");
state.add_global_memory_writes<output_tuple_t>(1);

// Allocate temporary storage:
std::size_t temp_size;
dispatch_t::Dispatch(
nullptr, temp_size, d_in, d_out, static_cast<global_offset_t>(elements), OpT{}, init, 0 /* stream */);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage, temp_size, d_in, d_out, static_cast<global_offset_t>(elements), OpT{}, init, launch.get_stream());
});
}

using op_types = nvbench::type_list<cub::ArgMin, cub::ArgMax>;

NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "Operation{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
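
A note on the two offset types used in the benchmark above: per_partition_offset_t (int) only ever indexes within one partition of the input, while global_offset_t (int64_t) addresses the full [d_in, d_in + num_items) range. A back-of-the-envelope sketch of the resulting decomposition, assuming each partition holds at most INT_MAX items (the real partition size is an implementation detail of DispatchStreamingArgReduce):

#include <cuda/std/cstdint>
#include <cuda/std/limits>

// Assumed per-partition capacity: the largest count a 32-bit signed offset can index
constexpr ::cuda::std::int64_t partition_size = ::cuda::std::numeric_limits<int>::max();

::cuda::std::int64_t num_items      = ::cuda::std::int64_t{3} << 31;                      // ~6.4 billion items
::cuda::std::int64_t num_partitions = (num_items + partition_size - 1) / partition_size;  // 4 sequential passes

The hunk below registers readable type strings for cub::ArgMin and cub::ArgMax so nvbench can display them on the Operation{ct} axis.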
@@ -31,6 +31,8 @@ using complex = cuda::std::complex<float>;
NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");
NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMin, "ArgMin", "cub::ArgMin");
NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMax, "ArgMax", "cub::ArgMax");

namespace detail
{
141 changes: 93 additions & 48 deletions cub/cub/device/device_reduce.cuh
@@ -46,8 +46,11 @@
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_type.cuh>

#include <thrust/iterator/tabulate_output_iterator.h>

#include <iterator>
#include <limits>
@@ -471,14 +474,19 @@ struct DeviceReduce

return Min<InputIteratorT, OutputIteratorT>(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
}

#endif // DOXYGEN_SHOULD_SKIP_THIS

//! @rst
//! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
//!
//! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
//! (assuming the value type of ``d_in`` is ``T``)
//!
//! - The output value type assigned to ``d_out`` is ``cub::KeyValuePair<offset_t, T>``
//! (where ``T`` corresponds to ``iterator_traits<d_out>::value_type::Value`` if the iterator value type of
//! ``d_out`` is not ``void``; otherwise, ``T`` is the iterator value type of ``d_in``). ``offset_t`` is
//! determined as follows: ``uint64_t`` if ``cub::KeyValuePair<uint64_t, T>`` is assignable to ``d_out``;
//! otherwise, ``int64_t`` if ``cub::KeyValuePair<int64_t, T>`` is assignable to ``d_out``; otherwise,
//! ``uint32_t`` if ``cub::KeyValuePair<uint32_t, T>`` is assignable to ``d_out``; otherwise, ``int32_t``.
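//! For example (illustrative): with input value type ``float`` and ``d_out`` being a
//! ``cub::KeyValuePair<uint64_t, float>*``, the first rule applies and ``offset_t`` resolves to ``uint64_t``.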
//! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
@@ -529,7 +537,7 @@ struct DeviceReduce
//!
//! @tparam OutputIteratorT
//! **[inferred]** Output iterator type for recording the reduced aggregate
//! (having value type `cub::KeyValuePair<int, T>`) @iterator
//! (having value type `cub::KeyValuePair<offset_t, T>`) @iterator
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
Expand Down Expand Up @@ -557,38 +565,53 @@ struct DeviceReduce
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");

// Signed integer type for global offsets
using OffsetT = int;

// The input type
using InputValueT = cub::detail::value_t<InputIteratorT>;

// The output tuple type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
// Offset type used within the kernel and to index within one partition
using PerPartitionOffsetT = int;

using AccumT = OutputTupleT;
// Offset type used to index within the total input in the range [d_in, d_in + num_items)
using GlobalOffsetT = ::cuda::std::int64_t;

using InitT = detail::reduce::empty_problem_init_t<AccumT>;
// The output tuple type; its value type also serves as the initial value type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<GlobalOffsetT, InputValueT>>;
using InitT = typename OutputTupleT::Value;

// The output value type
using OutputValueT = typename OutputTupleT::Value;

// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;

ArgIndexInputIteratorT d_indexed_in(d_in);
// Reduction operation
using ReduceOpT = cub::ArgMin;

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
InitT initial_value{Traits<InputValueT>::Max()};

// Helper transform output iterator, to allow "implicit conversion" between KeyValuePair types that have a different
// key type, which may happen if the user uses a different index type than the global offset type used by the
// algorithm
using implicit_cast_kv_pair_op_it =
THRUST_NS_QUALIFIER::tabulate_output_iterator<detail::reduce::write_to_user_out_it<OutputIteratorT>>;
implicit_cast_kv_pair_op_it out_it =
THRUST_NS_QUALIFIER::make_tabulate_output_iterator(detail::reduce::write_to_user_out_it<OutputIteratorT>{d_out});

return detail::reduce::DispatchStreamingArgReduce<
InputIteratorT,
implicit_cast_kv_pair_op_it,
PerPartitionOffsetT,
GlobalOffsetT,
ReduceOpT,
InitT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
out_it,
static_cast<GlobalOffsetT>(num_items),
ReduceOpT{},
initial_value,
stream);
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -598,7 +621,7 @@ struct DeviceReduce
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream,
bool debug_synchronous)
{
@@ -736,8 +759,15 @@ struct DeviceReduce
//! Finds the first device-wide maximum using the greater-than (``>``)
//! operator, also returning the index of that item
//!
//! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
//! (assuming the value type of ``d_in`` is ``T``)
//! - The output value type assigned to ``d_out`` is ``cub::KeyValuePair<offset_t, T>``
//! (where ``T`` corresponds to ``iterator_traits<d_out>::value_type::Value`` if the iterator value type of
//! ``d_out`` is not ``void``; otherwise, ``T`` is the iterator value type of ``d_in``). ``offset_t`` is
//! determined as follows: ``uint64_t`` if ``cub::KeyValuePair<uint64_t, T>`` is assignable to ``d_out``;
//! otherwise, ``int64_t`` if ``cub::KeyValuePair<int64_t, T>`` is assignable to ``d_out``; otherwise,
//! ``uint32_t`` if ``cub::KeyValuePair<uint32_t, T>`` is assignable to ``d_out``; otherwise, ``int32_t``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - The maximum is written to ``d_out.value`` and its offset in the input
//! array is written to ``d_out.key``.
@@ -792,7 +822,7 @@
//!
//! @tparam OutputIteratorT
//! **[inferred]** Output iterator type for recording the reduced aggregate
//! (having value type `cub::KeyValuePair<int, T>`) @iterator
//! (having value type `cub::KeyValuePair<offset_t, T>`) @iterator
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
@@ -820,38 +850,53 @@
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");

// Signed integer type for global offsets
using OffsetT = int;

// The input type
using InputValueT = cub::detail::value_t<InputIteratorT>;

// The output tuple type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
// Offset type used within the kernel and to index within one partition
using PerPartitionOffsetT = int;

using AccumT = OutputTupleT;
// Offset type used to index within the total input in the range [d_in, d_in + num_items)
using GlobalOffsetT = ::cuda::std::int64_t;

// The output value type
using OutputValueT = typename OutputTupleT::Value;

using InitT = detail::reduce::empty_problem_init_t<AccumT>;

// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
// The output tuple type; its value type also serves as the initial value type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<GlobalOffsetT, InputValueT>>;
using InitT = typename OutputTupleT::Value;

ArgIndexInputIteratorT d_indexed_in(d_in);
// Reduction operation
using ReduceOpT = cub::ArgMax;

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
InitT initial_value{Traits<InputValueT>::Lowest()};

// Helper transform output iterator, to allow "implicit conversion" between KeyValuePair types that have a different
// key type, which may happen if the user uses a different index type than the global offset type used by the
// algorithm
using implicit_cast_kv_pair_op_it =
THRUST_NS_QUALIFIER::tabulate_output_iterator<detail::reduce::write_to_user_out_it<OutputIteratorT>>;
implicit_cast_kv_pair_op_it out_it =
THRUST_NS_QUALIFIER::make_tabulate_output_iterator(detail::reduce::write_to_user_out_it<OutputIteratorT>{d_out});

return detail::reduce::DispatchStreamingArgReduce<
InputIteratorT,
implicit_cast_kv_pair_op_it,
PerPartitionOffsetT,
GlobalOffsetT,
ReduceOpT,
InitT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
out_it,
static_cast<GlobalOffsetT>(num_items),
ReduceOpT{},
initial_value,
stream);
}
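
// Illustration of the documented zero-length behavior (per the rst docs above; T = float assumed):
//   ArgMin with num_items == 0 writes {1, std::numeric_limits<float>::max()} to d_out
//   ArgMax with num_items == 0 writes {1, std::numeric_limits<float>::lowest()} to d_out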

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -861,7 +906,7 @@ struct DeviceReduce
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream,
bool debug_synchronous)
{
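For reference, the tabulate-output-iterator trick used in both ArgMin and ArgMax above can be illustrated with a small stand-in functor. Writing through thrust::make_tabulate_output_iterator(op) invokes op(index, value) on assignment, which is what lets the algorithm produce a KeyValuePair<int64_t, T> internally while the functor narrows the key to the user's type on the final store. The functor below is a sketch under assumed names; the actual implementation is cub::detail::reduce::write_to_user_out_it, whose definition is not shown in this diff:

#include <cub/util_type.cuh> // cub::KeyValuePair

#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/cstdint>

// Hypothetical stand-in for detail::reduce::write_to_user_out_it
template <typename T>
struct cast_key_and_write
{
  cub::KeyValuePair<int, T>* d_user_out; // user's output carries a narrower, 32-bit key

  __host__ __device__ void operator()(::cuda::std::int64_t /*index*/,
                                      cub::KeyValuePair<::cuda::std::int64_t, T> result) const
  {
    // Rebuild the pair with the user's key type before the single final store
    *d_user_out = cub::KeyValuePair<int, T>{static_cast<int>(result.key), result.value};
  }
};

// Usage sketch:
//   auto out_it = thrust::make_tabulate_output_iterator(cast_key_and_write<float>{d_out});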