Adds support for large num_items to DeviceReduce::{ArgMin,ArgMax} #2647

Merged
merged 34 commits into main from enh/large-num-items-reduce-argminmax
Dec 20, 2024
Changes from 18 commits
34 commits
31e555d  adds benchmarks for reduce::arg{min,max} (elstehle, Oct 8, 2024)
30b88a1  preliminary streaming arg-extremum reduction (elstehle, Oct 29, 2024)
3c5f322  fixes implicit conversion (elstehle, Oct 29, 2024)
cc95d62  uses streaming dispatch class (elstehle, Oct 29, 2024)
dad724e  changes arg benches to use new streaming reduce (elstehle, Oct 29, 2024)
3f4cbf4  streaming arg-extrema reduction (elstehle, Oct 29, 2024)
1895d38  fixes style (elstehle, Oct 29, 2024)
ee341e1  fixes compilation failures (elstehle, Oct 30, 2024)
1c8a62c  cleanups (elstehle, Oct 30, 2024)
f8cca48  adds rst style comments (elstehle, Oct 30, 2024)
757672b  declare vars const and use clamp (elstehle, Oct 30, 2024)
14bb6ad  consolidates argmin argmax benchmarks (elstehle, Oct 30, 2024)
92730b6  fixes thrust usage (elstehle, Oct 30, 2024)
d1cac78  drops offset type in arg-extrema benchmarks (elstehle, Oct 31, 2024)
6600bcf  fixes clang cuda (elstehle, Oct 31, 2024)
1d6e6b3  exec space macros (elstehle, Oct 31, 2024)
39bffee  switch to signed global offset type for slightly better perf (elstehle, Oct 31, 2024)
4ffe18b  clarifies documentation (elstehle, Oct 31, 2024)
bb72eb0  Merge remote-tracking branch 'upstream/main' into enh/large-num-items… (elstehle, Nov 1, 2024)
5707234  applies minor benchmark style changes from review comments (elstehle, Nov 1, 2024)
4c0ea6c  fixes interface documentation and comments (elstehle, Nov 1, 2024)
4dcddfb  list-init accumulating output op (elstehle, Nov 1, 2024)
ee1e1dc  improves style, comments, and tests (elstehle, Nov 1, 2024)
0d07c25  cleans up aggregate init (elstehle, Nov 1, 2024)
6d559fb  renames dispatch class usage in benchmarks (elstehle, Nov 1, 2024)
326d3b7  Merge remote-tracking branch 'upstream/main' into enh/large-num-items… (elstehle, Dec 3, 2024)
cad6b8a  fixes merge conflicts (elstehle, Dec 3, 2024)
e57e453  addresses review comments (elstehle, Dec 4, 2024)
ba7e12b  addresses review comments (elstehle, Dec 6, 2024)
952ebc2  fixes assertion (elstehle, Dec 6, 2024)
7544fc2  Merge branch 'main' into enh/large-num-items-reduce-argminmax (elstehle, Dec 19, 2024)
b1afa36  removes superseded implementation (elstehle, Dec 19, 2024)
33393d0  changes large problem tests to use new interface (elstehle, Dec 19, 2024)
1effe5f  removes obsolete tests for deprecated interface (elstehle, Dec 19, 2024)
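In short, this PR widens num_items in DeviceReduce::ArgMin and DeviceReduce::ArgMax from int to ::cuda::std::int64_t and reimplements both on top of the new streaming dispatch (DispatchStreamingArgReduce), which indexes within each partition using a 32-bit offset while tracking the extremum's global position in 64 bits. A minimal caller-side sketch of the updated interface (buffer setup and error handling omitted; names are illustrative):

#include <cub/device/device_reduce.cuh>

#include <cuda/std/cstdint>

// Illustrative buffers: d_in holds more items than a 32-bit int can index
float* d_in = /* device input buffer */;
cub::KeyValuePair<::cuda::std::int64_t, float>* d_out = /* single-element device buffer */;
::cuda::std::int64_t num_items = ::cuda::std::int64_t{1} << 32;

// First call: query the required temporary storage size
void* d_temp_storage      = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Second call: run the reduction; d_out->key is the 64-bit offset of the first minimum
cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

The returned key is now a 64-bit offset, so d_out's value type is typically cub::KeyValuePair<int64_t, T>; other key widths are converted through the helper output iterator shown in the device_reduce.cuh diff below.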
114 changes: 114 additions & 0 deletions cub/benchmarks/bench/reduce/arg_extrema.cu
@@ -0,0 +1,114 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/device_reduce.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>

#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

#ifndef TUNE_BASE
# define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
#endif

#if !TUNE_BASE
template <typename AccumT, typename OffsetT>
struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
{
static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK;
static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD;
static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;

using ReducePolicy =
cub::AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_DEFAULT>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

using MaxPolicy = policy_t;
};
#endif // !TUNE_BASE
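
// Note (illustrative): chaining policy_t to itself in cub::ChainedPolicy<300, policy_t, policy_t>
// terminates the policy chain, so this single tuning policy is selected for every target
// architecture (SM 3.0 and up) and the TUNE_* macro values above always take effect.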

template <typename T, typename OpT>
void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
{
// Offset type used within the kernel and to index within one partition
using per_partition_offset_t = int;

// Offset type used to index within the total input in the range [d_in, d_in + num_items)
using global_offset_t = ::cuda::std::int64_t;

// The value type of the KeyValuePair<global_offset_t, output_value_t> returned by the ArgIndexInputIterator
using output_value_t = T;

// Iterator providing the values being reduced
using values_it_t = T*;

// Iterator providing the input items for the reduction
using input_it_t = values_it_t;

// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same<OpT, cub::ArgMin>::value ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
using dispatch_t = cub::detail::reduce::
DispatchStreamingArgReduce<input_it_t, output_tuple_t*, per_partition_offset_t, global_offset_t, OpT, T, policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::detail::reduce::
DispatchStreamingArgReduce<input_it_t, output_tuple_t*, per_partition_offset_t, global_offset_t, OpT, T>;
#endif // TUNE_BASE

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
thrust::device_vector<T> in = generate(elements);
thrust::device_vector<output_tuple_t> out(1);

values_it_t d_in = thrust::raw_pointer_cast(in.data());
output_tuple_t* d_out = thrust::raw_pointer_cast(out.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements, "Size");
state.add_global_memory_writes<output_tuple_t>(1);

// Allocate temporary storage:
std::size_t temp_size;
dispatch_t::Dispatch(
nullptr, temp_size, d_in, d_out, static_cast<global_offset_t>(elements), OpT{}, init, 0 /* stream */);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage, temp_size, d_in, d_out, static_cast<global_offset_t>(elements), OpT{}, init, launch.get_stream());
});
}

using op_types = nvbench::type_list<cub::ArgMin, cub::ArgMax>;

NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "Operation{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
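
A note on the two offset types used in the benchmark above: per_partition_offset_t (int) only ever indexes within one partition of the input, while global_offset_t (int64_t) addresses the full [d_in, d_in + num_items) range. A back-of-the-envelope sketch of the resulting decomposition, assuming each partition holds at most INT_MAX items (the real partition size is an implementation detail of DispatchStreamingArgReduce):

#include <cuda/std/cstdint>
#include <cuda/std/limits>

// Assumed per-partition capacity: the largest count a 32-bit signed offset can index
constexpr ::cuda::std::int64_t partition_size = ::cuda::std::numeric_limits<int>::max();

::cuda::std::int64_t num_items      = ::cuda::std::int64_t{3} << 31;                      // ~6.4 billion items
::cuda::std::int64_t num_partitions = (num_items + partition_size - 1) / partition_size;  // 4 sequential passes

The hunk below registers readable type strings for cub::ArgMin and cub::ArgMax so nvbench can display them on the Operation{ct} axis.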
@@ -31,6 +31,8 @@ using complex = cuda::std::complex<float>;
NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");
NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMin, "ArgMin", "cub::ArgMin");
NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMax, "ArgMax", "cub::ArgMax");

namespace detail
{
141 changes: 93 additions & 48 deletions cub/cub/device/device_reduce.cuh
@@ -46,8 +46,11 @@
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_type.cuh>

#include <thrust/iterator/tabulate_output_iterator.h>

#include <iterator>
#include <limits>
@@ -471,14 +474,19 @@ struct DeviceReduce

return Min<InputIteratorT, OutputIteratorT>(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
}

#endif // DOXYGEN_SHOULD_SKIP_THIS

//! @rst
//! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
//!
//! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
//! (assuming the value type of ``d_in`` is ``T``)
//!
//! - The output value type assigned to ``d_out`` is ``cub::KeyValuePair<offset_t, T>``
//! (where ``T`` corresponds to ``iterator_traits<d_out>::value_type::Value`` if the iterator value type of
//! ``d_out`` is not ``void``; otherwise, ``T`` is the iterator value type of ``d_in``). ``offset_t`` is
//! determined as follows: ``uint64_t`` if ``cub::KeyValuePair<uint64_t, T>`` is assignable to ``d_out``;
//! otherwise, ``int64_t`` if ``cub::KeyValuePair<int64_t, T>`` is assignable to ``d_out``; otherwise,
//! ``uint32_t`` if ``cub::KeyValuePair<uint32_t, T>`` is assignable to ``d_out``; otherwise, ``int32_t``.
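//! For example (illustrative): with input value type ``float`` and ``d_out`` being a
//! ``cub::KeyValuePair<uint64_t, float>*``, the first rule applies and ``offset_t`` resolves to ``uint64_t``.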
//! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
@@ -529,7 +537,7 @@ struct DeviceReduce
//!
//! @tparam OutputIteratorT
//! **[inferred]** Output iterator type for recording the reduced aggregate
//! (having value type `cub::KeyValuePair<int, T>`) @iterator
//! (having value type `cub::KeyValuePair<offset_t, T>`) @iterator
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
Expand Down Expand Up @@ -557,38 +565,53 @@ struct DeviceReduce
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");

// Signed integer type for global offsets
using OffsetT = int;

// The input type
using InputValueT = cub::detail::value_t<InputIteratorT>;

// The output tuple type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
// Offset type used within the kernel and to index within one partition
using PerPartitionOffsetT = int;

using AccumT = OutputTupleT;
// Offset type used to index within the total input in the range [d_in, d_in + num_items)
using GlobalOffsetT = ::cuda::std::int64_t;

using InitT = detail::reduce::empty_problem_init_t<AccumT>;
// The output tuple type; its value type also serves as the initial value type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<GlobalOffsetT, InputValueT>>;
using InitT = typename OutputTupleT::Value;

// The output value type
using OutputValueT = typename OutputTupleT::Value;

// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;

ArgIndexInputIteratorT d_indexed_in(d_in);
// Reduction operation
using ReduceOpT = cub::ArgMin;

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
InitT initial_value{Traits<InputValueT>::Max()};

// Helper transform output iterator, to allow "implicit conversion" between KeyValuePair types that have a different
// key type, which may happen if the user uses a different index type than the global offset type used by the
// algorithm
using implicit_cast_kv_pair_op_it =
THRUST_NS_QUALIFIER::tabulate_output_iterator<detail::reduce::write_to_user_out_it<OutputIteratorT>>;
implicit_cast_kv_pair_op_it out_it =
THRUST_NS_QUALIFIER::make_tabulate_output_iterator(detail::reduce::write_to_user_out_it<OutputIteratorT>{d_out});

return detail::reduce::DispatchStreamingArgReduce<
InputIteratorT,
implicit_cast_kv_pair_op_it,
PerPartitionOffsetT,
GlobalOffsetT,
ReduceOpT,
InitT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
out_it,
static_cast<GlobalOffsetT>(num_items),
ReduceOpT{},
initial_value,
stream);
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -598,7 +621,7 @@ struct DeviceReduce
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream,
bool debug_synchronous)
{
@@ -736,8 +759,15 @@ struct DeviceReduce
//! Finds the first device-wide maximum using the greater-than (``>``)
//! operator, also returning the index of that item
//!
//! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
//! (assuming the value type of ``d_in`` is ``T``)
//! - The output value type assigned to ``d_out`` is ``cub::KeyValuePair<offset_t, T>``
//! (where ``T`` corresponds to ``iterator_traits<d_out>::value_type::Value`` if the iterator value type of
//! ``d_out`` is not ``void``; otherwise, ``T`` is the iterator value type of ``d_in``). ``offset_t`` is
//! determined as follows: ``uint64_t`` if ``cub::KeyValuePair<uint64_t, T>`` is assignable to ``d_out``;
//! otherwise, ``int64_t`` if ``cub::KeyValuePair<int64_t, T>`` is assignable to ``d_out``; otherwise,
//! ``uint32_t`` if ``cub::KeyValuePair<uint32_t, T>`` is assignable to ``d_out``; otherwise, ``int32_t``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - The maximum is written to ``d_out.value`` and its offset in the input
//! array is written to ``d_out.key``.
@@ -792,7 +822,7 @@
//!
//! @tparam OutputIteratorT
//! **[inferred]** Output iterator type for recording the reduced aggregate
//! (having value type `cub::KeyValuePair<int, T>`) @iterator
//! (having value type `cub::KeyValuePair<offset_t, T>`) @iterator
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
@@ -820,38 +850,53 @@
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");

// Signed integer type for global offsets
using OffsetT = int;

// The input type
using InputValueT = cub::detail::value_t<InputIteratorT>;

// The output tuple type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
// Offset type used within the kernel and to index within one partition
using PerPartitionOffsetT = int;

using AccumT = OutputTupleT;
// Offset type used to index within the total input in the range [d_in, d_in + num_items)
using GlobalOffsetT = ::cuda::std::int64_t;

// The output value type
using OutputValueT = typename OutputTupleT::Value;

using InitT = detail::reduce::empty_problem_init_t<AccumT>;

// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
// The output tuple type; its value type also serves as the initial value type
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<GlobalOffsetT, InputValueT>>;
using InitT = typename OutputTupleT::Value;

ArgIndexInputIteratorT d_indexed_in(d_in);
// Reduction operation
using ReduceOpT = cub::ArgMax;

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
InitT initial_value{Traits<InputValueT>::Lowest()};

// Helper transform output iterator, to allow "implicit conversion" between KeyValuePair types that have a different
// key type, which may happen if the user uses a different index type than the global offset type used by the
// algorithm
using implicit_cast_kv_pair_op_it =
THRUST_NS_QUALIFIER::tabulate_output_iterator<detail::reduce::write_to_user_out_it<OutputIteratorT>>;
implicit_cast_kv_pair_op_it out_it =
THRUST_NS_QUALIFIER::make_tabulate_output_iterator(detail::reduce::write_to_user_out_it<OutputIteratorT>{d_out});

return detail::reduce::DispatchStreamingArgReduce<
InputIteratorT,
implicit_cast_kv_pair_op_it,
PerPartitionOffsetT,
GlobalOffsetT,
ReduceOpT,
InitT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
out_it,
static_cast<GlobalOffsetT>(num_items),
ReduceOpT{},
initial_value,
stream);
}
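
// Illustration of the documented zero-length behavior (per the rst docs above; T = float assumed):
//   ArgMin with num_items == 0 writes {1, std::numeric_limits<float>::max()} to d_out
//   ArgMax with num_items == 0 writes {1, std::numeric_limits<float>::lowest()} to d_out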

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -861,7 +906,7 @@ struct DeviceReduce
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
::cuda::std::int64_t num_items,
cudaStream_t stream,
bool debug_synchronous)
{
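For reference, the tabulate-output-iterator trick used in both ArgMin and ArgMax above can be illustrated with a small stand-in functor. Writing through thrust::make_tabulate_output_iterator(op) invokes op(index, value) on assignment, which is what lets the algorithm produce a KeyValuePair<int64_t, T> internally while the functor narrows the key to the user's type on the final store. The functor below is a sketch under assumed names; the actual implementation is cub::detail::reduce::write_to_user_out_it, whose definition is not shown in this diff:

#include <cub/util_type.cuh> // cub::KeyValuePair

#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/cstdint>

// Hypothetical stand-in for detail::reduce::write_to_user_out_it
template <typename T>
struct cast_key_and_write
{
  cub::KeyValuePair<int, T>* d_user_out; // user's output carries a narrower, 32-bit key

  __host__ __device__ void operator()(::cuda::std::int64_t /*index*/,
                                      cub::KeyValuePair<::cuda::std::int64_t, T> result) const
  {
    // Rebuild the pair with the user's key type before the single final store
    *d_user_out = cub::KeyValuePair<int, T>{static_cast<int>(result.key), result.value};
  }
};

// Usage sketch:
//   auto out_it = thrust::make_tabulate_output_iterator(cast_key_and_write<float>{d_out});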