Skip to content

Commit

Permalink
Merge branch 'main' into execution_model
Browse files Browse the repository at this point in the history
  • Loading branch information
gonzalobg authored Feb 26, 2025
2 parents 5504adb + cd6a090 commit 9d4ac26
Show file tree
Hide file tree
Showing 301 changed files with 3,350 additions and 1,644 deletions.
5 changes: 3 additions & 2 deletions c/parallel/src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

#include <cuda/std/__algorithm_>
#include <cuda/std/cstdint>
#include <cuda/std/functional> // ::cuda::std::identity
#include <cuda/std/variant>
Expand Down Expand Up @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);

// Implement part of MemBoundScaling
items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);

return {block_size, items_per_thread, vector_load_length};
}
Expand Down
4 changes: 1 addition & 3 deletions c2h/include/c2h/bfloat16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,10 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, unsigned short, bfloat16_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP

#ifdef __GNUC__
# pragma GCC diagnostic pop
Expand Down
2 changes: 1 addition & 1 deletion c2h/include/c2h/fill_striped.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ struct scalar_to_vec_t
template <int LogicalWarpThreads, int ItemsPerThread, int BlockThreads, typename IteratorT>
void fill_striped(IteratorT it)
{
using T = cub::detail::value_t<IteratorT>;
using T = cub::detail::it_value_t<IteratorT>;

constexpr int warps_in_block = BlockThreads / LogicalWarpThreads;
constexpr int items_per_warp = LogicalWarpThreads * ItemsPerThread;
Expand Down
5 changes: 2 additions & 3 deletions c2h/include/c2h/half.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -361,11 +361,10 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<half_t> : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, half_t>
struct CUB_NS_QUALIFIER::NumericTraits<half_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, unsigned short, half_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP

#ifdef __GNUC__
# pragma GCC diagnostic pop
Expand Down
19 changes: 1 addition & 18 deletions c2h/include/c2h/test_util_vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long)
C2H_VEC_OVERLOAD(float, float)
C2H_VEC_OVERLOAD(double, double)

// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types.
// Specialize cuda::std::numeric_limits for vector types.

# define REPEAT_TO_LIST_1(a) a
# define REPEAT_TO_LIST_2(a) a, a
Expand All @@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double)
# define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a)

# define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \
CUB_NAMESPACE_BEGIN \
template <> \
struct NumericTraits<T> \
{ \
static __host__ __device__ T Max() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Max())}; \
return retval; \
} \
static __host__ __device__ T Lowest() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Lowest())}; \
return retval; \
} \
}; \
CUB_NAMESPACE_END \
\
_LIBCUDACXX_BEGIN_NAMESPACE_STD \
template <> \
class numeric_limits<T> \
Expand Down
3 changes: 2 additions & 1 deletion cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>
#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -63,7 +64,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
3 changes: 2 additions & 1 deletion cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>
#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -63,7 +64,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
5 changes: 4 additions & 1 deletion cub/benchmarks/bench/reduce/arg_extrema.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cub/device/device_reduce.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>

#include <cuda/std/limits>
#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -57,7 +58,9 @@ struct policy_hub_t
// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin> ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
? ::cuda::std::numeric_limits<T>::max()
: ::cuda::std::numeric_limits<T>::lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/select/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -61,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/select/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

#include <thrust/count.h>

#include <cuda/std/__algorithm_>

#include <limits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -63,7 +65,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
6 changes: 4 additions & 2 deletions cub/benchmarks/bench/select/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <cub/device/device_select.cuh>

#include <cuda/std/__algorithm_>

#include <limits>

#include <look_back_helper.cuh>
Expand Down Expand Up @@ -36,8 +38,8 @@ struct policy_hub_t
{
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
Expand Down
12 changes: 5 additions & 7 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -555,18 +555,16 @@ private:
// TYPE DECLARATIONS
//---------------------------------------------------------------------
/// Internal load/store type. For byte-wise memcpy, a single-byte type
using AliasT =
typename ::cuda::std::conditional<IsMemcpy,
std::iterator_traits<char*>,
std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;
using AliasT = typename ::cuda::std::
conditional_t<IsMemcpy, ::cuda::std::type_identity<char>, lazy_trait<it_value_t, it_value_t<InputBufferIt>>>::type;

/// Types of the input and output buffers
using InputBufferT = cub::detail::value_t<InputBufferIt>;
using OutputBufferT = cub::detail::value_t<OutputBufferIt>;
using InputBufferT = it_value_t<InputBufferIt>;
using OutputBufferT = it_value_t<OutputBufferIt>;

/// Type that has to be sufficiently large to hold any of the buffers' sizes.
/// The BufferSizeIteratorT's value type must be convertible to this type.
using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;
using BufferSizeT = it_value_t<BufferSizeIteratorT>;

/// Type used to index into the tile of buffers that this thread block is assigned to.
using BlockBufferOffsetT = uint16_t;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ struct AgentHistogram
//---------------------------------------------------------------------

/// The sample type of the input iterator
using SampleT = cub::detail::value_t<SampleIteratorT>;
using SampleT = cub::detail::it_value_t<SampleIteratorT>;

/// The pixel type of SampleT
using PixelT = typename CubVector<SampleT, NUM_CHANNELS>::Type;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ struct agent_t
using policy = Policy;

// key and value type are taken from the first input sequence (consistent with old Thrust behavior)
using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;
using key_type = it_value_t<KeysIt1>;
using item_type = it_value_t<ItemsIt1>;

using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt2>::type;
Expand Down
26 changes: 13 additions & 13 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/std/__algorithm_>

CUB_NAMESPACE_BEGIN

Expand All @@ -66,7 +67,7 @@ struct AgentRadixSortHistogramPolicy
* ID. However, lanes with the same ID in different warp use the same private
* histogram. This arrangement helps reduce the degree of conflicts in atomic
* operations. */
NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)),
NUM_PARTS = _CUDA_VSTD::max(1, NOMINAL_4B_NUM_PARTS * 4 / _CUDA_VSTD::max(int{sizeof(ComputeT)}, 4)),
RADIX_BITS = _RADIX_BITS,
};
};
Expand Down Expand Up @@ -94,16 +95,13 @@ template <typename AgentRadixSortHistogramPolicy,
struct AgentRadixSortHistogram
{
// constants
enum
{
ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD,
BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS,
NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS,
};
static constexpr int ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD;
static constexpr int BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS;
static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
static constexpr int RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS;
static constexpr int RADIX_DIGITS = 1 << RADIX_BITS;
static constexpr int MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS;
static constexpr int NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS;

using traits = radix::traits_t<KeyT>;
using bit_ordered_type = typename traits::bit_ordered_type;
Expand Down Expand Up @@ -210,7 +208,9 @@ struct AgentRadixSortHistogram
#pragma unroll
for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
{
int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit);
// FIXME(bgruber): the following replacement changes SASS for cub.test.device_radix_sort_pairs.lid_0
// const int num_bits = _CUDA_VSTD::min(+RADIX_BITS, end_bit - current_bit);
const int num_bits = CUB_MIN(+RADIX_BITS, end_bit - current_bit);
#pragma unroll
for (int u = 0; u < ITEMS_PER_THREAD; ++u)
{
Expand Down Expand Up @@ -258,7 +258,7 @@ struct AgentRadixSortHistogram

// Process the tiles.
OffsetT portion_offset = portion * MAX_PORTION_SIZE;
OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset);
OffsetT portion_size = _CUDA_VSTD::min(MAX_PORTION_SIZE, num_items - portion_offset);
for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
{
OffsetT tile_offset = portion_offset + offset;
Expand Down
7 changes: 4 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
#include <cub/warp/warp_reduce.cuh>

#include <cuda/ptx>
#include <cuda/std/__algorithm_>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -160,17 +161,17 @@ struct AgentRadixSortUpsweep
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,

LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
LOG_COUNTER_LANES = _CUDA_VSTD::max(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,

// To prevent counter overflow, we must periodically unpack and aggregate the
// digit counters back into registers. Each counter lane is assigned to a
// warp for aggregation.

LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
LANES_PER_WARP = _CUDA_VSTD::max(1, (COUNTER_LANES + WARPS - 1) / WARPS),

// Unroll tiles in batches without risk of counter overflow
UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
UNROLL_COUNT = _CUDA_VSTD::min(64, 255 / KEYS_PER_THREAD),
UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
};

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ struct AgentReduce
//---------------------------------------------------------------------

/// The input value type
using InputT = value_t<InputIteratorT>;
using InputT = it_value_t<InputIteratorT>;

/// Vector type of InputT for data movement
using VectorT = typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type;
Expand All @@ -159,7 +159,7 @@ struct AgentReduce
static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD;
static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH);
static constexpr int VECTOR_LOAD_LENGTH = _CUDA_VSTD::min(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH);

// Can vectorize according to the policy if the input iterator is a native
// pointer to a primitive type
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,13 @@ struct AgentReduceByKey
//---------------------------------------------------------------------

// The input keys type
using KeyInputT = value_t<KeysInputIteratorT>;
using KeyInputT = it_value_t<KeysInputIteratorT>;

// The output keys type
using KeyOutputT = non_void_value_t<UniqueOutputIteratorT, KeyInputT>;

// The input values type
using ValueInputT = value_t<ValuesInputIteratorT>;
using ValueInputT = it_value_t<ValuesInputIteratorT>;

// Tuple type for scanning (pairs accumulated segment-value with
// segment-index)
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ struct AgentRle
//---------------------------------------------------------------------

/// The input value type
using T = cub::detail::value_t<InputIteratorT>;
using T = cub::detail::it_value_t<InputIteratorT>;

/// The lengths output value type
using LengthT = cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ struct AgentScan
//---------------------------------------------------------------------

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
using InputT = cub::detail::it_value_t<InputIteratorT>;

// Tile status descriptor interface type
using ScanTileStateT = ScanTileState<AccumT>;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ struct AgentScanByKey
// Types and constants
//---------------------------------------------------------------------

using KeyT = value_t<KeysInputIteratorT>;
using InputT = value_t<ValuesInputIteratorT>;
using KeyT = it_value_t<KeysInputIteratorT>;
using InputT = it_value_t<ValuesInputIteratorT>;
using FlagValuePairT = KeyValuePair<int, AccumT>;
using ReduceBySegmentOpT = ScanBySegmentOp<ScanOpT>;

Expand Down
Loading

0 comments on commit 9d4ac26

Please sign in to comment.