Replace cub::Traits with numeric_limits and deprecate it
* Consistently use ::cuda::std::numeric_limits in CUB

Fixes: NVIDIA#3381
bernhardmgruber committed Feb 17, 2025
1 parent e0f8614 commit 25e8ee6
Showing 20 changed files with 285 additions and 155 deletions.
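
The shape of the change, as a minimal before/after sketch (hypothetical helper functions, not code from this commit): call sites that asked the deprecated CUB traits for extreme values now ask ::cuda::std::numeric_limits instead.

#include <cuda/std/limits>

template <typename T>
__host__ __device__ T identity_for_min()
{
  // Before: return cub::Traits<T>::Max();   (deprecated by this change)
  return ::cuda::std::numeric_limits<T>::max();
}

template <typename T>
__host__ __device__ T identity_for_max()
{
  // Before: return cub::Traits<T>::Lowest();
  return ::cuda::std::numeric_limits<T>::lowest();
}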
27 changes: 15 additions & 12 deletions c2h/include/c2h/bfloat16.cuh
@@ -212,6 +212,10 @@ struct bfloat16_t
}
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif

/******************************************************************************
* I/O stream overloads
******************************************************************************/
@@ -230,18 +234,17 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
}

/******************************************************************************
* Traits overloads
* traits and limits
******************************************************************************/

_LIBCUDACXX_BEGIN_NAMESPACE_STD
template <>
struct __is_extended_floating_point<bfloat16_t> : true_type
{};

#ifndef _CCCL_NO_VARIABLE_TEMPLATES
#ifndef _CCCL_NO_INLINE_VARIABLES
template <>
_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<bfloat16_t> = true;
#endif // _CCCL_NO_VARIABLE_TEMPLATES
#endif // _CCCL_NO_INLINE_VARIABLES

template <>
class numeric_limits<bfloat16_t>
@@ -266,13 +269,13 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP
struct CUB_NS_QUALIFIER::detail::unsigned_bits<bfloat16_t, void>
{
using type = unsigned short;
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
// template <>
// struct CUB_NS_QUALIFIER::detail::NumericTraits<bfloat16_t>
// : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
// {};
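
The pattern applied here (and to half_t below): the deprecated cub::NumericTraits specialization is replaced by a ::cuda::std::numeric_limits specialization plus a cub::detail::unsigned_bits specialization naming the bit-equivalent unsigned type. A condensed sketch of that pattern for a hypothetical 16-bit wrapper type my_fp16, assuming the same c2h macros and extension points; the extreme values are placeholders, since the real numeric_limits members are collapsed in this view.

// Hypothetical 16-bit type, used only to illustrate the extension points.
struct my_fp16
{
  unsigned short raw;
};

_LIBCUDACXX_BEGIN_NAMESPACE_STD
template <>
class numeric_limits<my_fp16>
{
public:
  static constexpr bool is_specialized = true;

  // Placeholder extremes; a real specialization returns the type's actual
  // largest and most negative finite values, as bfloat16_t does above.
  static __host__ __device__ my_fp16 max()
  {
    return my_fp16{0x7F7F};
  }
  static __host__ __device__ my_fp16 lowest()
  {
    return my_fp16{0xFF7F};
  }
};
_LIBCUDACXX_END_NAMESPACE_STD

// Radix-sort support: name the unsigned type with the same bit width.
template <>
struct CUB_NS_QUALIFIER::detail::unsigned_bits<my_fp16, void>
{
  using type = unsigned short;
};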
26 changes: 15 additions & 11 deletions c2h/include/c2h/half.cuh
@@ -307,6 +307,10 @@ struct half_t
}
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif

/******************************************************************************
* I/O stream overloads
******************************************************************************/
@@ -325,18 +329,17 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
}

/******************************************************************************
* Traits overloads
* traits and limits
******************************************************************************/

_LIBCUDACXX_BEGIN_NAMESPACE_STD
template <>
struct __is_extended_floating_point<half_t> : true_type
{};

#ifndef _CCCL_NO_VARIABLE_TEMPLATES
#ifndef _CCCL_NO_INLINE_VARIABLES
template <>
_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<half_t> = true;
#endif // _CCCL_NO_VARIABLE_TEMPLATES
#endif // _CCCL_NO_INLINE_VARIABLES

template <>
class numeric_limits<half_t>
@@ -361,12 +364,13 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<half_t> : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, half_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP
struct CUB_NS_QUALIFIER::detail::unsigned_bits<half_t, void>
{
using type = unsigned short;
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
// template <>
// struct CUB_NS_QUALIFIER::detail::NumericTraits<half_t>
// : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, unsigned short, half_t>
// {};
19 changes: 1 addition & 18 deletions c2h/include/c2h/test_util_vec.h
@@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long)
C2H_VEC_OVERLOAD(float, float)
C2H_VEC_OVERLOAD(double, double)

// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types.
// Specialize cuda::std::numeric_limits for vector types.

# define REPEAT_TO_LIST_1(a) a
# define REPEAT_TO_LIST_2(a) a, a
@@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double)
# define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a)

# define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \
CUB_NAMESPACE_BEGIN \
template <> \
struct NumericTraits<T> \
{ \
static __host__ __device__ T Max() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Max())}; \
return retval; \
} \
static __host__ __device__ T Lowest() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Lowest())}; \
return retval; \
} \
}; \
CUB_NAMESPACE_END \
\
_LIBCUDACXX_BEGIN_NAMESPACE_STD \
template <> \
class numeric_limits<T> \
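
The removed NumericTraits block above shows the per-component pattern; the surviving numeric_limits block is collapsed in this view. A hand-expanded sketch of what the macro presumably generates for a two-component vector such as int2 (the exact member set is an assumption, since the rest of the macro is not shown):

_LIBCUDACXX_BEGIN_NAMESPACE_STD
template <>
class numeric_limits<int2>
{
public:
  static constexpr bool is_specialized = true;

  // Component-wise extremes built from the base type, matching the
  // REPEAT_TO_LIST(N, ...) expansion for N == 2.
  static __host__ __device__ int2 max()
  {
    return int2{numeric_limits<int>::max(), numeric_limits<int>::max()};
  }
  static __host__ __device__ int2 lowest()
  {
    return int2{numeric_limits<int>::lowest(), numeric_limits<int>::lowest()};
  }
};
_LIBCUDACXX_END_NAMESPACE_STD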
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/reduce/arg_extrema.cu
@@ -57,7 +57,9 @@ struct policy_hub_t
// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin> ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
? ::cuda::std::numeric_limits<T>::max()
: ::cuda::std::numeric_limits<T>::lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_reduce_by_key.cuh
@@ -228,7 +228,7 @@ struct AgentReduceByKey
// Whether or not the scan operation has a zero-valued identity value (true
// if we're performing addition on a primitive type)
static constexpr int HAS_IDENTITY_ZERO =
(::cuda::std::is_same_v<ReductionOpT, ::cuda::std::plus<>>) && (is_primitive<AccumT>::value);
(::cuda::std::is_same_v<ReductionOpT, ::cuda::std::plus<>>) && is_primitive<AccumT>::value;

// Cache-modified Input iterator wrapper type (for applying cache modifier)
// for keys Wrap the native input pointer with
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_sub_warp_merge_sort.cuh
@@ -161,7 +161,7 @@ class AgentSubWarpSort

_CCCL_DEVICE static bool get_oob_default(::cuda::std::true_type /* is bool */)
{
// Traits<KeyT>::MAX_KEY for `bool` is 0xFF which is different from `true` and makes
// key_traits<KeyT>::max_key for `bool` is 0xFF which is different from `true` and makes
// comparison with oob unreliable.
return !IS_DESCENDING;
}
36 changes: 21 additions & 15 deletions cub/cub/block/radix_rank_sort_operations.cuh
@@ -53,7 +53,7 @@
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/cstdint>
#include <cuda/std/tuple>
#include <cuda/std/type_traits>
#include <cuda/type_traits>
#include <cuda/type_traits>

CUB_NAMESPACE_BEGIN
@@ -77,8 +77,7 @@ CUB_NAMESPACE_BEGIN
template <typename KeyT, bool IsFP = ::cuda::is_floating_point_v<KeyT>>
struct BaseDigitExtractor
{
using TraitsT = Traits<KeyT>;
using UnsignedBits = typename TraitsT::UnsignedBits;
using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;

static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
@@ -89,14 +88,13 @@ struct BaseDigitExtractor
template <typename KeyT>
struct BaseDigitExtractor<KeyT, true>
{
using TraitsT = Traits<KeyT>;
using UnsignedBits = typename TraitsT::UnsignedBits;
using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;

static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
key_traits<KeyT>::twiddle_in(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = key_traits<KeyT>::twiddle_in(0);
return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
}
};
@@ -209,7 +207,7 @@ struct is_fundamental_type
};

template <class T>
struct is_fundamental_type<T, ::cuda::std::void_t<typename Traits<T>::UnsignedBits>>
struct is_fundamental_type<T, ::cuda::std::void_t<typename key_traits<T>::unsigned_bits>>
{
static constexpr bool value = true;
};
@@ -233,23 +231,23 @@ using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_
template <class T>
struct bit_ordered_conversion_policy_t
{
using bit_ordered_type = typename Traits<T>::UnsignedBits;
using bit_ordered_type = typename key_traits<T>::unsigned_bits;

static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
{
return Traits<T>::TwiddleIn(val);
return key_traits<T>::twiddle_in(val);
}

static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
{
return Traits<T>::TwiddleOut(val);
return key_traits<T>::twiddle_out(val);
}
};

template <class T>
struct bit_ordered_inversion_policy_t
{
using bit_ordered_type = typename Traits<T>::UnsignedBits;
using bit_ordered_type = typename key_traits<T>::unsigned_bits;

static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val)
{
@@ -260,7 +258,7 @@ struct bit_ordered_inversion_policy_t
template <class T, bool = is_fundamental_type<T>::value>
struct traits_t
{
using bit_ordered_type = typename Traits<T>::UnsignedBits;
using bit_ordered_type = typename key_traits<T>::unsigned_bits;
using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t<T>;
using bit_ordered_inversion_policy = bit_ordered_inversion_policy_t<T>;

@@ -269,12 +267,20 @@ struct traits_t

static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(detail::identity_decomposer_t)
{
return Traits<T>::LOWEST_KEY;
// TODO(bgruber): sanity check, remove eventually
_CCCL_SUPPRESS_DEPRECATED_PUSH
static_assert(key_traits<T>::lowest_key == Traits<T>::LOWEST_KEY, "");
_CCCL_SUPPRESS_DEPRECATED_POP
return key_traits<T>::lowest_key;
}

static _CCCL_HOST_DEVICE bit_ordered_type max_raw_binary_key(detail::identity_decomposer_t)
{
return Traits<T>::MAX_KEY;
// TODO(bgruber): sanity check, remove eventually
_CCCL_SUPPRESS_DEPRECATED_PUSH
static_assert(key_traits<T>::max_key == Traits<T>::MAX_KEY, "");
_CCCL_SUPPRESS_DEPRECATED_POP
return key_traits<T>::max_key;
}

static _CCCL_HOST_DEVICE int default_end_bit(detail::identity_decomposer_t)
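
For context on the twiddle_in/twiddle_out calls above: radix sort reinterprets each key as its unsigned_bits type and remaps the bits so that plain unsigned comparison reproduces the key ordering; ProcessFloatMinusZero then collapses the twiddled -0.0 pattern onto the twiddled +0.0 pattern so the two compare equal. A standalone sketch of the classic float remapping (illustrative only, not CUB's exact implementation):

#include <cassert>
#include <cstdint>
#include <cstring>

// Negative floats get all bits flipped, non-negative floats get only the
// sign bit flipped; unsigned order of the results matches float order.
std::uint32_t twiddle_in(float f)
{
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const std::uint32_t mask = (bits & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
  return bits ^ mask;
}

std::uint32_t twiddle_out(std::uint32_t key)
{
  const std::uint32_t mask = (key & 0x80000000u) ? 0x80000000u : 0xFFFFFFFFu;
  return key ^ mask;
}

int main()
{
  // Unsigned order of the twiddled keys matches floating-point order.
  assert(twiddle_in(-2.0f) < twiddle_in(-1.0f));
  assert(twiddle_in(-1.0f) < twiddle_in(0.0f));
  assert(twiddle_in(1.0f) < twiddle_in(2.0f));
  // -0.0f and +0.0f twiddle to different bit patterns, which is exactly
  // what ProcessFloatMinusZero papers over.
  assert(twiddle_in(-0.0f) != twiddle_in(0.0f));
  // twiddle_out inverts twiddle_in.
  float x = 3.5f, y;
  const std::uint32_t bits = twiddle_out(twiddle_in(x));
  std::memcpy(&y, &bits, sizeof(y));
  assert(x == y);
}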
23 changes: 10 additions & 13 deletions cub/cub/device/device_reduce.cuh
@@ -51,6 +51,8 @@

#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/limits>

#include <iterator>

CUB_NAMESPACE_BEGIN
@@ -334,7 +336,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide minimum using the less-than (``<``) operator.
//!
//! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
@@ -433,8 +435,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::minimum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
Traits<InitT>::Max(),
::cuda::std::numeric_limits<InitT>::max(),
stream);
}

@@ -583,7 +584,7 @@ struct DeviceReduce
//! (assuming the value type of ``d_in`` is ``T``)
//!
//! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -690,8 +691,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
@@ -700,7 +700,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide maximum using the greater-than (``>``) operator.
//!
//! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
@@ -796,8 +796,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::maximum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
Traits<InitT>::Lowest(),
::cuda::std::numeric_limits<InitT>::lowest(),
stream);
}

@@ -948,7 +947,7 @@ struct DeviceReduce
//!
//! - The maximum is written to ``d_out.value`` and its offset in the input
//! array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -1057,9 +1056,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
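
Callers keep the same API; only the source of the documented initial value moves to ::cuda::std::numeric_limits. A minimal host-side sketch of DeviceReduce::Min (error handling omitted), which per the documentation above writes ::cuda::std::numeric_limits<int>::max() to the output when num_items is zero:

#include <cub/device/device_reduce.cuh>

#include <cuda/std/limits>

int main()
{
  const int h_in[] = {8, 6, 7, 5, 3, 0, 9};
  const int num_items = 7;

  int *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  // First call queries the temporary storage size, second call reduces.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

  int h_out = 0;
  cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  // h_out == 0 here; with num_items == 0 the documented initial value,
  // ::cuda::std::numeric_limits<int>::max(), would be written instead.

  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);
}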