From cc4b86eef90e0f863ecc2ca06ef4f56ae77c9174 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 27 Jan 2025 21:06:02 +0100 Subject: [PATCH 1/8] Backport to 2.8: Implement `cuda::std::numeric_limits` for `__half` and `__nv_bfloat16` (#3361) (#3490) With a dedicated C++11 fix Co-authored-by: Michael Schellenberger Costa --- libcudacxx/include/cuda/std/limits | 207 +++++++++++++++- .../std/containers/views/mdspan/my_int.hpp | 15 ++ .../limits/is_specialized.pass.cpp | 7 + .../limits/numeric.limits.members/common.h | 41 ++++ .../const_data_members.pass.cpp | 225 +++++------------- .../denorm_min.pass.cpp | 15 +- .../numeric.limits.members/digits.pass.cpp | 7 +- .../numeric.limits.members/digits10.pass.cpp | 6 + .../numeric.limits.members/epsilon.pass.cpp | 15 +- .../has_denorm.pass.cpp | 6 + .../has_denorm_loss.pass.cpp | 6 + .../has_infinity.pass.cpp | 6 + .../has_quiet_NaN.pass.cpp | 6 + .../has_signaling_NaN.pass.cpp | 6 + .../numeric.limits.members/infinity.pass.cpp | 26 +- .../is_bounded.pass.cpp | 6 + .../numeric.limits.members/is_exact.pass.cpp | 6 + .../numeric.limits.members/is_iec559.pass.cpp | 6 + .../is_integer.pass.cpp | 6 + .../numeric.limits.members/is_modulo.pass.cpp | 6 + .../numeric.limits.members/is_signed.pass.cpp | 6 + .../numeric.limits.members/lowest.pass.cpp | 16 +- .../numeric.limits.members/max.pass.cpp | 15 +- .../max_digits10.pass.cpp | 6 + .../max_exponent.pass.cpp | 6 + .../max_exponent10.pass.cpp | 6 + .../numeric.limits.members/min.pass.cpp | 15 +- .../min_exponent.pass.cpp | 6 + .../min_exponent10.pass.cpp | 6 + .../numeric.limits.members/quiet_NaN.pass.cpp | 10 +- .../numeric.limits.members/radix.pass.cpp | 6 + .../round_error.pass.cpp | 15 +- .../round_style.pass.cpp | 6 + .../signaling_NaN.pass.cpp | 10 +- .../tinyness_before.pass.cpp | 6 + .../numeric.limits.members/traps.pass.cpp | 6 + 36 files changed, 564 insertions(+), 201 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index b5f6d847434..5e4df32270e 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -22,7 +22,10 @@ #endif // no system header #include -#include +#include +#include +#include +#include #include #include @@ -46,7 +49,47 @@ enum float_denorm_style denorm_present = 1 }; -template ::value> +enum class __numeric_limits_type +{ + __integral, + __bool, + __floating_point, + __other, +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __numeric_limits_type __make_numeric_limits_type() +{ +#if !defined(_CCCL_NO_IF_CONSTEXPR) + _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_same, _Tp, bool)) + { + return __numeric_limits_type::__bool; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_integral, _Tp)) + { + return __numeric_limits_type::__integral; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp)) + { + return __numeric_limits_type::__floating_point; + } + else + { + return __numeric_limits_type::__other; + } + _CCCL_UNREACHABLE(); +#else // ^^^ !_CCCL_NO_IF_CONSTEXPR ^^^ // vvv _CCCL_NO_IF_CONSTEXPR vvv + return _CCCL_TRAIT(is_same, _Tp, bool) + ? __numeric_limits_type::__bool + : (_CCCL_TRAIT(is_integral, _Tp) + ? __numeric_limits_type::__integral + : (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp) + ? 
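+      // Pre-C++17 fallback: the same type classification as the if-constexpr
+      // chain above, folded into nested conditional expressions, since a
+      // C++11 constexpr function body must consist of a single return
+      // statement.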
__numeric_limits_type::__floating_point + : __numeric_limits_type::__other)); +#endif // _CCCL_NO_IF_CONSTEXPR +} + +template ()> class __numeric_limits_impl { public: @@ -135,7 +178,7 @@ struct __int_min<_Tp, __digits, false> }; template -class __numeric_limits_impl<_Tp, true> +class __numeric_limits_impl<_Tp, __numeric_limits_type::__integral> { public: using type = _Tp; @@ -212,7 +255,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { public: using type = bool; @@ -286,7 +329,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { public: using type = float; @@ -381,7 +424,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { public: using type = double; @@ -476,7 +519,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE @@ -551,6 +594,156 @@ public: #endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE }; +#if defined(_LIBCUDACXX_HAS_NVFP16) +template <> +class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> +{ +public: + using type = __half; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__half_raw{0x0400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(__half_raw{0x7bffu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__half_raw{0xfbffu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(__half_raw{0x1400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__half_raw{0x3800u}); + } + + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__half_raw{0x7c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__half_raw{0x7e00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__half_raw{0x7d00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__half_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +template <> +class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> +{ +public: + using type = __nv_bfloat16; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static 
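+  // bfloat16 layout: 1 sign bit, 8 exponent bits, 7 stored mantissa bits;
+  // the implicit leading bit gives 8 binary digits of precision, from which
+  // digits10 = floor((8 - 1) * log10(2)) = 2 and
+  // max_digits10 = ceil(8 * log10(2) + 1) = 4 follow, by the same
+  // derivation that yields 11/3/5 for __half above.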
constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__nv_bfloat16_raw{0x0080u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(__nv_bfloat16_raw{0x7f7fu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__nv_bfloat16_raw{0xff7fu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(__nv_bfloat16_raw{0x3c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__nv_bfloat16_raw{0x3f00u}); + } + + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__nv_bfloat16_raw{0x7f80u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fc0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fa0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__nv_bfloat16_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVBF16 + template class numeric_limits : public __numeric_limits_impl<_Tp> {}; diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp index 4f27784cd61..df34fa1d42e 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp @@ -1,6 +1,9 @@ #ifndef _MY_INT_HPP #define _MY_INT_HPP +#include +#include + #include "test_macros.h" struct my_int_non_convertible; @@ -22,6 +25,10 @@ template <> struct cuda::std::is_integral : cuda::std::true_type {}; +template <> +class cuda::std::numeric_limits : public cuda::std::numeric_limits +{}; + // Wrapper type that's not implicitly convertible struct my_int_non_convertible @@ -43,6 +50,10 @@ template <> struct cuda::std::is_integral : cuda::std::true_type {}; +template <> +class cuda::std::numeric_limits : public cuda::std::numeric_limits +{}; + // Wrapper type that's not nothrow-constructible struct my_int_non_nothrow_constructible @@ -62,4 +73,8 @@ template <> struct cuda::std::is_integral : cuda::std::true_type {}; +template <> +class cuda::std::numeric_limits : public cuda::std::numeric_limits +{}; + #endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp index 
2ecd59004bb..7113c0e2772 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp @@ -68,6 +68,13 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 + static_assert(!cuda::std::numeric_limits>::is_specialized, "!cuda::std::numeric_limits >::is_specialized"); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h new file mode 100644 index 00000000000..15b48836839 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef NUMERIC_LIMITS_MEMBERS_COMMON_H +#define NUMERIC_LIMITS_MEMBERS_COMMON_H + +// Disable all the extended floating point operations and conversions +#define __CUDA_NO_HALF_CONVERSIONS__ 1 +#define __CUDA_NO_HALF_OPERATORS__ 1 +#define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 +#define __CUDA_NO_BFLOAT16_OPERATORS__ 1 + +#include + +template +__host__ __device__ bool float_eq(T x, T y) +{ + return x == y; +} + +#if defined(_LIBCUDACXX_HAS_NVFP16) +__host__ __device__ inline bool float_eq(__half x, __half y) +{ + return __heq(x, y); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +__host__ __device__ inline bool float_eq(__nv_bfloat16 x, __nv_bfloat16 y) +{ + return __heq(x, y); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +#endif // NUMERIC_LIMITS_MEMBERS_COMMON_H diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp index 8db1a9f5f0c..769080cff83 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp @@ -42,173 +42,80 @@ template __host__ __device__ void test(T) {} -#define TEST_NUMERIC_LIMITS(type) \ - test(cuda::std::numeric_limits::is_specialized); \ - test(cuda::std::numeric_limits::digits); \ - test(cuda::std::numeric_limits::digits10); \ - test(cuda::std::numeric_limits::max_digits10); \ - test(cuda::std::numeric_limits::is_signed); \ - test(cuda::std::numeric_limits::is_integer); \ - test(cuda::std::numeric_limits::is_exact); \ - test(cuda::std::numeric_limits::radix); \ - test(cuda::std::numeric_limits::min_exponent); \ - test(cuda::std::numeric_limits::min_exponent10); \ - test(cuda::std::numeric_limits::max_exponent); \ - test(cuda::std::numeric_limits::max_exponent10); \ - 
test(cuda::std::numeric_limits::has_infinity); \ - test(cuda::std::numeric_limits::has_quiet_NaN); \ - test(cuda::std::numeric_limits::has_signaling_NaN); \ - test(cuda::std::numeric_limits::has_denorm); \ - test(cuda::std::numeric_limits::has_denorm_loss); \ - test(cuda::std::numeric_limits::is_iec559); \ - test(cuda::std::numeric_limits::is_bounded); \ - test(cuda::std::numeric_limits::is_modulo); \ - test(cuda::std::numeric_limits::traps); \ - test(cuda::std::numeric_limits::tinyness_before); \ - test(cuda::std::numeric_limits::round_style); +template +__host__ __device__ void test_type_helper() +{ + test(cuda::std::numeric_limits::is_specialized); + test(cuda::std::numeric_limits::digits); + test(cuda::std::numeric_limits::digits10); + test(cuda::std::numeric_limits::max_digits10); + test(cuda::std::numeric_limits::is_signed); + test(cuda::std::numeric_limits::is_integer); + test(cuda::std::numeric_limits::is_exact); + test(cuda::std::numeric_limits::radix); + test(cuda::std::numeric_limits::min_exponent); + test(cuda::std::numeric_limits::min_exponent10); + test(cuda::std::numeric_limits::max_exponent); + test(cuda::std::numeric_limits::max_exponent10); + test(cuda::std::numeric_limits::has_infinity); + test(cuda::std::numeric_limits::has_quiet_NaN); + test(cuda::std::numeric_limits::has_signaling_NaN); + test(cuda::std::numeric_limits::has_denorm); + test(cuda::std::numeric_limits::has_denorm_loss); + test(cuda::std::numeric_limits::is_iec559); + test(cuda::std::numeric_limits::is_bounded); + test(cuda::std::numeric_limits::is_modulo); + test(cuda::std::numeric_limits::traps); + test(cuda::std::numeric_limits::tinyness_before); + test(cuda::std::numeric_limits::round_style); +} + +template +__host__ __device__ void test_type() +{ + test_type_helper(); + test_type_helper(); + test_type_helper(); + test_type_helper(); +} struct other {}; int main(int, char**) { - // bool - TEST_NUMERIC_LIMITS(bool) - TEST_NUMERIC_LIMITS(const bool) - TEST_NUMERIC_LIMITS(volatile bool) - TEST_NUMERIC_LIMITS(const volatile bool) - - // char - TEST_NUMERIC_LIMITS(char) - TEST_NUMERIC_LIMITS(const char) - TEST_NUMERIC_LIMITS(volatile char) - TEST_NUMERIC_LIMITS(const volatile char) - - // signed char - TEST_NUMERIC_LIMITS(signed char) - TEST_NUMERIC_LIMITS(const signed char) - TEST_NUMERIC_LIMITS(volatile signed char) - TEST_NUMERIC_LIMITS(const volatile signed char) - - // unsigned char - TEST_NUMERIC_LIMITS(unsigned char) - TEST_NUMERIC_LIMITS(const unsigned char) - TEST_NUMERIC_LIMITS(volatile unsigned char) - TEST_NUMERIC_LIMITS(const volatile unsigned char) - - // wchar_t - TEST_NUMERIC_LIMITS(wchar_t) - TEST_NUMERIC_LIMITS(const wchar_t) - TEST_NUMERIC_LIMITS(volatile wchar_t) - TEST_NUMERIC_LIMITS(const volatile wchar_t) - -#if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - // char8_t - TEST_NUMERIC_LIMITS(char8_t) - TEST_NUMERIC_LIMITS(const char8_t) - TEST_NUMERIC_LIMITS(volatile char8_t) - TEST_NUMERIC_LIMITS(const volatile char8_t) -#endif - - // char16_t - TEST_NUMERIC_LIMITS(char16_t) - TEST_NUMERIC_LIMITS(const char16_t) - TEST_NUMERIC_LIMITS(volatile char16_t) - TEST_NUMERIC_LIMITS(const volatile char16_t) - - // char32_t - TEST_NUMERIC_LIMITS(char32_t) - TEST_NUMERIC_LIMITS(const char32_t) - TEST_NUMERIC_LIMITS(volatile char32_t) - TEST_NUMERIC_LIMITS(const volatile char32_t) - - // short - TEST_NUMERIC_LIMITS(short) - TEST_NUMERIC_LIMITS(const short) - TEST_NUMERIC_LIMITS(volatile short) - TEST_NUMERIC_LIMITS(const volatile short) - - // int - TEST_NUMERIC_LIMITS(int) - 
TEST_NUMERIC_LIMITS(const int) - TEST_NUMERIC_LIMITS(volatile int) - TEST_NUMERIC_LIMITS(const volatile int) - - // long - TEST_NUMERIC_LIMITS(long) - TEST_NUMERIC_LIMITS(const long) - TEST_NUMERIC_LIMITS(volatile long) - TEST_NUMERIC_LIMITS(const volatile long) - -#ifndef _LIBCUDACXX_HAS_NO_INT128 - TEST_NUMERIC_LIMITS(__int128_t) - TEST_NUMERIC_LIMITS(const __int128_t) - TEST_NUMERIC_LIMITS(volatile __int128_t) - TEST_NUMERIC_LIMITS(const volatile __int128_t) -#endif - - // long long - TEST_NUMERIC_LIMITS(long long) - TEST_NUMERIC_LIMITS(const long long) - TEST_NUMERIC_LIMITS(volatile long long) - TEST_NUMERIC_LIMITS(const volatile long long) - - // unsigned short - TEST_NUMERIC_LIMITS(unsigned short) - TEST_NUMERIC_LIMITS(const unsigned short) - TEST_NUMERIC_LIMITS(volatile unsigned short) - TEST_NUMERIC_LIMITS(const volatile unsigned short) - - // unsigned int - TEST_NUMERIC_LIMITS(unsigned int) - TEST_NUMERIC_LIMITS(const unsigned int) - TEST_NUMERIC_LIMITS(volatile unsigned int) - TEST_NUMERIC_LIMITS(const volatile unsigned int) - - // unsigned long - TEST_NUMERIC_LIMITS(unsigned long) - TEST_NUMERIC_LIMITS(const unsigned long) - TEST_NUMERIC_LIMITS(volatile unsigned long) - TEST_NUMERIC_LIMITS(const volatile unsigned long) - - // unsigned long long - TEST_NUMERIC_LIMITS(unsigned long long) - TEST_NUMERIC_LIMITS(const unsigned long long) - TEST_NUMERIC_LIMITS(volatile unsigned long long) - TEST_NUMERIC_LIMITS(const volatile unsigned long long) - + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); +#if TEST_STD_VER >= 2020 && defined(__cpp_char8_t) + test_type(); +#endif // TEST_STD_VER >= 2020 && defined(__cpp_char8_t) + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - TEST_NUMERIC_LIMITS(__uint128_t) - TEST_NUMERIC_LIMITS(const __uint128_t) - TEST_NUMERIC_LIMITS(volatile __uint128_t) - TEST_NUMERIC_LIMITS(const volatile __uint128_t) -#endif - - // float - TEST_NUMERIC_LIMITS(float) - TEST_NUMERIC_LIMITS(const float) - TEST_NUMERIC_LIMITS(volatile float) - TEST_NUMERIC_LIMITS(const volatile float) - - // double - TEST_NUMERIC_LIMITS(double) - TEST_NUMERIC_LIMITS(const double) - TEST_NUMERIC_LIMITS(volatile double) - TEST_NUMERIC_LIMITS(const volatile double) - + test_type<__int128_t>(); +#endif // _LIBCUDACXX_HAS_NO_INT128 + test_type(); + test_type(); #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - // long double - TEST_NUMERIC_LIMITS(long double) - TEST_NUMERIC_LIMITS(const long double) - TEST_NUMERIC_LIMITS(volatile long double) - TEST_NUMERIC_LIMITS(const volatile long double) -#endif - - // other - TEST_NUMERIC_LIMITS(other) - TEST_NUMERIC_LIMITS(const other) - TEST_NUMERIC_LIMITS(volatile other) - TEST_NUMERIC_LIMITS(const volatile other) + test_type(); +#endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE +#if defined(_LIBCUDACXX_HAS_NVFP16) + test_type<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test_type<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp index 730adc30d36..cc64ed14686 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp @@ -14,15 +14,16 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::denorm_min() == expected); - assert(cuda::std::numeric_limits::denorm_min() == expected); - assert(cuda::std::numeric_limits::denorm_min() == expected); - assert(cuda::std::numeric_limits::denorm_min() == expected); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); } int main(int, char**) @@ -65,6 +66,12 @@ int main(int, char**) test(LDBL_TRUE_MIN); # endif #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(5.9604644775390625e-08)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41)); +#endif // _LIBCUDACXX_HAS_NVBF16 #if !defined(__FLT_DENORM_MIN__) && !defined(FLT_TRUE_MIN) # error Test has no expected values for floating point types #endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index 63ecf93515f..efce1ccf678 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -55,6 +55,11 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif - +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 11>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 8>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index 3295686ea49..32990ece4b1 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -59,6 +59,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 3>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 2>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp index 5bc22e7f5f2..fa42c5e8fe6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp @@ -14,15 +14,16 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - 
assert(cuda::std::numeric_limits::epsilon() == expected); - assert(cuda::std::numeric_limits::epsilon() == expected); - assert(cuda::std::numeric_limits::epsilon() == expected); - assert(cuda::std::numeric_limits::epsilon() == expected); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); } int main(int, char**) @@ -56,6 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_EPSILON); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(0.0009765625)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(0.0078125)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp index e62208d7e3b..5a0a05ab73b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, cuda::std::denorm_present>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, cuda::std::denorm_present>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp index 3a665fe2c9b..450e51b8111 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp index be41dabb02c..646f5e20160 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp index 2d13db35438..626b4110695 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp index d5cf5096bb7..20cd04d107e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp index 2d1c29f6f31..34527e300c5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "common.h" + // MSVC has issues with producing INF with divisions by zero. #if defined(_MSC_VER) # include @@ -24,10 +26,10 @@ template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::infinity() == expected); - assert(cuda::std::numeric_limits::infinity() == expected); - assert(cuda::std::numeric_limits::infinity() == expected); - assert(cuda::std::numeric_limits::infinity() == expected); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); } int main(int, char**) @@ -62,6 +64,12 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(1. / 0.); # endif +# if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(1.0 / 0.0)); +# endif // _LIBCUDACXX_HAS_NVFP16 +# if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0)); +# endif // _LIBCUDACXX_HAS_NVBF16 // MSVC has issues with producing INF with divisions by zero. 
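+// The fallback below uses the INFINITY macro instead; either way, the
+// narrowing conversions __double2half / __double2bfloat16 preserve infinity,
+// so both branches compare against the bit patterns the limits
+// specializations return (0x7c00 for __half, 0x7f80 for __nv_bfloat16).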
#else test(INFINITY); @@ -69,11 +77,13 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(INFINITY); # endif +# if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(INFINITY)); +# endif // _LIBCUDACXX_HAS_NVFP16 +# if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(INFINITY)); +# endif // _LIBCUDACXX_HAS_NVBF16 #endif return 0; } - -#ifndef TEST_COMPILER_NVRTC -float zero = 0; -#endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp index 2dd4bd94fbc..9e671c5d905 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp index be45efae70c..cfc9a6cab90 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp index 6221cd6ed59..945347ff4b5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp index 3d166f31f28..65dd98fdb04 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if 
defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp index 7b1adabf0c7..6d82269e1c8 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp index d7f98766343..eb39869bf24 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp index 6fec93e4a3d..e3b832dfd9b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp @@ -16,18 +16,19 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); } @@ -35,6 +36,7 @@ int main(int, char**) { test(false); test(CHAR_MIN); + test(SCHAR_MIN); test(0); #ifndef TEST_COMPILER_NVRTC @@ -64,6 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(-LDBL_MAX); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(-65504.0)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) 
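+  // -3.3895313892515355e+38 is the largest finite bfloat16 magnitude,
+  // (2 - 2^-7) * 2^127 (raw encoding 0x7f7f), negated; as with -65504.0 for
+  // __half above, lowest() == -max() holds because both formats are
+  // sign-magnitude.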
+ test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index 67c94051729..7ba6dabb1d2 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -16,18 +16,19 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); } @@ -64,6 +65,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MAX); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(65504.0)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp index cd5892e6c8c..92b3d13ea61 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 5>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 4>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp index aeb9189d315..81d5ae07795 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 16>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 128>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp index ca0eb2917f6..4c426b37460 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 4>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 38>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index 53d196d2a51..c24c3fde869 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -16,18 +16,19 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); } @@ -65,6 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MIN); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(6.103515625e-05)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp index b075bcff87d..e3150f8dc8e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, -13>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, -125>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp index c787cf4caab..cbca8e04171 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, -4>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, -37>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp index ce38b3ed60d..74e7f427941 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp @@ -38,7 +38,9 @@ __host__ __device__ void test_imp(cuda::std::false_type) template __host__ __device__ inline void test() { - test_imp(cuda::std::is_floating_point()); + constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + + test_imp(cuda::std::integral_constant{}); } int main(int, char**) @@ -72,6 +74,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp index 5a709b3aefc..9765db6f760 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, FLT_RADIX>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, FLT_RADIX>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp index 01d10e80fb9..ba5049fc49f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp @@ -14,15 +14,16 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::round_error() == expected); - assert(cuda::std::numeric_limits::round_error() == expected); - assert(cuda::std::numeric_limits::round_error() == expected); - 
assert(cuda::std::numeric_limits::round_error() == expected); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); } int main(int, char**) @@ -56,6 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(0.5); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(0.5)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(0.5)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp index 7a7099662f0..3fb436381a7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, cuda::std::round_to_nearest>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, cuda::std::round_to_nearest>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp index 164d54c5741..69ba66038de 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp @@ -38,7 +38,9 @@ __host__ __device__ void test_imp(cuda::std::false_type) template __host__ __device__ inline void test() { - test_imp(cuda::std::is_floating_point()); + constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + + test_imp(cuda::std::integral_constant{}); } int main(int, char**) @@ -72,6 +74,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index 70d832dc547..70cde2711a1 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); 
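+  // As for float and double in this implementation, neither extended type
+  // reports tinyness detected before rounding.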
+#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index 5c66acb56ce..7dd7eee68cc 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -60,6 +60,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } From 450b6a8462d0366afbd97930ba90677160a47e48 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 29 Jan 2025 15:42:37 -0800 Subject: [PATCH 2/8] Backport PRs #3201, #3523, #3547, #3580 to the 2.8.x branch. (#3536) * [FEA]: Introduce Python module with CCCL headers (#3201) * Add cccl/python/cuda_cccl directory and use from cuda_parallel, cuda_cooperative * Run `copy_cccl_headers_to_aude_include()` before `setup()` * Create python/cuda_cccl/cuda/_include/__init__.py, then simply import cuda._include to find the include path. * Add cuda.cccl._version exactly as for cuda.cooperative and cuda.parallel * Bug fix: cuda/_include only exists after shutil.copytree() ran. * Use `f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl"` in setup.py * Remove CustomBuildCommand, CustomWheelBuild in cuda_parallel/setup.py (they are equivalent to the default functions) * Replace := operator (needs Python 3.8+) * Fix oversights: remove `pip3 install ./cuda_cccl` lines from README.md * Restore original README.md: `pip3 install -e` now works on first pass. * cuda_cccl/README.md: FOR INTERNAL USE ONLY * Remove `$pymajor.$pyminor.` prefix in cuda_cccl _version.py (as suggested under https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894035917) Command used: ci/update_version.sh 2 8 0 * Modernize pyproject.toml, setup.py Trigger for this change: * https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894043178 * https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894044996 * Install CCCL headers under cuda.cccl.include Trigger for this change: * https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894048562 Unexpected accidental discovery: cuda.cooperative unit tests pass without CCCL headers entirely. * Factor out cuda_cccl/cuda/cccl/include_paths.py * Reuse cuda_cccl/cuda/cccl/include_paths.py from cuda_cooperative * Add missing Copyright notice. * Add missing __init__.py (cuda.cccl) * Add `"cuda.cccl"` to `autodoc.mock_imports` * Move cuda.cccl.include_paths into function where it is used. (Attempt to resolve Build and Verify Docs failure.) * Add # TODO: move this to a module-level import * Modernize cuda_cooperative/pyproject.toml, setup.py * Convert cuda_cooperative to use hatchling as build backend. * Revert "Convert cuda_cooperative to use hatchling as build backend." This reverts commit 61637d608da06fcf6851ef6197f88b5e7dbc3bbe. * Move numpy from [build-system] requires -> [project] dependencies * Move pyproject.toml [project] dependencies -> setup.py install_requires, to be able to use CCCL_PATH * Remove copy_license() and use license_files=["../../LICENSE"] instead. 
* Further modernize cuda_cccl/setup.py to use pathlib * Trivial simplifications in cuda_cccl/pyproject.toml * Further simplify cuda_cccl/pyproject.toml, setup.py: remove inconsequential code * Make cuda_cooperative/pyproject.toml more similar to cuda_cccl/pyproject.toml * Add taplo-pre-commit to .pre-commit-config.yaml * taplo-pre-commit auto-fixes * Use pathlib in cuda_cooperative/setup.py * CCCL_PYTHON_PATH in cuda_cooperative/setup.py * Modernize cuda_parallel/pyproject.toml, setup.py * Use pathlib in cuda_parallel/setup.py * Add `# TOML lint & format` comment. * Replace MANIFEST.in with `[tool.setuptools.package-data]` section in pyproject.toml * Use pathlib in cuda/cccl/include_paths.py * pre-commit autoupdate (EXCEPT clang-format, which was manually restored) * Fixes after git merge main * Resolve warning: AttributeError: '_Reduce' object has no attribute 'build_result' ``` =========================================================================== warnings summary =========================================================================== tests/test_reduce.py::test_reduce_non_contiguous /home/coder/cccl/python/devenv/lib/python3.12/site-packages/_pytest/unraisableexception.py:85: PytestUnraisableExceptionWarning: Exception ignored in: Traceback (most recent call last): File "/home/coder/cccl/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py", line 132, in __del__ bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) ^^^^^^^^^^^^^^^^^ AttributeError: '_Reduce' object has no attribute 'build_result' warnings.warn(pytest.PytestUnraisableExceptionWarning(msg)) -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html ============================================================= 1 passed, 93 deselected, 1 warning in 0.44s ============================================================== ``` * Move `copy_cccl_headers_to_cuda_cccl_include()` functionality to `class CustomBuildPy` * Introduce cuda_cooperative/constraints.txt * Also add cuda_parallel/constraints.txt * Add `--constraint constraints.txt` in ci/test_python.sh * Update Copyright dates * Switch to https://github.com/ComPWA/taplo-pre-commit (the other repo has been archived by the owner on Jul 1, 2024) For completeness: The other repo took a long time to install into the pre-commit cache; so long it lead to timeouts in the CCCL CI. * Remove unused cuda_parallel jinja2 dependency (noticed by chance). * Remove constraints.txt files, advertise running `pip install cuda-cccl` first instead. * Make cuda_cooperative, cuda_parallel testing completely independent. * Run only test_python.sh [skip-rapids][skip-matx][skip-docs][skip-vdc] * Try using another runner (because V100 runners seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Fix sign-compare warning (#3408) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Revert "Try using another runner (because V100 runners seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc]" This reverts commit ea33a218ed77a075156cd1b332047202adb25aa2. 
Error message: https://github.com/NVIDIA/cccl/pull/3201#issuecomment-2594012971 * Try using A100 runner (because V100 runners still seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Also show cuda-cooperative site-packages, cuda-parallel site-packages (after pip install) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Try using l4 runner (because V100 runners still seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Restore original ci/matrix.yaml [skip-rapids] * Use for loop in test_python.sh to avoid code duplication. * Run only test_python.sh [skip-rapids][skip-matx][skip-docs][skip-vdc][skip pre-commit.ci] * Comment out taplo-lint in pre-commit config [skip-rapids][skip-matx][skip-docs][skip-vdc] * Revert "Run only test_python.sh [skip-rapids][skip-matx][skip-docs][skip-vdc][skip pre-commit.ci]" This reverts commit ec206fd8b50a6a293e00a5825b579e125010b13d. * Implement suggestion by @shwina (https://github.com/NVIDIA/cccl/pull/3201#pullrequestreview-2556918460) * Address feedback by @leofang --------- Co-authored-by: Bernhard Manfred Gruber * cuda.parallel: invoke pytest directly rather than via `python -m pytest` (#3523) Co-authored-by: Ashwin Srinath * Copy file from PR #3547 (bugfix/drop_pipe_in_lit by @wmaxey) * Revert "cuda.parallel: invoke pytest directly rather than via `python -m pytest` (#3523)" This reverts commit a2e21cbdd2fa15a35b3a0df8eb7e2fc84adc46bc. * Replace pipes.quote with shlex.quote in lit config (#3547) * Replace pipes.quote with shlex.quote * Drop TBB run on windows to unblock CI * Update ci/matrix.yaml Co-authored-by: Michael Schellenberger Costa Co-authored-by: Bernhard Manfred Gruber * Remove nvks runners from testing pool. (#3580) --------- Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Co-authored-by: Ashwin Srinath Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Co-authored-by: Michael Schellenberger Costa Co-authored-by: Allison Piper --- .pre-commit-config.yaml | 11 +++ ci/matrix.yaml | 19 ++-- ci/test_python.sh | 33 ++++--- ci/update_version.sh | 2 + docs/repo.toml | 1 + .../test/utils/libcudacxx/test/config.py | 15 +-- python/cuda_cccl/.gitignore | 2 + python/cuda_cccl/README.md | 3 + python/cuda_cccl/cuda/cccl/__init__.py | 8 ++ python/cuda_cccl/cuda/cccl/_version.py | 7 ++ python/cuda_cccl/cuda/cccl/include_paths.py | 63 ++++++++++++ python/cuda_cccl/pyproject.toml | 29 ++++++ python/cuda_cccl/setup.py | 51 ++++++++++ python/cuda_cooperative/.gitignore | 1 - python/cuda_cooperative/MANIFEST.in | 1 - python/cuda_cooperative/README.md | 1 + .../cuda/cooperative/experimental/_nvrtc.py | 46 ++------- python/cuda_cooperative/pyproject.toml | 34 ++++++- python/cuda_cooperative/setup.py | 88 +---------------- python/cuda_parallel/.gitignore | 1 - python/cuda_parallel/MANIFEST.in | 1 - python/cuda_parallel/README.md | 1 + .../cuda/parallel/experimental/_bindings.py | 42 +------- .../experimental/algorithms/reduce.py | 5 + python/cuda_parallel/pyproject.toml | 32 +++++-- python/cuda_parallel/setup.py | 96 +++---------------- 26 files changed, 303 insertions(+), 290 deletions(-) create mode 100644 python/cuda_cccl/.gitignore create mode 100644 python/cuda_cccl/README.md create mode 100644 python/cuda_cccl/cuda/cccl/__init__.py create mode 100644 python/cuda_cccl/cuda/cccl/_version.py create mode 100644 python/cuda_cccl/cuda/cccl/include_paths.py create mode 100644 python/cuda_cccl/pyproject.toml create mode 100644 python/cuda_cccl/setup.py 
delete mode 100644 python/cuda_cooperative/MANIFEST.in delete mode 100644 python/cuda_parallel/MANIFEST.in diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dd411ba39b..37ead30ca95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,17 @@ repos: hooks: - id: ruff # linter - id: ruff-format # formatter + + # TOML lint & format + - repo: https://github.com/ComPWA/taplo-pre-commit + rev: v0.9.3 + hooks: + # See https://github.com/NVIDIA/cccl/issues/3426 + # - id: taplo-lint + # exclude: "^docs/" + - id: taplo-format + exclude: "^docs/" + - repo: https://github.com/codespell-project/codespell rev: v2.3.0 hooks: diff --git a/ci/matrix.yaml b/ci/matrix.yaml index fd7f2f079c4..b04923fad9a 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,8 +19,9 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['intel', 'msvc2019']} - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']} - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']} + - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + # Disabled until we figure out the issue with the TBB dll + #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} # Split up cub tests: - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} @@ -278,13 +279,13 @@ projects: # testing -> Runner with GPU is in a nv-gh-runners testing pool gpus: - v100: { sm: 70 } # 32 GB, 40 runners - t4: { sm: 75, testing: true } # 16 GB, 8 runners - rtx2080: { sm: 75, testing: true } # 8 GB, 8 runners - rtxa6000: { sm: 86, testing: true } # 48 GB, 12 runners - l4: { sm: 89, testing: true } # 24 GB, 48 runners - rtx4090: { sm: 89, testing: true } # 24 GB, 10 runners - h100: { sm: 90, testing: true } # 80 GB, 16 runners + v100: { sm: 70 } # 32 GB, 40 runners + t4: { sm: 75 } # 16 GB, 10 runners + rtx2080: { sm: 75 } # 8 GB, 12 runners + rtxa6000: { sm: 86 } # 48 GB, 12 runners + l4: { sm: 89 } # 24 GB, 48 runners + rtx4090: { sm: 89 } # 24 GB, 10 runners + h100: { sm: 90 } # 80 GB, 16 runners # Tags are used to define a `matrix job` in the workflow section. # diff --git a/ci/test_python.sh b/ci/test_python.sh index bd66cc57716..34900fdb8e0 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,25 +8,28 @@ print_environment_details fail_if_no_gpu -readonly prefix="${BUILD_DIR}/python/" -export PYTHONPATH="${prefix}:${PYTHONPATH:-}" +begin_group "⚙️ Existing site-packages" +pip freeze +end_group "⚙️ Existing site-packages" -pushd ../python/cuda_cooperative >/dev/null +for module in cuda_parallel cuda_cooperative; do -run_command "⚙️ Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_cooperative" python -m pytest -v ./tests + pushd "../python/${module}" >/dev/null -popd >/dev/null + TEMP_VENV_DIR="/tmp/${module}_venv" + rm -rf "${TEMP_VENV_DIR}" + python -m venv "${TEMP_VENV_DIR}" + . 
"${TEMP_VENV_DIR}/bin/activate" + echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt + run_command "⚙️ Pip install ${module}" pip install -c /tmp/cuda-cccl_constraints.txt .[test] + begin_group "⚙️ ${module} site-packages" + pip freeze + end_group "⚙️ ${module} site-packages" + run_command "🚀 Pytest ${module}" python -m pytest -v ./tests + deactivate -pushd ../python/cuda_parallel >/dev/null + popd >/dev/null -# Temporarily install the package twice to populate include directory as part of the first installation -# and to let manifest discover these includes during the second installation. Do not forget to remove the -# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed. -run_command "⚙️ Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "⚙️ Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_parallel" python -m pytest -v ./tests - -popd >/dev/null +done print_time_summary diff --git a/ci/update_version.sh b/ci/update_version.sh index c43303449bb..6a25a837d50 100755 --- a/ci/update_version.sh +++ b/ci/update_version.sh @@ -37,6 +37,7 @@ CUB_CMAKE_VERSION_FILE="lib/cmake/cub/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="lib/cmake/thrust/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="lib/cmake/cudax/cudax-config-version.cmake" +CUDA_CCCL_VERSION_FILE="python/cuda_cccl/cuda/cccl/_version.py" CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py" CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py" @@ -110,6 +111,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" " update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" +update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$major.$minor.$patch\"" update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" diff --git a/docs/repo.toml b/docs/repo.toml index f5be6925fb5..3313723c527 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -348,6 +348,7 @@ autodoc.mock_imports = [ "numba", "pynvjitlink", "cuda.bindings", + "cuda.cccl", "llvmlite", "numpy", ] diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py index d52b869dd17..af90b9fcbec 100644 --- a/libcudacxx/test/utils/libcudacxx/test/config.py +++ b/libcudacxx/test/utils/libcudacxx/test/config.py @@ -8,7 +8,6 @@ import ctypes import os -import pipes import platform import re import shlex @@ -423,10 +422,6 @@ def configure_cxx(self): ) ) - if "icc" in self.config.available_features: - self.cxx.link_flags += ["-lirc"] - self.cxx.compile_flags += ["-Xcompiler=-diag-disable=10441"] - def _configure_clang_cl(self, clang_path): def _split_env_var(var): return [p.strip() for p in os.environ.get(var, "").split(";") if p.strip()] @@ -1516,14 +1511,14 @@ def configure_modules(self): def configure_substitutions(self): sub = 
self.config.substitutions - cxx_path = pipes.quote(self.cxx.path) + cxx_path = shlex.quote(self.cxx.path) # Configure compiler substitutions sub.append(("%cxx", cxx_path)) sub.append(("%libcxx_src_root", self.libcudacxx_src_root)) # Configure flags substitutions - flags_str = " ".join([pipes.quote(f) for f in self.cxx.flags]) - compile_flags_str = " ".join([pipes.quote(f) for f in self.cxx.compile_flags]) - link_flags_str = " ".join([pipes.quote(f) for f in self.cxx.link_flags]) + flags_str = " ".join([shlex.quote(f) for f in self.cxx.flags]) + compile_flags_str = " ".join([shlex.quote(f) for f in self.cxx.compile_flags]) + link_flags_str = " ".join([shlex.quote(f) for f in self.cxx.link_flags]) all_flags = "%s %s %s" % (flags_str, compile_flags_str, link_flags_str) sub.append(("%flags", flags_str)) sub.append(("%compile_flags", compile_flags_str)) @@ -1552,7 +1547,7 @@ def configure_substitutions(self): sub.append(("%run", "%t.exe")) # Configure not program substitutions not_py = os.path.join(self.libcudacxx_src_root, "test", "utils", "not.py") - not_str = "%s %s " % (pipes.quote(sys.executable), pipes.quote(not_py)) + not_str = "%s %s " % (shlex.quote(sys.executable), shlex.quote(not_py)) sub.append(("not ", not_str)) if self.get_lit_conf("libcudacxx_gdb"): sub.append(("%libcxx_gdb", self.get_lit_conf("libcudacxx_gdb"))) diff --git a/python/cuda_cccl/.gitignore b/python/cuda_cccl/.gitignore new file mode 100644 index 00000000000..24ec757199f --- /dev/null +++ b/python/cuda_cccl/.gitignore @@ -0,0 +1,2 @@ +cuda/cccl/include +*egg-info diff --git a/python/cuda_cccl/README.md b/python/cuda_cccl/README.md new file mode 100644 index 00000000000..37f020b6df6 --- /dev/null +++ b/python/cuda_cccl/README.md @@ -0,0 +1,3 @@ +## Note + +This package is currently FOR INTERNAL USE ONLY and not meant to be used/installed explicitly. diff --git a/python/cuda_cccl/cuda/cccl/__init__.py b/python/cuda_cccl/cuda/cccl/__init__.py new file mode 100644 index 00000000000..5288f071942 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from cuda.cccl._version import __version__ +from cuda.cccl.include_paths import get_include_paths + +__all__ = ["__version__", "get_include_paths"] diff --git a/python/cuda_cccl/cuda/cccl/_version.py b/python/cuda_cccl/cuda/cccl/_version.py new file mode 100644 index 00000000000..ec7c29a266e --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/_version.py @@ -0,0 +1,7 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This file is generated by ci/update_version.sh +# Do not edit this file manually. +__version__ = "2.8.0" diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py new file mode 100644 index 00000000000..da8246b9195 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/include_paths.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import shutil +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import Optional + + +def _get_cuda_path() -> Optional[Path]: + cuda_path = os.environ.get("CUDA_PATH") + if cuda_path: + cuda_path = Path(cuda_path) + if cuda_path.exists(): + return cuda_path + + nvcc_path = shutil.which("nvcc") + if nvcc_path: + return Path(nvcc_path).parent.parent + + default_path = Path("/usr/local/cuda") + if default_path.exists(): + return default_path + + return None + + +@dataclass +class IncludePaths: + cuda: Optional[Path] + libcudacxx: Optional[Path] + cub: Optional[Path] + thrust: Optional[Path] + + def as_tuple(self): + # Note: higher-level ... lower-level order: + return (self.thrust, self.cub, self.libcudacxx, self.cuda) + + +@lru_cache() +def get_include_paths() -> IncludePaths: + # TODO: once docs env supports Python >= 3.9, we + # can move this to a module-level import. + from importlib.resources import as_file, files + + cuda_incl = None + cuda_path = _get_cuda_path() + if cuda_path is not None: + cuda_incl = cuda_path / "include" + + with as_file(files("cuda.cccl.include")) as f: + cccl_incl = Path(f) + assert cccl_incl.exists() + + return IncludePaths( + cuda=cuda_incl, + libcudacxx=cccl_incl / "libcudacxx", + cub=cccl_incl, + thrust=cccl_incl, + ) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml new file mode 100644 index 00000000000..ada06301a4c --- /dev/null +++ b/python/cuda_cccl/pyproject.toml @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +[build-system] +requires = ["setuptools>=61.0.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "cuda-cccl" +description = "Experimental Package with CCCL headers to support JIT compilation" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dynamic = ["version", "readme"] + +[project.urls] +Homepage = "https://github.com/NVIDIA/cccl" + +[tool.setuptools.dynamic] +version = { attr = "cuda.cccl._version.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + +[tool.setuptools.package-data] +cuda = ["cccl/include/**/*"] diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py new file mode 100644 index 00000000000..f6e5e3fa033 --- /dev/null +++ b/python/cuda_cccl/setup.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import shutil +from pathlib import Path + +from setuptools import setup +from setuptools.command.build_py import build_py + +PROJECT_PATH = Path(__file__).resolve().parent +CCCL_PATH = PROJECT_PATH.parents[1] + + +class CustomBuildPy(build_py): + """Copy CCCL headers BEFORE super().run() + + Note that the CCCL headers cannot be referenced directly: + setuptools (and pyproject.toml) does not support relative paths that + reference files outside the package directory (like ../../). + This is a restriction designed to avoid inadvertently packaging files + that are outside the source tree. 
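+ + The headers are therefore copied into cuda/cccl/include inside the package tree (see run() below), so that setuptools can pick them up via the [tool.setuptools.package-data] entry in pyproject.toml.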
+ """ + + def run(self): + cccl_headers = [ + ("cub", "cub"), + ("libcudacxx", "include"), + ("thrust", "thrust"), + ] + + inc_path = PROJECT_PATH / "cuda" / "cccl" / "include" + inc_path.mkdir(parents=True, exist_ok=True) + + for proj_dir, header_dir in cccl_headers: + src_path = CCCL_PATH / proj_dir / header_dir + dst_path = inc_path / proj_dir + if dst_path.exists(): + shutil.rmtree(dst_path) + shutil.copytree(src_path, dst_path) + + init_py_path = inc_path / "__init__.py" + init_py_path.write_text("# Intentionally empty.\n") + + super().run() + + +setup( + license_files=["../../LICENSE"], + cmdclass={"build_py": CustomBuildPy}, +) diff --git a/python/cuda_cooperative/.gitignore b/python/cuda_cooperative/.gitignore index 15c09b246c1..a9904c10554 100644 --- a/python/cuda_cooperative/.gitignore +++ b/python/cuda_cooperative/.gitignore @@ -1,3 +1,2 @@ -cuda/_include env *egg-info diff --git a/python/cuda_cooperative/MANIFEST.in b/python/cuda_cooperative/MANIFEST.in deleted file mode 100644 index 848cbfe2e81..00000000000 --- a/python/cuda_cooperative/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include cuda/_include * diff --git a/python/cuda_cooperative/README.md b/python/cuda_cooperative/README.md index c202d1d6c17..673e130bbe0 100644 --- a/python/cuda_cooperative/README.md +++ b/python/cuda_cooperative/README.md @@ -7,6 +7,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html. ## Local development ```bash +pip3 install -e ../cuda_cccl pip3 install -e .[test] pytest -v ./tests/ ``` diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py index 1e86dd45dfe..a1d269fd987 100644 --- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py +++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py @@ -3,9 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import functools -import importlib.resources as pkg_resources -import os -import shutil from cuda.bindings import nvrtc from cuda.cooperative.experimental._caching import disk_cache @@ -20,22 +17,6 @@ def CHECK_NVRTC(err, prog): raise RuntimeError(f"NVRTC error: {log.decode('ascii')}") -def get_cuda_path(): - cuda_path = os.environ.get("CUDA_PATH", "") - if os.path.exists(cuda_path): - return cuda_path - - nvcc_path = shutil.which("nvcc") - if nvcc_path is not None: - return os.path.dirname(os.path.dirname(nvcc_path)) - - default_path = "/usr/local/cuda" - if os.path.exists(default_path): - return default_path - - return None - - # cpp is the C++ source code # cc = 800 for Ampere, 900 Hopper, etc # rdc is true or false @@ -47,24 +28,15 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version): check_in("rdc", rdc, [True, False]) check_in("code", code, ["lto", "ptx"]) - with pkg_resources.path("cuda", "_include") as include_path: - # Using `.parent` for compatibility with pip install --editable: - include_path = pkg_resources.files("cuda.cooperative").parent.joinpath( - "_include" - ) - cub_path = include_path - thrust_path = include_path - libcudacxx_path = os.path.join(include_path, "libcudacxx") - cuda_include_path = os.path.join(get_cuda_path(), "include") - - opts = [ - b"--std=c++17", - bytes(f"--include-path={cub_path}", encoding="ascii"), - bytes(f"--include-path={thrust_path}", encoding="ascii"), - bytes(f"--include-path={libcudacxx_path}", encoding="ascii"), - bytes(f"--include-path={cuda_include_path}", encoding="ascii"), - bytes(f"--gpu-architecture=compute_{cc}", 
encoding="ascii"), - ] + opts = [b"--std=c++17"] + + # TODO: move this to a module-level import (after docs env modernization). + from cuda.cccl import get_include_paths + + for path in get_include_paths().as_tuple(): + if path is not None: + opts += [f"--include-path={path}".encode("ascii")] + opts += [f"--gpu-architecture=compute_{cc}".encode("ascii")] if rdc: opts += [b"--relocatable-device-code=true"] diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml index 017c0be1e56..788e1e6d5d8 100644 --- a/python/cuda_cooperative/pyproject.toml +++ b/python/cuda_cooperative/pyproject.toml @@ -1,11 +1,41 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception [build-system] -requires = ["packaging", "setuptools>=61.0.0", "wheel"] +requires = ["setuptools>=61.0.0"] build-backend = "setuptools.build_meta" +[project] +name = "cuda-cooperative" +description = "Experimental Core Library for CUDA Python" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dependencies = [ + "cuda-cccl", + "numpy", + "numba>=0.60.0", + "pynvjitlink-cu12>=0.2.4", + "cuda-python==12.*", + "jinja2", +] +dynamic = ["version", "readme"] + +[project.optional-dependencies] +test = ["pytest", "pytest-xdist"] + +[project.urls] +Homepage = "https://developer.nvidia.com/" + +[tool.setuptools.dynamic] +version = { attr = "cuda.cooperative._version.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + [tool.ruff] extend = "../../pyproject.toml" diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py index 5f954086cfe..b8dd6502515 100644 --- a/python/cuda_cooperative/setup.py +++ b/python/cuda_cooperative/setup.py @@ -1,91 +1,9 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os -import shutil - -from setuptools import Command, find_namespace_packages, setup -from setuptools.command.build_py import build_py -from wheel.bdist_wheel import bdist_wheel - -project_path = os.path.abspath(os.path.dirname(__file__)) -cccl_path = os.path.abspath(os.path.join(project_path, "..", "..")) -cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]] -__version__ = None -with open(os.path.join(project_path, "cuda", "cooperative", "_version.py")) as f: - exec(f.read()) -assert __version__ is not None -ver = __version__ -del __version__ - - -with open("README.md") as f: - long_description = f.read() - - -class CustomBuildCommand(build_py): - def run(self): - self.run_command("package_cccl") - build_py.run(self) - - -class CustomWheelBuild(bdist_wheel): - def run(self): - self.run_command("package_cccl") - super().run() - - -class PackageCCCLCommand(Command): - description = "Generate additional files" - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - for proj_dir, header_dir in cccl_headers: - src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir)) - dst_path = os.path.join(project_path, "cuda", "_include", proj_dir) - if os.path.exists(dst_path): - shutil.rmtree(dst_path) - shutil.copytree(src_path, dst_path) - +from setuptools import setup setup( - name="cuda-cooperative", - version=ver, - description="Experimental Core Library for CUDA Python", - long_description=long_description, - long_description_content_type="text/markdown", - author="NVIDIA Corporation", - classifiers=[ - "Programming Language :: Python :: 3 :: Only", - "Environment :: GPU :: NVIDIA CUDA", - ], - packages=find_namespace_packages(include=["cuda.*"]), - python_requires=">=3.9", - install_requires=[ - "numba>=0.60.0", - "pynvjitlink-cu12>=0.2.4", - "cuda-python", - "jinja2", - ], - extras_require={ - "test": [ - "pytest", - "pytest-xdist", - ] - }, - cmdclass={ - "package_cccl": PackageCCCLCommand, - "build_py": CustomBuildCommand, - "bdist_wheel": CustomWheelBuild, - }, - include_package_data=True, - license="Apache-2.0 with LLVM exception", - license_files=("../../LICENSE",), + license_files=["../../LICENSE"], ) diff --git a/python/cuda_parallel/.gitignore b/python/cuda_parallel/.gitignore index 8e0d030ff6a..7fc9da1604e 100644 --- a/python/cuda_parallel/.gitignore +++ b/python/cuda_parallel/.gitignore @@ -1,4 +1,3 @@ -cuda/_include env *egg-info *so diff --git a/python/cuda_parallel/MANIFEST.in b/python/cuda_parallel/MANIFEST.in deleted file mode 100644 index 848cbfe2e81..00000000000 --- a/python/cuda_parallel/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include cuda/_include * diff --git a/python/cuda_parallel/README.md b/python/cuda_parallel/README.md index 98a3a3c92d0..1dad4b0f03e 100644 --- a/python/cuda_parallel/README.md +++ b/python/cuda_parallel/README.md @@ -7,6 +7,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html. 
## Local development ```bash +pip3 install -e ../cuda_cccl pip3 install -e .[test] pytest -v ./tests/ ``` diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py index c19ceebbf3e..ffc35ee2a87 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py @@ -4,28 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import ctypes -import os -import shutil from functools import lru_cache -from typing import List, Optional +from typing import List -from . import _cccl as cccl - - -def _get_cuda_path() -> Optional[str]: - cuda_path = os.environ.get("CUDA_PATH", "") - if os.path.exists(cuda_path): - return cuda_path - - nvcc_path = shutil.which("nvcc") - if nvcc_path is not None: - return os.path.dirname(os.path.dirname(nvcc_path)) - - default_path = "/usr/local/cuda" - if os.path.exists(default_path): - return default_path +from cuda.cccl import get_include_paths # type: ignore[import-not-found] - return None +from . import _cccl as cccl @lru_cache() @@ -55,27 +39,9 @@ def get_bindings() -> ctypes.CDLL: @lru_cache() def get_paths() -> List[bytes]: - # TODO: once docs env supports Python >= 3.9, we - # can move this to a module-level import. - from importlib.resources import as_file, files - - with as_file(files("cuda.parallel")) as f: - # Using `.parent` for compatibility with pip install --editable: - cub_include_path = str(f.parent / "_include") - thrust_include_path = cub_include_path - libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx")) - cuda_include_path = None - cuda_path = _get_cuda_path() - if cuda_path is not None: - cuda_include_path = str(os.path.join(cuda_path, "include")) paths = [ f"-I{path}".encode() - for path in ( - cub_include_path, - thrust_include_path, - libcudacxx_include_path, - cuda_include_path, - ) + for path in get_include_paths().as_tuple() if path is not None ] return paths diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index b99b5c4c9e1..41c0a3449e2 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -55,6 +55,9 @@ def __init__( op: Callable, h_init: np.ndarray, ): + # Referenced from __del__: + self.build_result = None + d_in_cccl = cccl.to_cccl_iter(d_in) self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name( d_in_cccl.value_type.type.value @@ -128,6 +131,8 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray return temp_storage_bytes.value def __del__(self): + if self.build_result is None: + return bindings = get_bindings() bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml index c73736e496a..e7d2b9f0081 100644 --- a/python/cuda_parallel/pyproject.toml +++ b/python/cuda_parallel/pyproject.toml @@ -1,19 +1,39 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception [build-system] -requires = ["packaging", "setuptools>=61.0.0", "wheel"] +requires = ["setuptools>=61.0.0"] build-backend = "setuptools.build_meta" +[project] +name = "cuda-parallel" +description = "Experimental Core Library for CUDA Python" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python==12.*"] +dynamic = ["version", "readme"] + +[project.optional-dependencies] +test = ["pytest", "pytest-xdist", "cupy-cuda12x", "typing_extensions"] + +[project.urls] +Homepage = "https://developer.nvidia.com/" + +[tool.setuptools.dynamic] +version = { attr = "cuda.parallel._version.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + [tool.mypy] python_version = "3.10" [[tool.mypy.overrides]] -module = [ - "numba.*", - "llvmlite" -] +module = ["numba.*", "llvmlite"] ignore_missing_imports = true follow_imports = "skip" diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py index bb7cbb3ac44..c5c9fcd3c32 100644 --- a/python/cuda_parallel/setup.py +++ b/python/cuda_parallel/setup.py @@ -1,61 +1,15 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os -import shutil import subprocess +from pathlib import Path -from setuptools import Command, Extension, find_namespace_packages, setup +from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -from setuptools.command.build_py import build_py -from wheel.bdist_wheel import bdist_wheel -project_path = os.path.abspath(os.path.dirname(__file__)) -cccl_path = os.path.abspath(os.path.join(project_path, "..", "..")) -cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]] -__version__ = None -with open(os.path.join(project_path, "cuda", "parallel", "_version.py")) as f: - exec(f.read()) -assert __version__ is not None -ver = __version__ -del __version__ - - -with open("README.md") as f: - long_description = f.read() - - -class CustomBuildCommand(build_py): - def run(self): - self.run_command("package_cccl") - build_py.run(self) - - -class CustomWheelBuild(bdist_wheel): - def run(self): - self.run_command("package_cccl") - super().run() - - -class PackageCCCLCommand(Command): - description = "Generate additional files" - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - for proj_dir, header_dir in cccl_headers: - src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir)) - # TODO Extract cccl headers into a standalone package - dst_path = os.path.join(project_path, "cuda", "_include", proj_dir) - if os.path.exists(dst_path): - shutil.rmtree(dst_path) - shutil.copytree(src_path, dst_path) +CCCL_PYTHON_PATH = Path(__file__).resolve().parents[1] +CCCL_PATH = CCCL_PYTHON_PATH.parent class CMakeExtension(Extension): @@ -69,53 +23,27 @@ def run(self): self.build_extension(ext) def build_extension(self, ext): - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + extdir = Path(self.get_ext_fullpath(ext.name)).resolve().parent cmake_args = [ "-DCCCL_ENABLE_C=YES", - 
"-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY=" + extdir, + f"-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY={extdir}", "-DCMAKE_BUILD_TYPE=Release", ] - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) + build_temp_path = Path(self.build_temp) + build_temp_path.mkdir(parents=True, exist_ok=True) - subprocess.check_call(["cmake", cccl_path] + cmake_args, cwd=self.build_temp) + subprocess.check_call(["cmake", CCCL_PATH] + cmake_args, cwd=build_temp_path) subprocess.check_call( ["cmake", "--build", ".", "--target", "cccl.c.parallel"], - cwd=self.build_temp, + cwd=build_temp_path, ) setup( - name="cuda-parallel", - version=ver, - description="Experimental Core Library for CUDA Python", - long_description=long_description, - long_description_content_type="text/markdown", - author="NVIDIA Corporation", - classifiers=[ - "Programming Language :: Python :: 3 :: Only", - "Environment :: GPU :: NVIDIA CUDA", - ], - packages=find_namespace_packages(include=["cuda.*"]), - python_requires=">=3.9", - # TODO: typing_extensions required for Python 3.7 docs env - install_requires=["numba>=0.60.0", "cuda-python", "jinja2", "typing_extensions"], - extras_require={ - "test": [ - "pytest", - "pytest-xdist", - "cupy-cuda12x", - ] - }, + license_files=["../../LICENSE"], cmdclass={ - "package_cccl": PackageCCCLCommand, - "build_py": CustomBuildCommand, - "bdist_wheel": CustomWheelBuild, "build_ext": BuildCMakeExtension, }, ext_modules=[CMakeExtension("cuda.parallel.experimental.cccl.c")], - include_package_data=True, - license="Apache-2.0 with LLVM exception", - license_files=("../../LICENSE",), ) From 2b5ed0b2fb10958f820b3c397b4dae008eb292ac Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Jan 2025 03:17:12 -0800 Subject: [PATCH 3/8] work around msvc bug exposed by `__type_index` in `type_list.h` (#3487) (#3537) simplify the fall-back implementation of `__type_index` to avoid causing MSVC to ICE. Co-authored-by: Eric Niebler Co-authored-by: Michael Schellenberger Costa --- .../cuda/std/__type_traits/type_list.h | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index 66652922ceb..461b9fc8fbe 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -473,29 +473,33 @@ using __type_index = __type_call<__detail::__type_index_fn<_Ip::value>, _Ts...>; namespace __detail { -template -struct __inherit_flat : _Ts... -{}; - template -struct __type_index_leaf +struct __type_tuple_elem { - using type = _Ty; + using type _CCCL_NODEBUG_ALIAS = _Ty; }; +template +struct __type_tupl; + +template +struct __type_tupl, _Ts...> : __type_tuple_elem<_Is, _Ts>... 
+{}; + +template +using __type_tuple = __type_tupl, _Ts...>; + template -_LIBCUDACXX_HIDE_FROM_ABI __type_index_leaf<_Ip, _Ty> __type_index_get(__type_index_leaf<_Ip, _Ty>*); +_LIBCUDACXX_HIDE_FROM_ABI __type_tuple_elem<_Ip, _Ty> __type_tuple_get(__type_tuple_elem<_Ip, _Ty>); -template -struct __type_index_large_size_fn; +template +using __type_tuple_element_t _CCCL_NODEBUG_ALIAS = + __type(__type_tuple<_Ts...>{}))>; -template -struct __type_index_large_size_fn> +struct __type_index_large_size_fn { template - using __call _CCCL_NODEBUG_ALIAS = // - __type( - static_cast<__inherit_flat<__type_index_leaf<_Is, _Ts>...>*>(nullptr)))>; + using __call _CCCL_NODEBUG_ALIAS = __type_tuple_element_t<_Ip::value, _Ts...>; }; template @@ -516,12 +520,8 @@ _CCCL_PP_REPEAT_REVERSE(_CCCL_META_UNROLL_LIMIT, _M1) # undef _M1 template -struct __type_index_select_fn // Default for larger indices -{ - template - using __call _CCCL_NODEBUG_ALIAS = - __type_call<__type_index_large_size_fn>, _Ip, _Ts...>; -}; +struct __type_index_select_fn : __type_index_large_size_fn // Default for larger indices +{}; template <> struct __type_index_select_fn // Fast implementation for smaller indices @@ -531,9 +531,19 @@ struct __type_index_select_fn // Fast implementation for smaller indices }; } // namespace __detail +# if !_CCCL_COMPILER(MSVC) + template using __type_index = __type_call<__detail::__type_index_select_fn<(_Ip::value < _CCCL_META_UNROLL_LIMIT)>, _Ip, _Ts...>; +# else // ^^^ !_CCCL_COMPILER(MSVC) ^^^ / vvv _CCCL_COMPILER(MSVC) vvv + +// Simplify the implementation for MSVC, which has trouble with the above +template +using __type_index = __detail::__type_index_large_size_fn::__call<_Ip, _Ts...>; + +# endif // !_CCCL_COMPILER(MSVC) + template using __type_index_c = __type_index, _Ts...>; From 863b25f5e2a37db7ee50aa31c7d15e95d7ef2228 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 30 Jan 2025 14:44:50 +0100 Subject: [PATCH 4/8] Update CI matrix to use NVKS nodes. (#3572) (#3605) * Update CI matrix to use NVKS nodes. * Update windows CI scripts to accept -arch. * Move all non-Catch2 device algo tests to lid0/lid1. This makes sure that they run in the correct CI config on appropriate hardware. 
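* For illustration, the renaming rule behind the previous bullet, restated as a self-contained Python sketch (the helper name is hypothetical; the authoritative logic is the `cub/test/CMakeLists.txt` hunk below):

```python
import re

def ci_test_name(test_name: str, launcher: int, is_fail_test: bool) -> str:
    # Non-Catch2 "device_*" tests gain a ".lid_<launcher>" suffix so that
    # they are scheduled into the matching lid0/lid1 CI configurations.
    if re.match(r"^device_", test_name) and not is_fail_test:
        return f"{test_name}.lid_{launcher}"
    return test_name

assert ci_test_name("device_reduce", 0, False) == "device_reduce.lid_0"
assert ci_test_name("block_scan", 0, False) == "block_scan"
```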
* Switch to all rtx queues: CUB -> RTXA6000 (48GiB) Thrust -> RTX4090 (24GiB) Others -> RTX2080 (8GiB) Co-authored-by: Allison Piper --- ci/matrix.yaml | 63 ++++++++++++++------------------- ci/windows/build_common.psm1 | 15 ++++++-- ci/windows/build_cub.ps1 | 8 +++-- ci/windows/build_cudax.ps1 | 8 +++-- ci/windows/build_libcudacxx.ps1 | 8 +++-- ci/windows/build_thrust.ps1 | 8 +++-- ci/windows/test_thrust.ps1 | 8 +++-- cub/test/CMakeLists.txt | 9 +++++ 8 files changed, 78 insertions(+), 49 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index b04923fad9a..7f1f86a0da9 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,26 +19,28 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['intel', 'msvc2019']} - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + - {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} # Disabled until we figure out the issue with the TBB dll - #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} + #- {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'} # Split up cub tests: - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']} - - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } # Modded builds: - {jobs: ['build'], std: [17, 20], ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'} # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. 
- - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} # default_projects: clang-cuda - {jobs: ['build'], std: [17, 20], cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # nvrtc: - - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all'} + - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'} # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: @@ -52,19 +54,19 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: [17, 20], cxx: ['gcc', 'clang'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc']} + - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} + - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080'} # cccl-infra: - - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']} - - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} - - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9'], gpu: 'rtx2080'} + - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'} + - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} nightly: # Edge-case jobs - - {jobs: ['limited'], project: 'cub', std: 17} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} + - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'} # Old CTK - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']} - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'} @@ -73,7 +75,11 @@ workflows: - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']} # Test current CTK - - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} + - {jobs: ['test'], project: 'cub', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'v100'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test'], project: 'thrust', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Modded builds: - {jobs: 
['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} @@ -92,26 +98,9 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['gcc12'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} - -# # These are waiting on the NVKS nodes: -# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} -# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]} -# - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]} -# # H100 runners are currently flakey, only build since those use CPU-only runners: -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} -# -# # nvrtc: -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12'] , gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080'} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. 
exclude: diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1 index 1eb5f1a9d63..151bb1f112e 100644 --- a/ci/windows/build_common.psm1 +++ b/ci/windows/build_common.psm1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -20,6 +24,12 @@ if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") { Write-Host "Detected cl.exe version: $CL_VERSION" } +$script:GLOBAL_CMAKE_OPTIONS = "" +if ($CUDA_ARCH -ne 0) { + $script:GLOBAL_CMAKE_OPTIONS += "-DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH" +} + + if (-not $env:CCCL_BUILD_INFIX) { $env:CCCL_BUILD_INFIX = "" } @@ -56,6 +66,7 @@ Write-Host "NVCC_VERSION=$NVCC_VERSION" Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL" Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL" Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX" +Write-Host "GLOBAL_CMAKE_OPTIONS=$script:GLOBAL_CMAKE_OPTIONS" Write-Host "Current commit is:" Write-Host "$(git log -1 --format=short)" Write-Host "========================================" @@ -82,7 +93,7 @@ function configure_preset { pushd ".." # Echo and execute command to stdout: - $configure_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE" + $configure_command = "cmake --preset $PRESET $script:GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS --log-level VERBOSE" Write-Host $configure_command Invoke-Expression $configure_command $test_result = $LastExitCode diff --git a/ci/windows/build_cub.ps1 b/ci/windows/build_cub.ps1 index 32e4f71ee9a..27c5360ded9 100644 --- a/ci/windows/build_cub.ps1 +++ b/ci/windows/build_cub.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cub-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_cudax.ps1 b/ci/windows/build_cudax.ps1 index ca7bd578291..7b8cd0ff771 100644 --- a/ci/windows/build_cudax.ps1 +++ b/ci/windows/build_cudax.ps1 @@ -4,7 +4,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(20)] - [int]$CXX_STANDARD = 20 + [int]$CXX_STANDARD = 20, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $CURRENT_PATH = Split-Path $pwd -leaf @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { } Remove-Module -Name build_common -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cudax-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_libcudacxx.ps1 b/ci/windows/build_libcudacxx.ps1 index a57e2280de7..2f80619f76b 100644 --- a/ci/windows/build_libcudacxx.ps1 +++ b/ci/windows/build_libcudacxx.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." } -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $GPU_ARCHS +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "libcudacxx-cpp${CXX_STANDARD}" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_thrust.ps1 b/ci/windows/build_thrust.ps1 index 186ed94eace..bda86859fd4 100644 --- a/ci/windows/build_thrust.ps1 +++ b/ci/windows/build_thrust.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/test_thrust.ps1 b/ci/windows/test_thrust.ps1 index 7c020714208..eabda06df5b 100644 --- a/ci/windows/test_thrust.ps1 +++ b/ci/windows/test_thrust.ps1 @@ -5,6 +5,10 @@ Param( [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17, [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0, + [Parameter(Mandatory = $false)] [Alias("cpu-only")] [switch]$CPU_ONLY = $false ) @@ -24,11 +28,11 @@ If($CURRENT_PATH -ne "ci") { } # Execute the build script: -$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD" +$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD -arch $CUDA_ARCH" Write-Host "Executing: $build_command" Invoke-Expression $build_command -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpu-cpp$CXX_STANDARD" diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index c86d24754de..80b382c5a46 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -370,6 +370,15 @@ foreach (test_src IN LISTS test_srcs) set(launcher 0) endif() + # FIXME: There are a few remaining device algorithm tests that have not been ported to + # use Catch2 and lid variants. Mark these as `lid_0/1` so they'll run in the appropriate + # CI configs: + string(REGEX MATCH "^device_" is_device_test "${test_name}") + _cub_is_fail_test(is_fail_test "${test_name}") + if (is_device_test AND NOT is_fail_test) + string(APPEND test_name ".lid_${launcher}") + endif() + # Only one version of this test.
cub_add_test(test_target ${test_name} "${test_src}" ${cub_target} ${launcher}) cub_configure_cuda_target(${test_target} RDC ${CUB_FORCE_RDC}) From 3a594e30260dd3cf235b4274590f48dfcdfdbdb7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 16:33:47 +0100 Subject: [PATCH 5/8] Specialize `cuda::std::numeric_limits` for FP8 types (#3478) (#3492) Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> --- libcudacxx/include/cuda/std/limits | 192 ++++++++++++++++++ .../limits/numeric.limits.members/common.h | 27 +++ .../const_data_members.pass.cpp | 4 + .../denorm_min.pass.cpp | 4 + .../numeric.limits.members/digits.pass.cpp | 5 + .../numeric.limits.members/digits10.pass.cpp | 67 +++--- .../numeric.limits.members/epsilon.pass.cpp | 4 + .../has_denorm.pass.cpp | 4 + .../has_denorm_loss.pass.cpp | 4 + .../has_infinity.pass.cpp | 4 + .../has_quiet_NaN.pass.cpp | 4 + .../has_signaling_NaN.pass.cpp | 4 + .../numeric.limits.members/infinity.pass.cpp | 8 + .../is_bounded.pass.cpp | 4 + .../numeric.limits.members/is_exact.pass.cpp | 4 + .../numeric.limits.members/is_iec559.pass.cpp | 4 + .../is_integer.pass.cpp | 4 + .../numeric.limits.members/is_modulo.pass.cpp | 4 + .../numeric.limits.members/is_signed.pass.cpp | 4 + .../numeric.limits.members/lowest.pass.cpp | 4 + .../numeric.limits.members/max.pass.cpp | 4 + .../max_digits10.pass.cpp | 66 +++--- .../max_exponent.pass.cpp | 49 +++-- .../max_exponent10.pass.cpp | 49 +++-- .../numeric.limits.members/min.pass.cpp | 4 + .../min_exponent.pass.cpp | 49 +++-- .../min_exponent10.pass.cpp | 49 +++-- .../numeric.limits.members/quiet_NaN.pass.cpp | 64 ++++-- .../numeric.limits.members/radix.pass.cpp | 4 + .../round_error.pass.cpp | 4 + .../round_style.pass.cpp | 4 + .../signaling_NaN.pass.cpp | 64 ++++-- .../tinyness_before.pass.cpp | 4 + .../numeric.limits.members/traps.pass.cpp | 4 + 34 files changed, 627 insertions(+), 150 deletions(-) diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index 5e4df32270e..2bd0191af43 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -29,6 +29,21 @@ #include #include +#if defined(_LIBCUDACXX_HAS_NVFP16) +# include +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _LIBCUDACXX_HAS_NVBF16 + +#if _CCCL_HAS_NVFP8() +# include +#endif // _CCCL_HAS_NVFP8() + _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -744,6 +759,183 @@ public: }; #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() +# if defined(_CCCL_BUILTIN_BIT_CAST) || _CCCL_STD_VER >= 2014 +# define _LIBCUDACXX_CONSTEXPR_FP8_LIMITS constexpr +# else // ^^^ _CCCL_BUILTIN_BIT_CAST || _CCCL_STD_VER >= 2014 ^^^ // vvv !_CCCL_BUILTIN_BIT_CAST && _CCCL_STD_VER < + // 2014 vvv +# define _LIBCUDACXX_CONSTEXPR_FP8_LIMITS +# endif // ^^^ !_CCCL_BUILTIN_BIT_CAST && _CCCL_STD_VER < 2014 ^^^ + +template <> +class __numeric_limits_impl<__nv_fp8_e4m3, __numeric_limits_type::__floating_point> +{ + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS __nv_fp8_e4m3 __make_value(__nv_fp8_storage_t __val) + { +# if defined(_CCCL_BUILTIN_BIT_CAST) + return _CUDA_VSTD::bit_cast<__nv_fp8_e4m3>(__val); +# else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ // vvv !_CCCL_BUILTIN_BIT_CAST vvv + __nv_fp8_e4m3 __ret{}; + __ret.__x = __val; + return __ret; +# endif // ^^^ !_CCCL_BUILTIN_BIT_CAST ^^^ + } + +public: + using type = __nv_fp8_e4m3; + + 
static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 3; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 2; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x08u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type max() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7eu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type lowest() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0xfeu)); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type epsilon() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x20u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type round_error() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x30u)); + } + + static constexpr int min_exponent = -6; + static constexpr int min_exponent10 = -2; + static constexpr int max_exponent = 8; + static constexpr int max_exponent10 = 2; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type infinity() noexcept + { + return type{}; + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type quiet_NaN() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7fu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type signaling_NaN() noexcept + { + return type{}; + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type denorm_min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x01u)); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +template <> +class __numeric_limits_impl<__nv_fp8_e5m2, __numeric_limits_type::__floating_point> +{ + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS __nv_fp8_e5m2 __make_value(__nv_fp8_storage_t __val) + { +# if defined(_CCCL_BUILTIN_BIT_CAST) + return _CUDA_VSTD::bit_cast<__nv_fp8_e5m2>(__val); +# else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ // vvv !_CCCL_BUILTIN_BIT_CAST vvv + __nv_fp8_e5m2 __ret{}; + __ret.__x = __val; + return __ret; +# endif // ^^^ !_CCCL_BUILTIN_BIT_CAST ^^^ + } + +public: + using type = __nv_fp8_e5m2; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 2; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 2; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x04u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type max() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7bu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static 
_LIBCUDACXX_CONSTEXPR_FP8_LIMITS type lowest() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0xfbu)); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type epsilon() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x34u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type round_error() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x38u)); + } + + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type infinity() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7cu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type quiet_NaN() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7eu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type signaling_NaN() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7du)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type denorm_min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x01u)); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _CCCL_HAS_NVFP8() + template class numeric_limits : public __numeric_limits_impl<_Tp> {}; diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h index 15b48836839..8400071611c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h @@ -11,6 +11,7 @@ #define NUMERIC_LIMITS_MEMBERS_COMMON_H // Disable all the extended floating point operations and conversions +#define __CUDA_NO_FP8_CONVERSIONS__ 1 #define __CUDA_NO_HALF_CONVERSIONS__ 1 #define __CUDA_NO_HALF_OPERATORS__ 1 #define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 @@ -24,6 +25,32 @@ __host__ __device__ bool float_eq(T x, T y) return x == y; } +#if _CCCL_HAS_NVFP8() +__host__ __device__ inline __nv_fp8_e4m3 make_fp8_e4m3(double x, __nv_saturation_t sat = __NV_NOSAT) +{ + __nv_fp8_e4m3 res; + res.__x = __nv_cvt_double_to_fp8(x, sat, __NV_E4M3); + return res; +} + +__host__ __device__ inline __nv_fp8_e5m2 make_fp8_e5m2(double x, __nv_saturation_t sat = __NV_NOSAT) +{ + __nv_fp8_e5m2 res; + res.__x = __nv_cvt_double_to_fp8(x, sat, __NV_E5M2); + return res; +} + +__host__ __device__ inline bool float_eq(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) +{ + return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E4M3)}); +} + 
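
As a cross-check on the storage bytes used by the specializations above: e4m3 packs 1 sign, 4 exponent and 3 mantissa bits with bias 7, and e5m2 packs 1 sign, 5 exponent and 2 mantissa bits with bias 15, so every constant follows from the layout. A minimal host-side sketch, assuming plain C++ with no CUDA headers; decode_fp8 is a helper invented for this illustration and ignores the NaN/infinity encodings:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Decode a finite FP8 bit pattern with E exponent bits and M mantissa bits
    // (bias = 2^(E-1) - 1). Subnormals (exponent field 0) have no implicit 1.
    double decode_fp8(std::uint8_t bits, int E, int M)
    {
      const int bias     = (1 << (E - 1)) - 1;
      const int sign     = bits >> (E + M);
      const int exponent = (bits >> M) & ((1 << E) - 1);
      const int mantissa = bits & ((1 << M) - 1);
      const double value = (exponent == 0)
                           ? std::ldexp(mantissa, 1 - bias - M)                    // subnormal
                           : std::ldexp((1 << M) + mantissa, exponent - bias - M); // normal
      return sign ? -value : value;
    }

    int main()
    {
      assert(decode_fp8(0x08, 4, 3) == 0.015625);           // e4m3 min()  = 2^-6
      assert(decode_fp8(0x7e, 4, 3) == 448.0);              // e4m3 max()
      assert(decode_fp8(0x20, 4, 3) == 0.125);              // e4m3 epsilon() = 2^-3
      assert(decode_fp8(0x04, 5, 2) == 0.00006103515625);   // e5m2 min()  = 2^-14
      assert(decode_fp8(0x7b, 5, 2) == 57344.0);            // e5m2 max()
      assert(decode_fp8(0x01, 5, 2) == 0.0000152587890625); // e5m2 denorm_min() = 2^-16
      return 0;
    }

The same layout pins down the derived members: 3 (resp. 2) stored mantissa bits give digits = 4 for e4m3 and digits = 3 for e5m2 (epsilon() = 2^(1 - digits) matches the 0x20/0x34 patterns), and min()/max() force min_exponent/max_exponent of -5/9 for e4m3 and -13/16 for e5m2 under the usual FLT_MIN_EXP/FLT_MAX_EXP convention; e5m2 matches __half, which shares the 5-bit exponent. The decimal counterparts can be checked the same way; below, the test suite computes digits10/max_digits10 expectations with integer arithmetic, sketched here with function names invented for this note:

    // 30103/100000 approximates log10(2) = 0.30103 closely enough for any
    // realistic digits count, and keeps the computation valid in C++11 constexpr.
    constexpr int fp_digits10(int digits) // floor((digits - 1) * log10(2))
    {
      return static_cast<int>(((digits - 1) * 30103L) / 100000L);
    }
    constexpr int fp_max_digits10(int digits) // ceil(digits * log10(2)) + 1
    {
      return static_cast<int>((digits * 30103L + 99999L) / 100000L) + 1;
    }
    static_assert(fp_digits10(24) == 6 && fp_max_digits10(24) == 9, "float");
    static_assert(fp_digits10(4) == 0 && fp_max_digits10(4) == 3, "__nv_fp8_e4m3");
    static_assert(fp_digits10(3) == 0 && fp_max_digits10(3) == 2, "__nv_fp8_e5m2");
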
+__host__ __device__ inline bool float_eq(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) +{ + return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E5M2)}); +} +#endif // _CCCL_HAS_NVFP8() + #if defined(_LIBCUDACXX_HAS_NVFP16) __host__ __device__ inline bool float_eq(__half x, __half y) { diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp index 769080cff83..b095d63afcd 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp @@ -116,6 +116,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test_type<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test_type<__nv_fp8_e4m3>(); + test_type<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp index cc64ed14686..475f41a3388 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp @@ -72,6 +72,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.001953125)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.0000152587890625)); +#endif // _CCCL_HAS_NVFP8() #if !defined(__FLT_DENORM_MIN__) && !defined(FLT_TRUE_MIN) # error Test has no expected values for floating point types #endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index efce1ccf678..0d3c910b672 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -61,5 +61,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, 8>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, 4>(); + test<__nv_fp8_e5m2, 3>(); +#endif // _CCCL_HAS_NVFP8() + return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index 32990ece4b1..bd66aeecfeb 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -12,10 +12,25 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int 
make_expected_digits10() +{ + // digits * log10(2) + return static_cast((cuda::std::numeric_limits::digits * 30103l) / 100000l); +} + +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_digits10() +{ + // (digits - 1) * log10(2) + return static_cast(((cuda::std::numeric_limits::digits - 1) * 30103l) / 100000l); +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::digits10 == expected, "digits10 test 1"); @@ -30,41 +45,45 @@ __host__ __device__ void test() int main(int, char**) { - test(); - test(); - test(); - test(); - test(); // 4 -> 9 and 2 -> 4 + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 38>(); - test<__uint128_t, 38>(); + test<__int128_t>(); + test<__uint128_t>(); #endif - test(); - test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - test(); + test(); #endif #if defined(_LIBCUDACXX_HAS_NVFP16) - test<__half, 3>(); + test<__half>(); #endif // _LIBCUDACXX_HAS_NVFP16 #if defined(_LIBCUDACXX_HAS_NVBF16) - test<__nv_bfloat16, 2>(); + test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp index fa42c5e8fe6..15366bdf308 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp @@ -63,6 +63,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.0078125)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.125)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.25)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp index 5a0a05ab73b..8fa506b93ce 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, cuda::std::denorm_present>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, cuda::std::denorm_present>(); + test<__nv_fp8_e5m2, cuda::std::denorm_present>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp index 
450e51b8111..3b7722acd8b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp index 646f5e20160..ebddcb4421e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp index 626b4110695..908f2d7fa4a 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, true>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp index 20cd04d107e..62d81c8a524 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp index 34527e300c5..627105a4a8c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp @@ -70,6 +70,10 @@ int main(int, char**) # if defined(_LIBCUDACXX_HAS_NVBF16) 
test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0)); # endif // _LIBCUDACXX_HAS_NVBF16 +# if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); + test<__nv_fp8_e5m2>(make_fp8_e5m2(1.0 / 0.0)); +# endif // _CCCL_HAS_NVFP8() // MSVC has issues with producing INF with divisions by zero. #else test(INFINITY); @@ -83,6 +87,10 @@ int main(int, char**) # if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(INFINITY)); # endif // _LIBCUDACXX_HAS_NVBF16 +# if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); + test<__nv_fp8_e5m2>(make_fp8_e5m2(INFINITY)); +# endif // _CCCL_HAS_NVFP8() #endif return 0; diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp index 9e671c5d905..eeb9740e4e2 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, true>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp index cfc9a6cab90..c3c2e027c72 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp index 945347ff4b5..7bab40e8826 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp index 65dd98fdb04..68e7437f1e0 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if 
defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp index 6d82269e1c8..992be2b18b7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp index eb39869bf24..be7e4f235a7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, true>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp index e3b832dfd9b..6a8b2a9c181 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp @@ -72,6 +72,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(-448.0)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(-57344.0)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index 7ba6dabb1d2..a1582e41b22 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -71,6 +71,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(448.0)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(57344.0)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp index 92b3d13ea61..d01a4aa099c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp @@ -12,10 +12,24 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_digits10() +{ + return 0; +} + +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_digits10() +{ + // std::ceil(std::numeric_limits::digits * std::log10(2) + 1) + return static_cast((cuda::std::numeric_limits::digits * 30103l + 99999l) / 100000l) + 1; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::max_digits10 == expected, "max_digits10 test 1"); @@ -26,41 +40,45 @@ __host__ __device__ void test() int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif - test(); - test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - test(); + test(); #endif #if defined(_LIBCUDACXX_HAS_NVFP16) - test<__half, 5>(); + test<__half>(); #endif // _LIBCUDACXX_HAS_NVFP16 #if defined(_LIBCUDACXX_HAS_NVBF16) - test<__nv_bfloat16, 4>(); + test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp index 81d5ae07795..3027e9f06f5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_exponent() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::max_exponent == expected, "max_exponent test 1"); @@ -26,29 +33,29 @@ __host__ __device__ void test() int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + 
test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, 128>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, 9>(); + test<__nv_fp8_e5m2, 16>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp index 4c426b37460..5924aee173d 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_exponent10() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::max_exponent10 == expected, "max_exponent10 test 1"); @@ -26,29 +33,29 @@ int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, 38>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, 2>(); + test<__nv_fp8_e5m2, 4>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index c24c3fde869..15f470909df 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -72,6 +72,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.015625)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.000061035)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp index e3150f8dc8e..b63d653a7c3 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp 
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_min_exponent() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::min_exponent == expected, "min_exponent test 1"); @@ -26,29 +33,29 @@ int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, -125>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, -5>(); + test<__nv_fp8_e5m2, -13>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp index cbca8e04171..a6ff20e7fde 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_min_exponent10() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::min_exponent10 == expected, "min_exponent10 test 1"); @@ -26,29 +33,29 @@ int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, -37>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, -1>(); + test<__nv_fp8_e5m2, -4>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp index 74e7f427941..2d6d9582f5c 100644 --- 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp @@ -18,29 +18,63 @@ #include "test_macros.h" template -__host__ __device__ void test_imp(cuda::std::true_type) +__host__ __device__ bool is_nan(T x) { - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); + return cuda::std::isnan(x); } +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool is_nan(__nv_fp8_e4m3 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}); +} + +__host__ __device__ bool is_nan(__nv_fp8_e5m2 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}); +} +#endif // _CCCL_HAS_NVFP8() + template -__host__ __device__ void test_imp(cuda::std::false_type) +__host__ __device__ void test_impl(cuda::std::true_type) { - assert(cuda::std::numeric_limits::quiet_NaN() == T()); - assert(cuda::std::numeric_limits::quiet_NaN() == T()); - assert(cuda::std::numeric_limits::quiet_NaN() == T()); - assert(cuda::std::numeric_limits::quiet_NaN() == T()); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); } template -__host__ __device__ inline void test() +__host__ __device__ bool equal_to(T x, T y) { - constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + return x == y; +} + +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool equal_to(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) +{ + return x.__x == y.__x; +} - test_imp(cuda::std::integral_constant{}); +__host__ __device__ bool equal_to(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) +{ + return x.__x == y.__x; +} +#endif // _CCCL_HAS_NVFP8() + +template +__host__ __device__ void test_impl(cuda::std::false_type) +{ + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); +} + +template +__host__ __device__ inline void test() +{ + test_impl(cuda::std::integral_constant::has_quiet_NaN>{}); } int main(int, char**) @@ -80,6 +114,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp index 9765db6f760..7e5c87927aa 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp @@ -61,6 +61,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, FLT_RADIX>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, FLT_RADIX>(); + 
test<__nv_fp8_e5m2, FLT_RADIX>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp index ba5049fc49f..d4faf373a09 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp @@ -63,6 +63,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.5)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.5)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.5)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp index 3fb436381a7..8515581d650 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, cuda::std::round_to_nearest>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, cuda::std::round_to_nearest>(); + test<__nv_fp8_e5m2, cuda::std::round_to_nearest>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp index 69ba66038de..19ace1b3d2c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp @@ -18,29 +18,63 @@ #include "test_macros.h" template -__host__ __device__ void test_imp(cuda::std::true_type) +__host__ __device__ bool is_nan(T x) { - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); + return cuda::std::isnan(x); } +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool is_nan(__nv_fp8_e4m3 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}); +} + +__host__ __device__ bool is_nan(__nv_fp8_e5m2 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}); +} +#endif // _CCCL_HAS_NVFP8() + template -__host__ __device__ void test_imp(cuda::std::false_type) +__host__ __device__ void test_impl(cuda::std::true_type) { - assert(cuda::std::numeric_limits::signaling_NaN() == T()); - assert(cuda::std::numeric_limits::signaling_NaN() == T()); - assert(cuda::std::numeric_limits::signaling_NaN() == T()); - assert(cuda::std::numeric_limits::signaling_NaN() == T()); + assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); + assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); 
+ assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); + assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); } template -__host__ __device__ inline void test() +__host__ __device__ bool equal_to(T x, T y) { - constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + return x == y; +} + +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool equal_to(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) +{ + return x.__x == y.__x; +} - test_imp(cuda::std::integral_constant{}); +__host__ __device__ bool equal_to(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) +{ + return x.__x == y.__x; +} +#endif // _CCCL_HAS_NVFP8() + +template +__host__ __device__ void test_impl(cuda::std::false_type) +{ + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); +} + +template +__host__ __device__ inline void test() +{ + test_impl(cuda::std::integral_constant::has_signaling_NaN>{}); } int main(int, char**) @@ -80,6 +114,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index 70cde2711a1..38dec8c872b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index 7dd7eee68cc..55d7eb990db 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -66,6 +66,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } From 5511c7e0e464be00dba5b56c612874576e3b2090 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 18:14:23 +0100 Subject: [PATCH 6/8] Backport to 2.8: Deprecate thrust universal iterator categories (#3461) (#3471) --- .../iterator/detail/universal_categories.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/thrust/thrust/iterator/detail/universal_categories.h b/thrust/thrust/iterator/detail/universal_categories.h index ea30f076beb..ee620f977a8 100644 --- a/thrust/thrust/iterator/detail/universal_categories.h +++ 
b/thrust/thrust/iterator/detail/universal_categories.h @@ -27,13 +27,13 @@ #endif // no system header #include -// XXX eliminate this file - +_CCCL_SUPPRESS_DEPRECATED_PUSH THRUST_NAMESPACE_BEGIN // define these types without inheritance to avoid ambiguous conversion to base classes -struct input_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED input_universal_iterator_tag { operator input_host_iterator_tag() { @@ -46,7 +46,8 @@ struct input_universal_iterator_tag } }; -struct output_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED output_universal_iterator_tag { operator output_host_iterator_tag() { @@ -59,7 +60,8 @@ struct output_universal_iterator_tag } }; -struct forward_universal_iterator_tag : input_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED forward_universal_iterator_tag : input_universal_iterator_tag { operator forward_host_iterator_tag() { @@ -72,7 +74,8 @@ struct forward_universal_iterator_tag : input_universal_iterator_tag }; }; -struct bidirectional_universal_iterator_tag : forward_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED bidirectional_universal_iterator_tag : forward_universal_iterator_tag { operator bidirectional_host_iterator_tag() { @@ -95,7 +98,8 @@ struct one_degree_of_separation : T } // namespace detail -struct random_access_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED random_access_universal_iterator_tag { // these conversions are all P0 operator random_access_host_iterator_tag() @@ -115,4 +119,5 @@ struct random_access_universal_iterator_tag } }; +_CCCL_SUPPRESS_DEPRECATED_POP THRUST_NAMESPACE_END From 3f8c8d5d3391856d0b27d5e0ef33d59eb167c322 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 19:40:33 +0100 Subject: [PATCH 7/8] Deprecate and replace thrust::cuda_cub iterators (#3422) (#3510) --- thrust/thrust/system/cuda/detail/count.h | 3 ++- thrust/thrust/system/cuda/detail/extrema.h | 12 ++++++---- thrust/thrust/system/cuda/detail/find.h | 8 ++++--- .../thrust/system/cuda/detail/inner_product.h | 15 +++++------- thrust/thrust/system/cuda/detail/mismatch.h | 20 +++++++--------- .../system/cuda/detail/transform_scan.h | 7 +++--- thrust/thrust/system/cuda/detail/util.h | 24 ++++++++++++------- 7 files changed, 48 insertions(+), 41 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/count.h b/thrust/thrust/system/cuda/detail/count.h index cb9b7017902..530682ba717 100644 --- a/thrust/thrust/system/cuda/detail/count.h +++ b/thrust/thrust/system/cuda/detail/count.h @@ -40,6 +40,7 @@ # include # include +# include # include # include @@ -52,7 +53,7 @@ typename iterator_traits::difference_type _CCCL_HOST_DEVICE count_if(execution_policy& policy, InputIt first, InputIt last, UnaryPred unary_pred) { using size_type = typename iterator_traits::difference_type; - using flag_iterator_t = transform_input_iterator_t; + using flag_iterator_t = transform_iterator; return cuda_cub::reduce_n( policy, flag_iterator_t(first, unary_pred), thrust::distance(first, last), size_type(0), plus()); diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h index f10c3578173..617eb8bbc79 100644 --- a/thrust/thrust/system/cuda/detail/extrema.h +++ b/thrust/thrust/system/cuda/detail/extrema.h @@ -45,6 +45,8 @@ # include # include # include +# include +# include # include # include # include @@ -370,10 +372,10 @@ element(execution_policy& policy, ItemsIt first, ItemsIt last, BinaryPr 
IndexType num_items = static_cast(thrust::distance(first, last)); - using iterator_tuple = tuple>; + using iterator_tuple = tuple>; using zip_iterator = zip_iterator; - iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); + iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator(0)); using arg_min_t = ArgFunctor; using T = tuple; @@ -443,15 +445,15 @@ minmax_element(execution_policy& policy, ItemsIt first, ItemsIt last, B const auto num_items = static_cast(thrust::distance(first, last)); - using iterator_tuple = tuple>; + using iterator_tuple = tuple>; using zip_iterator = zip_iterator; - iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); + iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator(0)); using arg_minmax_t = __extrema::arg_minmax_f; using two_pairs_type = typename arg_minmax_t::two_pairs_type; using duplicate_t = typename arg_minmax_t::duplicate_tuple; - using transform_t = transform_input_iterator_t; + using transform_t = transform_iterator; zip_iterator begin = make_zip_iterator(iter_tuple); two_pairs_type result = __extrema::extrema( diff --git a/thrust/thrust/system/cuda/detail/find.h b/thrust/thrust/system/cuda/detail/find.h index de633c73ebb..4b4e11c8ed9 100644 --- a/thrust/thrust/system/cuda/detail/find.h +++ b/thrust/thrust/system/cuda/detail/find.h @@ -41,6 +41,8 @@ # include # include +# include +# include # include THRUST_NAMESPACE_BEGIN @@ -116,11 +118,11 @@ find_if_n(execution_policy& policy, InputIt first, Size num_items, Pred const Size interval_size = (thrust::min)(interval_threshold, num_items); // force transform_iterator output to bool - using XfrmIterator = transform_input_iterator_t; - using IteratorTuple = thrust::tuple>; + using XfrmIterator = transform_iterator; + using IteratorTuple = thrust::tuple>; using ZipIterator = thrust::zip_iterator; - IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator_t(0)); + IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator(0)); ZipIterator begin = thrust::make_zip_iterator(iter_tuple); ZipIterator end = begin + num_items; diff --git a/thrust/thrust/system/cuda/detail/inner_product.h b/thrust/thrust/system/cuda/detail/inner_product.h index af41c5ccda8..1c36d5e256f 100644 --- a/thrust/thrust/system/cuda/detail/inner_product.h +++ b/thrust/thrust/system/cuda/detail/inner_product.h @@ -39,15 +39,15 @@ #if _CCCL_HAS_CUDA_COMPILER # include # include +# include +# include # include - -# include +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub { - template T _CCCL_HOST_DEVICE inner_product( execution_policy& policy, @@ -58,11 +58,9 @@ T _CCCL_HOST_DEVICE inner_product( ReduceOp reduce_op, ProductOp product_op) { - using size_type = typename iterator_traits::difference_type; - size_type num_items = static_cast(thrust::distance(first1, last1)); - using binop_iterator_t = transform_pair_of_input_iterators_t; - - return cuda_cub::reduce_n(policy, binop_iterator_t(first1, first2, product_op), num_items, init, reduce_op); + const auto n = thrust::distance(first1, last1); + const auto first = make_transform_iterator(make_zip_iterator(first1, first2), make_zip_function(product_op)); + return cuda_cub::reduce_n(policy, first, n, init, reduce_op); } template @@ -71,7 +69,6 @@ inner_product(execution_policy& policy, InputIt1 first1, InputIt1 last1 { return cuda_cub::inner_product(policy, first1, last1, first2, init, plus(), multiplies()); } - } // namespace 
cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/mismatch.h b/thrust/thrust/system/cuda/detail/mismatch.h index fc7a878b7d5..9987799faca 100644 --- a/thrust/thrust/system/cuda/detail/mismatch.h +++ b/thrust/thrust/system/cuda/detail/mismatch.h @@ -40,15 +40,14 @@ # include # include +# include # include # include - -# include +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub { - template pair _CCCL_HOST_DEVICE mismatch(execution_policy& policy, InputIt1 first1, InputIt1 last1, InputIt2 first2, BinaryPred binary_pred); @@ -69,15 +68,12 @@ template pair _CCCL_HOST_DEVICE mismatch(execution_policy& policy, InputIt1 first1, InputIt1 last1, InputIt2 first2, BinaryPred binary_pred) { - using transform_t = transform_pair_of_input_iterators_t; - - transform_t transform_first = transform_t(first1, first2, binary_pred); - - transform_t result = cuda_cub::find_if_not( - policy, transform_first, transform_first + thrust::distance(first1, last1), ::cuda::std::__identity{}); - - return thrust::make_pair(first1 + thrust::distance(transform_first, result), - first2 + thrust::distance(transform_first, result)); + const auto n = thrust::distance(first1, last1); + const auto first = make_zip_iterator(first1, first2); + const auto last = make_zip_iterator(last1, first2 + n); + const auto mismatch_pos = cuda_cub::find_if_not(policy, first, last, make_zip_function(binary_pred)); + const auto dist = thrust::distance(first, mismatch_pos); + return thrust::make_pair(first1 + dist, first2 + dist); } template diff --git a/thrust/thrust/system/cuda/detail/transform_scan.h b/thrust/thrust/system/cuda/detail/transform_scan.h index ed94edc7d47..2227249ab16 100644 --- a/thrust/thrust/system/cuda/detail/transform_scan.h +++ b/thrust/thrust/system/cuda/detail/transform_scan.h @@ -39,6 +39,7 @@ #if _CCCL_HAS_CUDA_COMPILER # include # include +# include # include # include @@ -66,7 +67,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan( using size_type = typename iterator_traits::difference_type; size_type num_items = static_cast(thrust::distance(first, last)); - using transformed_iterator_t = transform_input_iterator_t; + using transformed_iterator_t = transform_iterator; return cuda_cub::inclusive_scan_n(policy, transformed_iterator_t(first, transform_op), num_items, result, scan_op); } @@ -87,7 +88,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan( using size_type = typename iterator_traits::difference_type; size_type num_items = static_cast(thrust::distance(first, last)); - using transformed_iterator_t = transform_input_iterator_t; + using transformed_iterator_t = transform_iterator; return cuda_cub::inclusive_scan_n( policy, transformed_iterator_t(first, transform_op), num_items, result, init, scan_op); @@ -108,7 +109,7 @@ OutputIt _CCCL_HOST_DEVICE transform_exclusive_scan( using size_type = typename iterator_traits::difference_type; size_type num_items = static_cast(thrust::distance(first, last)); - using transformed_iterator_t = transform_input_iterator_t; + using transformed_iterator_t = transform_iterator; return cuda_cub::exclusive_scan_n( policy, transformed_iterator_t(first, transform_op), num_items, result, init, scan_op); diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h index 4cdde4508d0..27f0e1af288 100644 --- a/thrust/thrust/system/cuda/detail/util.h +++ b/thrust/thrust/system/cuda/detail/util.h @@ -249,12 +249,13 @@ _CCCL_HOST_DEVICE inline void throw_on_error(cudaError_t status, char const* msg } } -// FIXME: Move the 
iterators elsewhere. - +// deprecated [Since 2.8] template -struct transform_input_iterator_t +struct CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator") transform_input_iterator_t { - using self_t = transform_input_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_PUSH + using self_t = transform_input_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_POP using difference_type = typename iterator_traits::difference_type; using value_type = ValueType; using pointer = void; @@ -358,10 +359,14 @@ struct transform_input_iterator_t } }; // struct transform_input_iterarot_t +// deprecated [Since 2.8] template -struct transform_pair_of_input_iterators_t +struct CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator of a thrust::zip_iterator") + transform_pair_of_input_iterators_t { - using self_t = transform_pair_of_input_iterators_t; + _CCCL_SUPPRESS_DEPRECATED_PUSH + using self_t = transform_pair_of_input_iterators_t; + _CCCL_SUPPRESS_DEPRECATED_POP using difference_type = typename iterator_traits::difference_type; using value_type = ValueType; using pointer = void; @@ -488,10 +493,13 @@ struct CCCL_DEPRECATED_BECAUSE("Use cuda::std::identity") identity } }; +// deprecated [Since 2.8] template -struct counting_iterator_t +struct CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator") counting_iterator_t { - using self_t = counting_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_PUSH + using self_t = counting_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_POP using difference_type = T; using value_type = T; using pointer = void; From d19f3426eafbc742e047528681f07583e17255b7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 21:26:05 +0100 Subject: [PATCH 8/8] Deprecate thrust macros from type_deduction.h (#3501) (#3511) --- thrust/thrust/detail/type_deduction.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/thrust/thrust/detail/type_deduction.h b/thrust/thrust/detail/type_deduction.h index a1d41de9676..717edc366ee 100644 --- a/thrust/thrust/detail/type_deduction.h +++ b/thrust/thrust/detail/type_deduction.h @@ -30,16 +30,18 @@ /// \def THRUST_MVCAP(x) /// \brief Capture `x` into a lambda by moving. -/// +/// deprecated [Since 2.8] #define THRUST_MVCAP(x) x = ::std::move(x) /// \def THRUST_RETOF(invocable, ...) /// \brief Expands to the type returned by invoking an instance of the invocable /// type \a invocable with parameters of type \c __VA_ARGS__. Must /// be called with 1 or fewer parameters to the invocable. -/// -#define THRUST_RETOF(...) THRUST_PP_DISPATCH(THRUST_RETOF, __VA_ARGS__) -#define THRUST_RETOF1(C) decltype(::std::declval()()) +/// deprecated [Since 2.8] +#define THRUST_RETOF(...) THRUST_PP_DISPATCH(THRUST_RETOF, __VA_ARGS__) +/// deprecated [Since 2.8] +#define THRUST_RETOF1(C) decltype(::std::declval()()) +/// deprecated [Since 2.8] #define THRUST_RETOF2(C, V) decltype(::std::declval()(::std::declval())) /// \def THRUST_RETURNS(...) @@ -88,6 +90,7 @@ } \ /**/ #else +/// deprecated [Since 2.8] # define THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...) \ noexcept(noexcept(__VA_ARGS__))->typename std::enable_if::type \ { \
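
For code migrating off the thrust::cuda_cub detail iterators deprecated above, the public Thrust iterators compose the same way the updated count.h, inner_product.h and mismatch.h now do. A minimal sketch, assuming nvcc and a CCCL 2.8-era Thrust; the square functor and the input vectors are invented for the example:

    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/zip_function.h>

    struct square
    {
      __host__ __device__ float operator()(float x) const
      {
        return x * x;
      }
    };

    int main()
    {
      thrust::device_vector<float> a(100, 2.0f);
      thrust::device_vector<float> b(100, 3.0f);

      // Was: cuda_cub::transform_input_iterator_t -> thrust::transform_iterator
      auto squared = thrust::make_transform_iterator(a.begin(), square{});
      const float sum_sq = thrust::reduce(squared, squared + a.size()); // 100 * 4 = 400

      // Was: cuda_cub::transform_pair_of_input_iterators_t
      //   -> thrust::transform_iterator over a thrust::zip_iterator
      auto products = thrust::make_transform_iterator(
        thrust::make_zip_iterator(a.begin(), b.begin()), thrust::make_zip_function(thrust::multiplies<float>{}));
      const float dot = thrust::reduce(products, products + a.size()); // 100 * 6 = 600

      // Was: cuda_cub::counting_iterator_t -> thrust::counting_iterator
      thrust::counting_iterator<int> indices(0); // indices[i] == i

      return (sum_sq == 400.0f && dot == 600.0f && indices[42] == 42) ? 0 : 1;
    }

The transform_iterator-over-zip_iterator shape with make_zip_function is the same rewrite patch 7 applies to inner_product and mismatch, so call sites keep their semantics while the deprecated helpers in util.h age out.
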