From 0c17dbd005a934ffe2f83cf0b73a6a9aa5383852 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:28:19 -0800 Subject: [PATCH 01/15] Deprecate and replace `CUB_IS_INT128_ENABLED` (#3427) Co-authored-by: Bernhard Manfred Gruber --- cub/cub/detail/fast_modulo_division.cuh | 6 +++--- .../device/dispatch/dispatch_histogram.cuh | 12 +++++------ .../tuning/tuning_run_length_encode.cuh | 8 ++++---- .../device/dispatch/tuning/tuning_scan.cuh | 4 ++-- .../dispatch/tuning/tuning_scan_by_key.cuh | 20 +++++++++---------- .../dispatch/tuning/tuning_select_if.cuh | 16 +++++++-------- cub/cub/util_ptx.cuh | 2 +- cub/cub/util_type.cuh | 13 ++---------- .../catch2_test_device_for_each_in_extents.cu | 4 ++-- cub/test/catch2_test_printing.cu | 2 +- cub/test/internal/catch2_test_fast_div_mod.cu | 2 +- cub/test/test_util.h | 2 +- 12 files changed, 41 insertions(+), 50 deletions(-) diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index 4a5f2048e32..09068d87be0 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -38,7 +38,7 @@ #endif // no system header #include // implicit_prom_t -#include // CUB_IS_INT128_ENABLED +#include // _CCCL_HAS_INT128() #include // cuda::std::ceil_div #include // std::has_single_bit @@ -79,7 +79,7 @@ struct larger_unsigned_type using type = ::cuda::std::uint64_t; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct larger_unsigned_type::type> @@ -87,7 +87,7 @@ struct larger_unsigned_type using type = __uint128_t; }; -#endif // CUB_IS_INT128_ENABLED +#endif // _CCCL_HAS_INT128() template using larger_unsigned_type_t = typename larger_unsigned_type::type; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 2ac4e160220..2c2d0a2a9ca 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ 
b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -646,27 +646,27 @@ public: using IntArithmeticT = ::cuda::std::_If< // sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), // uint32_t, // -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::cuda::std::_If< // (::cuda::std::is_same::value || // ::cuda::std::is_same::value), // CommonT, // uint64_t> // -#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv +#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv uint64_t -#endif // !CUB_IS_INT128_ENABLED +#endif // !_CCCL_HAS_INT128() >; // Alias template that excludes __[u]int128 from the integral types template using is_integral_excl_int128 = -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::cuda::std::_If<::cuda::std::is_same::value&& ::cuda::std::is_same::value, ::cuda::std::false_type, ::cuda::std::is_integral>; -#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv +#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv ::cuda::std::is_integral; -#endif // !CUB_IS_INT128_ENABLED +#endif // !_CCCL_HAS_INT128() union ScaleT { diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index d938209dcf2..12f07f3f366 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -156,7 +156,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -216,7 +216,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -349,7 +349,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -414,7 +414,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh 
b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 7b076507341..165a17cae52 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -175,7 +175,7 @@ struct sm80_tuning struct sm80_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> { @@ -221,7 +221,7 @@ template struct sm90_tuning struct sm90_tuning : sm90_tuning_vals {}; template <> struct sm90_tuning : sm90_tuning_vals {}; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning_vals<__int128_t, 576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index f8e29201eea..2bc31ef6697 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -172,7 +172,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -229,7 +229,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -286,7 +286,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -343,7 +343,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -400,7 +400,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -465,7 +465,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -522,7 +522,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -579,7 +579,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if 
_CCCL_HAS_INT128() template struct sm90_tuning { @@ -636,7 +636,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -693,7 +693,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 10d22286068..c1b74b4ae09 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -121,7 +121,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -174,7 +174,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -227,7 +227,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -280,7 +280,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -336,7 +336,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -389,7 +389,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -442,7 +442,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> 
struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -495,7 +495,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 99beeed313e..e6bb45c4a31 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -99,7 +99,7 @@ BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type return (source >> bit_start) & MASK; } -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() /** * Bitfield-extract for 128-bit types. */ diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 4d1db99a821..a89cd159309 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -76,17 +76,8 @@ _CCCL_DIAG_POP CUB_NAMESPACE_BEGIN #ifndef CUB_IS_INT128_ENABLED -# if defined(__CUDACC_RTC__) -# if defined(__CUDACC_RTC_INT128__) -# define CUB_IS_INT128_ENABLED 1 -# endif // !defined(__CUDACC_RTC_INT128__) -# else // !defined(__CUDACC_RTC__) -# if _CCCL_CUDACC_AT_LEAST(11, 5) -# if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) -# define CUB_IS_INT128_ENABLED 1 -# endif // GCC || CLANG || NVHPC -# endif // _CCCL_CUDACC_AT_LEAST(11, 5) -# endif // !defined(__CUDACC_RTC__) +// Deprecated [Since 2.8] +# define CUB_IS_INT128_ENABLED _CCCL_HAS_INT128() #endif // !defined(CUB_IS_INT128_ENABLED) /****************************************************************************** diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 3e5a6c6689a..313b9e58b38 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -107,7 +107,7 @@ using index_types = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, 
uint64_t @@ -120,7 +120,7 @@ using index_types_dynamic = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t diff --git a/cub/test/catch2_test_printing.cu b/cub/test/catch2_test_printing.cu index 6f93515114a..63b622f3554 100644 --- a/cub/test/catch2_test_printing.cu +++ b/cub/test/catch2_test_printing.cu @@ -11,7 +11,7 @@ std::string print(T val) return ss.str(); } -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() TEST_CASE("Test utils can print __int128", "[test][utils]") { REQUIRE(print(__int128_t{0}) == "0"); diff --git a/cub/test/internal/catch2_test_fast_div_mod.cu b/cub/test/internal/catch2_test_fast_div_mod.cu index 8a1a3e96a27..ec3b5e20d68 100644 --- a/cub/test/internal/catch2_test_fast_div_mod.cu +++ b/cub/test/internal/catch2_test_fast_div_mod.cu @@ -42,7 +42,7 @@ using index_types = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 031298120dc..9a5fefcc69c 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -717,7 +717,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< return os; } -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() inline std::ostream& operator<<(std::ostream& os, __uint128_t val) { constexpr int max_digits = 40; From c02e845e7f40dc748777638ce70e9893560e473c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 30 Jan 2025 07:39:35 +0100 Subject: [PATCH 02/15] Adds support for large num items to `DeviceMerge` (#3530) * adds support for large num items * re-enable vsmem tests * rephrases test description --- cub/cub/device/device_merge.cuh | 18 ++-- cub/test/catch2_test_device_merge.cu | 129 +++++---------------------- 2 files changed, 33 insertions(+), 114 deletions(-) diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh index 7135546a0e6..814bad75248 100644 
--- a/cub/cub/device/device_merge.cuh +++ b/cub/cub/device/device_merge.cuh @@ -76,16 +76,19 @@ struct DeviceMerge void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, - int num_keys1, + ::cuda::std::int64_t num_keys1, KeyIteratorIn2 keys_in2, - int num_keys2, + ::cuda::std::int64_t num_keys2, KeyIteratorOut keys_out, CompareOp compare_op = {}, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + + using offset_t = ::cuda::std::int64_t; + return detail::merge:: - dispatch_t:: + dispatch_t:: dispatch( d_temp_storage, temp_storage_bytes, @@ -161,16 +164,19 @@ struct DeviceMerge std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, ValueIteratorIn1 values_in1, - int num_pairs1, + ::cuda::std::int64_t num_pairs1, KeyIteratorIn2 keys_in2, ValueIteratorIn2 values_in2, - int num_pairs2, + ::cuda::std::int64_t num_pairs2, KeyIteratorOut keys_out, ValueIteratorOut values_out, CompareOp compare_op = {}, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + + using offset_t = ::cuda::std::int64_t; + return detail::merge::dispatch_t< KeyIteratorIn1, ValueIteratorIn1, @@ -178,7 +184,7 @@ struct DeviceMerge ValueIteratorIn2, KeyIteratorOut, ValueIteratorOut, - int, + offset_t, CompareOp>::dispatch(d_temp_storage, temp_storage_bytes, keys_in1, diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu index ae0d3f84baa..4835f597710 100644 --- a/cub/test/catch2_test_device_merge.cu +++ b/cub/test/catch2_test_device_merge.cu @@ -20,103 +20,8 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys); -// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different -// offset types -namespace detail -{ -template > -CUB_RUNTIME_FUNCTION static cudaError_t 
merge_keys_custom_offset_type( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - Offset num_keys1, - KeyIteratorIn2 keys_in2, - Offset num_keys2, - KeyIteratorOut keys_out, - CompareOp compare_op = {}, - cudaStream_t stream = 0) -{ - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); - return cub::detail::merge::dispatch_t< - KeyIteratorIn1, - cub::NullType*, - KeyIteratorIn2, - cub::NullType*, - KeyIteratorOut, - cub::NullType*, - Offset, - CompareOp>::dispatch(d_temp_storage, - temp_storage_bytes, - keys_in1, - nullptr, - num_keys1, - keys_in2, - nullptr, - num_keys2, - keys_out, - nullptr, - compare_op, - stream); -} - -template > -CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - ValueIteratorIn1 values_in1, - Offset num_pairs1, - KeyIteratorIn2 keys_in2, - ValueIteratorIn2 values_in2, - Offset num_pairs2, - KeyIteratorOut keys_out, - ValueIteratorOut values_out, - CompareOp compare_op = {}, - cudaStream_t stream = 0) -{ - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); - return cub::detail::merge::dispatch_t< - KeyIteratorIn1, - ValueIteratorIn1, - KeyIteratorIn2, - ValueIteratorIn2, - KeyIteratorOut, - ValueIteratorOut, - Offset, - CompareOp>::dispatch(d_temp_storage, - temp_storage_bytes, - keys_in1, - values_in1, - num_pairs1, - keys_in2, - values_in2, - num_pairs2, - keys_out, - values_out, - compare_op, - stream); -} -} // namespace detail - -DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type); -DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type); - using types = c2h::type_list; -// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's safe some compile time -using offset_types = c2h::type_list; - template , @@ -223,11 +128,27 @@ 
C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_ }); } -C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types) +C2H_TEST("DeviceMerge::MergeKeys works for large number of items", "[merge][device]") + +try +{ + using key_t = char; + using offset_t = int64_t; + + // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items + const auto num_items_int_max = static_cast(::cuda::std::numeric_limits::max()); + + // Generate the input sizes to test for + const offset_t num_items_lhs = + GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max - 1, offset_t{3}})); + const offset_t num_items_rhs = + GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max, offset_t{3}})); + + test_keys(num_items_lhs, num_items_rhs, ::cuda::std::less<>{}); +} +catch (const std::bad_alloc&) { - using key_t = int; - using offset_t = c2h::get<0, TestType>; - test_keys(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type); + // allocation failure is not a test failure, so we can run tests on smaller GPUs } C2H_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]") @@ -385,14 +306,6 @@ C2H_TEST("DeviceMerge::MergePairs value types", "[merge][device]", types) test_pairs(); } -C2H_TEST("DeviceMerge::MergePairs offset types", "[merge][device]", offset_types) -{ - using key_t = int; - using value_t = int; - using offset_t = c2h::get<0, TestType>; - test_pairs(3623, 6346, ::cuda::std::less<>{}, merge_pairs_custom_offset_type); -} - C2H_TEST("DeviceMerge::MergePairs input sizes", "[merge][device]") { using key_t = int; @@ -410,7 +323,7 @@ try using key_t = char; using value_t = char; const auto size = std::int64_t{1} << GENERATE(30, 31, 32, 33); - test_pairs(size, size, ::cuda::std::less<>{}, merge_pairs_custom_offset_type); + test_pairs(size, size, ::cuda::std::less<>{}); } catch (const std::bad_alloc&) { From a654bc6e0fec3937ddd597dc44adaec61a40701f Mon Sep 17 
00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 08:33:22 +0100 Subject: [PATCH 03/15] Support FP16 traits on CTK 12.0 (#3535) * Support FP16 traits on CTK 12.0 * Only enable constexpr limits when supported * Support float_eq on CTK < 12.2 --- .../is_extended_floating_point.h | 16 ++--- libcudacxx/include/cuda/std/limits | 58 ++++++++++++------- .../meta.unary.cat/is_floating_point.pass.cpp | 8 +-- .../limits/is_specialized.pass.cpp | 8 +-- .../limits/numeric.limits.members/common.h | 25 ++++++-- .../const_data_members.pass.cpp | 8 +-- .../denorm_min.pass.cpp | 8 +-- .../numeric.limits.members/digits.pass.cpp | 8 +-- .../numeric.limits.members/digits10.pass.cpp | 8 +-- .../numeric.limits.members/epsilon.pass.cpp | 8 +-- .../has_denorm.pass.cpp | 8 +-- .../has_denorm_loss.pass.cpp | 8 +-- .../has_infinity.pass.cpp | 8 +-- .../has_quiet_NaN.pass.cpp | 8 +-- .../has_signaling_NaN.pass.cpp | 8 +-- .../numeric.limits.members/infinity.pass.cpp | 16 ++--- .../is_bounded.pass.cpp | 8 +-- .../numeric.limits.members/is_exact.pass.cpp | 8 +-- .../numeric.limits.members/is_iec559.pass.cpp | 8 +-- .../is_integer.pass.cpp | 8 +-- .../numeric.limits.members/is_modulo.pass.cpp | 8 +-- .../numeric.limits.members/is_signed.pass.cpp | 8 +-- .../numeric.limits.members/lowest.pass.cpp | 8 +-- .../numeric.limits.members/max.pass.cpp | 8 +-- .../max_digits10.pass.cpp | 8 +-- .../max_exponent.pass.cpp | 8 +-- .../max_exponent10.pass.cpp | 8 +-- .../numeric.limits.members/min.pass.cpp | 8 +-- .../min_exponent.pass.cpp | 8 +-- .../min_exponent10.pass.cpp | 8 +-- .../numeric.limits.members/quiet_NaN.pass.cpp | 8 +-- .../numeric.limits.members/radix.pass.cpp | 8 +-- .../round_error.pass.cpp | 8 +-- .../round_style.pass.cpp | 8 +-- .../signaling_NaN.pass.cpp | 8 +-- .../tinyness_before.pass.cpp | 8 +-- .../numeric.limits.members/traps.pass.cpp | 8 +-- 37 files changed, 205 insertions(+), 174 deletions(-) diff --git 
a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h index b9700a87066..040418f5fe7 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h @@ -22,16 +22,16 @@ #include -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) # include -#endif // _LIBCUDACXX_HAS_NVFP16 +#endif // _CCCL_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#if defined(_CCCL_HAS_NVBF16) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") # include _CCCL_DIAG_POP -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() # include @@ -53,7 +53,7 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v # endif // !_CCCL_NO_INLINE_VARIABLES #endif // !_CCCL_NO_VARIABLE_TEMPLATES -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) template <> struct __is_extended_floating_point<__half> : true_type {}; @@ -62,9 +62,9 @@ struct __is_extended_floating_point<__half> : true_type template <> _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__half> = true; # endif // !_CCCL_NO_INLINE_VARIABLES -#endif // _LIBCUDACXX_HAS_NVFP16 +#endif // _CCCL_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#if defined(_CCCL_HAS_NVBF16) template <> struct __is_extended_floating_point<__nv_bfloat16> : true_type {}; @@ -73,7 +73,7 @@ struct __is_extended_floating_point<__nv_bfloat16> : true_type template <> _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__nv_bfloat16> = true; # endif // !_CCCL_NO_INLINE_VARIABLES -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() template <> diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index 9d0cbc81108..29f4bf24ec3 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -608,7 
+608,13 @@ public: #endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE }; -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) +# ifdef _LIBCUDACXX_HAS_NVFP16 +# define _LIBCUDACXX_FP16_CONSTEXPR constexpr +# else //_LIBCUDACXX_HAS_NVFP16 +# define _LIBCUDACXX_FP16_CONSTEXPR +# endif //_LIBCUDACXX_HAS_NVFP16 + template <> class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> { @@ -621,15 +627,15 @@ public: static constexpr int digits = 11; static constexpr int digits10 = 3; static constexpr int max_digits10 = 5; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type min() noexcept { return type(__half_raw{0x0400u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type max() noexcept { return type(__half_raw{0x7bffu}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type lowest() noexcept { return type(__half_raw{0xfbffu}); } @@ -637,11 +643,11 @@ public: static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type epsilon() noexcept { return type(__half_raw{0x1400u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type round_error() noexcept { return type(__half_raw{0x3800u}); } @@ -656,19 +662,19 @@ public: static constexpr bool has_signaling_NaN = true; static constexpr float_denorm_style has_denorm = denorm_present; static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type infinity() 
noexcept { return type(__half_raw{0x7c00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type quiet_NaN() noexcept { return type(__half_raw{0x7e00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type signaling_NaN() noexcept { return type(__half_raw{0x7d00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type denorm_min() noexcept { return type(__half_raw{0x0001u}); } @@ -681,9 +687,16 @@ public: static constexpr bool tinyness_before = false; static constexpr float_round_style round_style = round_to_nearest; }; -#endif // _LIBCUDACXX_HAS_NVFP16 +# undef _LIBCUDACXX_FP16_CONSTEXPR +#endif // _CCCL_HAS_NVFP16 + +#if defined(_CCCL_HAS_NVBF16) +# ifdef _LIBCUDACXX_HAS_NVBF16 +# define _LIBCUDACXX_BF16_CONSTEXPR constexpr +# else //_LIBCUDACXX_HAS_NVBF16 +# define _LIBCUDACXX_BF16_CONSTEXPR +# endif //_LIBCUDACXX_HAS_NVBF16 -#if defined(_LIBCUDACXX_HAS_NVBF16) template <> class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> { @@ -696,15 +709,15 @@ public: static constexpr int digits = 8; static constexpr int digits10 = 2; static constexpr int max_digits10 = 4; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type min() noexcept { return type(__nv_bfloat16_raw{0x0080u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type max() noexcept { return type(__nv_bfloat16_raw{0x7f7fu}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type lowest() noexcept { return type(__nv_bfloat16_raw{0xff7fu}); } @@ -712,11 +725,11 @@ public: static 
constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type epsilon() noexcept { return type(__nv_bfloat16_raw{0x3c00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type round_error() noexcept { return type(__nv_bfloat16_raw{0x3f00u}); } @@ -731,19 +744,19 @@ public: static constexpr bool has_signaling_NaN = true; static constexpr float_denorm_style has_denorm = denorm_present; static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type infinity() noexcept { return type(__nv_bfloat16_raw{0x7f80u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type quiet_NaN() noexcept { return type(__nv_bfloat16_raw{0x7fc0u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type signaling_NaN() noexcept { return type(__nv_bfloat16_raw{0x7fa0u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type denorm_min() noexcept { return type(__nv_bfloat16_raw{0x0001u}); } @@ -756,7 +769,8 @@ public: static constexpr bool tinyness_before = false; static constexpr float_round_style round_style = round_to_nearest; }; -#endif // _LIBCUDACXX_HAS_NVBF16 +# undef _LIBCUDACXX_BF16_CONSTEXPR +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() # if defined(_CCCL_BUILTIN_BIT_CAST) || _CCCL_STD_VER >= 2014 diff --git a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp 
b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp index b0b7a3f3b69..5a04070c598 100644 --- a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp @@ -80,12 +80,12 @@ int main(int, char**) test_is_floating_point(); test_is_floating_point(); test_is_floating_point(); -#ifdef _LIBCUDACXX_HAS_NVFP16 +#ifdef _CCCL_HAS_NVFP16 test_is_floating_point<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#ifdef _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVFP16 +#ifdef _CCCL_HAS_NVBF16 test_is_floating_point<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test_is_floating_point<__nv_fp8_e4m3>(); test_is_floating_point<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp index 7113c0e2772..adb30091033 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp @@ -68,12 +68,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 static_assert(!cuda::std::numeric_limits>::is_specialized, "!cuda::std::numeric_limits >::is_specialized"); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h index 8400071611c..7d15f2ba6b6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h @@ -17,6 +17,7 @@ #define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 #define __CUDA_NO_BFLOAT16_OPERATORS__ 1 +#include #include template @@ -42,27 +43,43 @@ __host__ __device__ inline __nv_fp8_e5m2 make_fp8_e5m2(double x, __nv_saturation __host__ __device__ inline bool float_eq(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E4M3)}); +# else + return ::cuda::std::bit_cast(x) == ::cuda::std::bit_cast(y); +# endif } __host__ __device__ inline bool float_eq(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E5M2)}); +# else + return ::cuda::std::bit_cast(x) == ::cuda::std::bit_cast(y); +# endif } #endif // _CCCL_HAS_NVFP8 -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) __host__ __device__ inline bool float_eq(__half x, __half y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return __heq(x, y); +# else + return __half2float(x) == __half2float(y); +# endif } -#endif // _LIBCUDACXX_HAS_NVFP16 +#endif // _CCCL_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#if defined(_CCCL_HAS_NVBF16) __host__ __device__ inline bool float_eq(__nv_bfloat16 x, __nv_bfloat16 y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return __heq(x, y); +# else + return __bfloat162float(x) == __bfloat162float(y); +# endif } -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #endif // NUMERIC_LIMITS_MEMBERS_COMMON_H diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp index b095d63afcd..093b5d331be 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp @@ -110,12 +110,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test_type(); #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test_type<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test_type<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test_type<__nv_fp8_e4m3>(); test_type<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp index 475f41a3388..9ea232eaad6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp @@ -66,12 +66,12 @@ int main(int, char**) test(LDBL_TRUE_MIN); # endif #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(5.9604644775390625e-08)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41)); -#endif // 
_LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.001953125)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.0000152587890625)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index 0d3c910b672..01f6b05543b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -55,12 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, 11>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 8>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 3>(); test<__nv_fp8_e5m2, 2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index bd66aeecfeb..24c53725738 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -74,12 +74,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); 
-#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp index 15366bdf308..bb65847df33 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp @@ -57,12 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_EPSILON); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(0.0009765625)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.0078125)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.125)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.25)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp index 8fa506b93ce..8d9881580bf 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, cuda::std::denorm_present>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) 
+#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, cuda::std::denorm_present>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, cuda::std::denorm_present>(); test<__nv_fp8_e5m2, cuda::std::denorm_present>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp index 3b7722acd8b..5a046a9b339 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp index ebddcb4421e..768e53d1c88 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if 
defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp index 908f2d7fa4a..4c3e11a9b05 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, true>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp index 62d81c8a524..1b80d1869e6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); 
#endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp index 627105a4a8c..8dd611556c5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp @@ -64,12 +64,12 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(1. / 0.); # endif -# if defined(_LIBCUDACXX_HAS_NVFP16) +# if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(1.0 / 0.0)); -# endif // _LIBCUDACXX_HAS_NVFP16 -# if defined(_LIBCUDACXX_HAS_NVBF16) +# endif // _CCCL_HAS_NVFP16 +# if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0)); -# endif // _LIBCUDACXX_HAS_NVBF16 +# endif // _CCCL_HAS_NVBF16 # if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); test<__nv_fp8_e5m2>(make_fp8_e5m2(1.0 / 0.0)); @@ -81,12 +81,12 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(INFINITY); # endif -# if defined(_LIBCUDACXX_HAS_NVFP16) +# if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(INFINITY)); -# endif // _LIBCUDACXX_HAS_NVFP16 -# if defined(_LIBCUDACXX_HAS_NVBF16) +# endif // _CCCL_HAS_NVFP16 +# if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(INFINITY)); -# endif // _LIBCUDACXX_HAS_NVBF16 +# endif // _CCCL_HAS_NVBF16 # if _CCCL_HAS_NVFP8() 
test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); test<__nv_fp8_e5m2>(make_fp8_e5m2(INFINITY)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp index eeb9740e4e2..e28ab8313b6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, true>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp index c3c2e027c72..e6038f1589b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // 
_CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp index 7bab40e8826..1ff809bad09 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp index 68e7437f1e0..eed9d38c050 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); 
-#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp index 992be2b18b7..fc3ca9dbb4e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp index be7e4f235a7..54005f6c0b9 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 
true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, true>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp index 6a8b2a9c181..72190bd2ad7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp @@ -66,12 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(-LDBL_MAX); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(-65504.0)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(-448.0)); test<__nv_fp8_e5m2>(make_fp8_e5m2(-57344.0)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index a1582e41b22..5039f773a2f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -65,12 +65,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MAX); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(65504.0)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if 
defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(448.0)); test<__nv_fp8_e5m2>(make_fp8_e5m2(57344.0)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp index d01a4aa099c..309279bc79c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp @@ -69,12 +69,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp index 3027e9f06f5..606e9c52b7f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if 
defined(_CCCL_HAS_NVFP16) test<__half, 16>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 128>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 8>(); test<__nv_fp8_e5m2, 15>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp index 5924aee173d..61145deec86 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, 4>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 38>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 2>(); test<__nv_fp8_e5m2, 4>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index 15f470909df..ccab08a38f5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -66,12 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MIN); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if 
defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(6.103515625e-05)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.015625)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.000061035)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp index b63d653a7c3..c942a6288be 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, -13>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, -125>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, -6>(); test<__nv_fp8_e5m2, -15>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp index a6ff20e7fde..e9b6f29d25f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp 
@@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, -4>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, -37>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, -2>(); test<__nv_fp8_e5m2, -5>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp index 2d6d9582f5c..a8b076fbeee 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp @@ -108,12 +108,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp index 7e5c87927aa..dd15c391180 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp @@ -55,12 +55,12 @@ int main(int, char**) 
#ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, FLT_RADIX>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, FLT_RADIX>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, FLT_RADIX>(); test<__nv_fp8_e5m2, FLT_RADIX>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp index d4faf373a09..95ed80eb951 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp @@ -57,12 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(0.5); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(0.5)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.5)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.5)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.5)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp index 8515581d650..1eb5c0b0f5a 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, cuda::std::round_to_nearest>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, cuda::std::round_to_nearest>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, cuda::std::round_to_nearest>(); test<__nv_fp8_e5m2, cuda::std::round_to_nearest>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp index 19ace1b3d2c..0ec70976b32 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp @@ -108,12 +108,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index 38dec8c872b..1da28874b06 100644 --- 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index 55d7eb990db..4cb627a4b77 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -60,12 +60,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); From b6209e841a72eb7def4ba2aace30eff8a9b539a4 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 30 Jan 2025 09:06:31 +0100 Subject: [PATCH 04/15] Suppress execution checks for vocabulary types (#3578) * Suppress execution checks for optional * Suppress 
execution checks for `expected` * Suppress execution checks for `pair` * Suppress execution checks for `variant` --- .../cuda/std/__expected/bad_expected_access.h | 21 +- .../include/cuda/std/__expected/expected.h | 20 ++ .../cuda/std/__expected/expected_base.h | 18 ++ .../include/cuda/std/__expected/unexpected.h | 7 + .../include/cuda/std/__memory/construct_at.h | 1 + libcudacxx/include/cuda/std/__utility/pair.h | 19 +- .../cuda/std/detail/libcxx/include/optional | 25 +++ .../cuda/std/detail/libcxx/include/tuple | 2 + .../cuda/std/detail/libcxx/include/variant | 20 ++ .../expected/device_only_types.pass.cpp | 201 ++++++++++++++++++ .../expected/host_only_types.pass.cpp | 199 +++++++++++++++++ .../optional/device_only_types.pass.cpp | 136 ++++++++++++ .../optional/host_only_types.pass.cpp | 134 ++++++++++++ .../tuple/device_only_types.pass.cpp | 81 +++++++ .../tuple/forward_as_tuple_interop.pass.cpp | 0 .../utilities/tuple/host_only_types.pass.cpp | 90 ++++++++ .../tuple/vector_types_get.pass.cpp | 0 .../vector_types_structured_bindings.pass.cpp | 0 .../tuple/vector_types_tuple_element.pass.cpp | 0 .../tuple/vector_types_tuple_size.pass.cpp | 0 .../unexpected/device_only_types.pass.cpp | 82 +++++++ .../unexpected/host_only_types.pass.cpp | 85 ++++++++ .../utility/pair/device_only_types.pass.cpp | 93 ++++++++ .../utility/pair/host_only_types.pass.cpp | 93 ++++++++ .../pair/interop}/pair.assign.pass.cpp | 0 .../utility/pair/interop}/pair.cons.pass.cpp | 0 .../utility/pair/interop}/pair.conv.pass.cpp | 0 .../variant/device_only_types.pass.cpp | 120 +++++++++++ .../variant/host_only_types.pass.cpp | 129 +++++++++++ libcudacxx/test/support/host_device_types.h | 148 +++++++++++++ 30 files changed, 1714 insertions(+), 10 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp create mode 100644 
libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/forward_as_tuple_interop.pass.cpp (100%) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_get.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_structured_bindings.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_tuple_element.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_tuple_size.pass.cpp (100%) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.assign.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.cons.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.conv.pass.cpp (100%) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp create mode 100644 libcudacxx/test/support/host_device_types.h diff --git a/libcudacxx/include/cuda/std/__expected/bad_expected_access.h b/libcudacxx/include/cuda/std/__expected/bad_expected_access.h 
index 5600402e429..0f10f546be6 100644 --- a/libcudacxx/include/cuda/std/__expected/bad_expected_access.h +++ b/libcudacxx/include/cuda/std/__expected/bad_expected_access.h @@ -51,14 +51,6 @@ class bad_expected_access; template <> class bad_expected_access : public ::std::exception { -protected: - _CCCL_HIDE_FROM_ABI bad_expected_access() noexcept = default; - _CCCL_HIDE_FROM_ABI bad_expected_access(const bad_expected_access&) = default; - _CCCL_HIDE_FROM_ABI bad_expected_access(bad_expected_access&&) = default; - _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(const bad_expected_access&) = default; - _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(bad_expected_access&&) = default; - ~bad_expected_access() noexcept override = default; - public: // The way this has been designed (by using a class template below) means that we'll already // have a profusion of these vtables in TUs, and the dynamic linker will already have a bunch @@ -74,10 +66,21 @@ template class bad_expected_access : public bad_expected_access { public: - explicit bad_expected_access(_Err __e) +# if _CCCL_CUDA_COMPILER(CLANG) // Clang needs this or it breaks with device only types + _CCCL_HOST_DEVICE +# endif // _CCCL_CUDA_COMPILER(CLANG) + _CCCL_HIDE_FROM_ABI explicit bad_expected_access(_Err __e) : __unex_(_CUDA_VSTD::move(__e)) {} +# if _CCCL_CUDA_COMPILER(CLANG) // Clang needs this or it breaks with device only types + _CCCL_HOST_DEVICE +# endif // _CCCL_CUDA_COMPILER(CLANG) + _CCCL_HIDE_FROM_ABI ~bad_expected_access() noexcept + { + __unex_.~_Err(); + } + _LIBCUDACXX_HIDE_FROM_ABI _Err& error() & noexcept { return __unex_; diff --git a/libcudacxx/include/cuda/std/__expected/expected.h b/libcudacxx/include/cuda/std/__expected/expected.h index cc5ddfc03f0..f618ff57c92 100644 --- a/libcudacxx/include/cuda/std/__expected/expected.h +++ b/libcudacxx/include/cuda/std/__expected/expected.h @@ -1070,6 +1070,7 @@ class expected : private __expected_move_assign<_Tp, _Err> } // [expected.object.eq], 
equality operators + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) { if (__x.__has_val_ != __y.has_value()) @@ -1090,12 +1091,14 @@ class expected : private __expected_move_assign<_Tp, _Err> } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) { return !(__x == __y); } # endif // _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2, class _E2) _CCCL_REQUIRES((!_CCCL_TRAIT(is_void, _T2))) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y) @@ -1118,6 +1121,7 @@ class expected : private __expected_move_assign<_Tp, _Err> } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2, class _E2) _CCCL_REQUIRES((!_CCCL_TRAIT(is_void, _T2))) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected<_T2, _E2>& __y) @@ -1126,6 +1130,7 @@ class expected : private __expected_move_assign<_Tp, _Err> } # endif // _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const _T2& __v) @@ -1133,18 +1138,21 @@ class expected : private __expected_move_assign<_Tp, _Err> return __x.__has_val_ && static_cast(__x.__union_.__val_ == __v); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const _T2& __v, const expected& __x) { return __x.__has_val_ && static_cast(__x.__union_.__val_ == __v); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& 
__x, const _T2& __v) { return !__x.__has_val_ || static_cast(__x.__union_.__val_ != __v); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const _T2& __v, const expected& __x) @@ -1153,22 +1161,26 @@ class expected : private __expected_move_assign<_Tp, _Err> } # endif // _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e) { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __e.error()); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __e, const expected& __x) { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __e.error()); } + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __e) { return __x.__has_val_ || static_cast(__x.__union_.__unex_ != __e.error()); } + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const unexpected<_E2>& __e, const expected& __x) { @@ -1906,6 +1918,7 @@ class expected : private __expected_move_assign } // [expected.void.eq], equality operators + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) noexcept { if (__x.__has_val_ != __y.has_value()) @@ -1918,12 +1931,14 @@ class expected : private __expected_move_assign } } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) noexcept { return !(__x == __y); } # endif + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) noexcept @@ -1938,6 +1953,7 @@ class 
expected : private __expected_move_assign } } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) noexcept @@ -1946,22 +1962,26 @@ class expected : private __expected_move_assign } # endif + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y) noexcept { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __y.error()); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __y, const expected& __x) noexcept { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __y.error()); } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __y) noexcept { return __x.__has_val_ || static_cast(__x.__union_.__unex_ != __y.error()); } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const unexpected<_E2>& __y, const expected& __x) noexcept { diff --git a/libcudacxx/include/cuda/std/__expected/expected_base.h b/libcudacxx/include/cuda/std/__expected/expected_base.h index 31de97e3f50..0de6cc29158 100644 --- a/libcudacxx/include/cuda/std/__expected/expected_base.h +++ b/libcudacxx/include/cuda/std/__expected/expected_base.h @@ -71,30 +71,35 @@ union __expected_union_t struct __empty_t {}; + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(is_default_constructible, _Tp2)) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept(_CCCL_TRAIT(is_nothrow_default_constructible, _Tp2)) : __val_() {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES((!_CCCL_TRAIT(is_default_constructible, _Tp2))) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept : __empty_() {} + 
_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(in_place_t, _Args&&... __args) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _Tp, _Args...)) : __val_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(unexpect_t, _Args&&... __args) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _Err, _Args...)) : __unex_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -104,6 +109,7 @@ union __expected_union_t : __val_(_CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...)) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -128,18 +134,21 @@ union __expected_union_t<_Tp, _Err, true> struct __empty_t {}; + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(is_default_constructible, _Tp2)) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept(_CCCL_TRAIT(is_nothrow_default_constructible, _Tp2)) : __val_() {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES((!_CCCL_TRAIT(is_default_constructible, _Tp2))) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept : __empty_() {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(in_place_t, _Args&&... __args) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _Tp, _Args...)) @@ -152,6 +161,7 @@ union __expected_union_t<_Tp, _Err, true> : __unex_(_CUDA_VSTD::forward<_Args>(__args)...) 
{} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -161,6 +171,7 @@ union __expected_union_t<_Tp, _Err, true> : __val_(_CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...)) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -436,6 +447,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> { _LIBCUDACXX_DELEGATE_CONSTRUCTORS(__expected_storage, __expected_destruct, _Tp, _Err); + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T1, class _T2, class... _Args) _CCCL_REQUIRES(_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void @@ -445,6 +457,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> _LIBCUDACXX_CONSTRUCT_AT(__newval, _CUDA_VSTD::forward<_Args>(__args)...); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T1, class _T2, class... _Args) _CCCL_REQUIRES( (!_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) _CCCL_AND _CCCL_TRAIT(is_nothrow_move_constructible, _T1)) @@ -456,6 +469,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> _LIBCUDACXX_CONSTRUCT_AT(__newval, _CUDA_VSTD::move(__tmp)); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T1, class _T2, class... 
_Args) _CCCL_REQUIRES( (!_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) _CCCL_AND(!_CCCL_TRAIT(is_nothrow_move_constructible, _T1))) @@ -475,6 +489,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> __trans.__complete(); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Err2 = _Err) _CCCL_REQUIRES(_CCCL_TRAIT(is_nothrow_move_constructible, _Err2)) static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void @@ -493,6 +508,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> __with_err.__has_val_ = true; } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Err2 = _Err) _CCCL_REQUIRES((!_CCCL_TRAIT(is_nothrow_move_constructible, _Err2))) static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void @@ -653,6 +669,7 @@ struct __expected_copy_assign<_Tp, _Err, __smf_availability::__available> : __ex _CCCL_HIDE_FROM_ABI __expected_copy_assign(const __expected_copy_assign&) = default; _CCCL_HIDE_FROM_ABI __expected_copy_assign(__expected_copy_assign&&) = default; + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __expected_copy_assign& operator=(const __expected_copy_assign& __other) noexcept( _CCCL_TRAIT(is_nothrow_copy_assignable, _Tp) && _CCCL_TRAIT(is_nothrow_copy_constructible, _Tp) @@ -917,6 +934,7 @@ struct __expected_storage : __expected_destruct { _LIBCUDACXX_DELEGATE_CONSTRUCTORS(__expected_storage, __expected_destruct, void, _Err); + _CCCL_EXEC_CHECK_DISABLE static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void __swap_val_unex_impl( __expected_storage& __with_val, __expected_storage& __with_err) noexcept(_CCCL_TRAIT(is_nothrow_move_constructible, _Err)) diff --git a/libcudacxx/include/cuda/std/__expected/unexpected.h b/libcudacxx/include/cuda/std/__expected/unexpected.h index 0f8f3784374..0da94402a85 100644 --- a/libcudacxx/include/cuda/std/__expected/unexpected.h +++ b/libcudacxx/include/cuda/std/__expected/unexpected.h @@ -73,6 +73,7 @@ class unexpected _CCCL_HIDE_FROM_ABI unexpected(const 
unexpected&) = default; _CCCL_HIDE_FROM_ABI unexpected(unexpected&&) = default; + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Error = _Err) _CCCL_REQUIRES((!_CCCL_TRAIT(is_same, remove_cvref_t<_Error>, unexpected) && !_CCCL_TRAIT(is_same, remove_cvref_t<_Error>, in_place_t) @@ -82,6 +83,7 @@ class unexpected : __unex_(_CUDA_VSTD::forward<_Error>(__error)) {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class... _Args) _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _Err, _Args...)) _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit unexpected(in_place_t, _Args&&... __args) noexcept( @@ -89,6 +91,7 @@ class unexpected : __unex_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Up, class... _Args) _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _Err, initializer_list<_Up>&, _Args...)) _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit unexpected( @@ -123,6 +126,7 @@ class unexpected } // [expected.un.swap] + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI constexpr void swap(unexpected& __other) noexcept(_CCCL_TRAIT(is_nothrow_swappable, _Err)) { static_assert(_CCCL_TRAIT(is_swappable, _Err), "E must be swappable"); @@ -130,6 +134,7 @@ class unexpected swap(__unex_, __other.__unex_); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Err2 = _Err) _CCCL_REQUIRES(_CCCL_TRAIT(is_swappable, _Err2)) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr void @@ -140,6 +145,7 @@ class unexpected } // [expected.un.eq] + _CCCL_EXEC_CHECK_DISABLE template _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected& __lhs, @@ -148,6 +154,7 @@ class unexpected return __lhs.error() == __rhs.error(); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const unexpected& __lhs, diff --git a/libcudacxx/include/cuda/std/__memory/construct_at.h b/libcudacxx/include/cuda/std/__memory/construct_at.h index bc231cd27d7..a78314c6479 100644 --- 
a/libcudacxx/include/cuda/std/__memory/construct_at.h +++ b/libcudacxx/include/cuda/std/__memory/construct_at.h @@ -50,6 +50,7 @@ # ifndef __cpp_lib_constexpr_dynamic_alloc namespace std { +_CCCL_EXEC_CHECK_DISABLE template ()) _Tp(_CUDA_VSTD::declval<_Args>()...))> diff --git a/libcudacxx/include/cuda/std/__utility/pair.h b/libcudacxx/include/cuda/std/__utility/pair.h index e725cf4b001..e8678f58767 100644 --- a/libcudacxx/include/cuda/std/__utility/pair.h +++ b/libcudacxx/include/cuda/std/__utility/pair.h @@ -124,6 +124,7 @@ struct __pair_base _T1 first; _T2 second; + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__explicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __pair_base() noexcept( @@ -132,6 +133,7 @@ struct __pair_base , second() {} + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__implicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base() noexcept( @@ -140,6 +142,7 @@ struct __pair_base , second() {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base(_U1&& __t1, _U2&& __t2) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _T1, _U1) && _CCCL_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -163,6 +166,7 @@ struct __pair_base<_T1, _T2, true> _T1 first; _T2 second; + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__explicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __pair_base() noexcept( @@ -171,6 +175,7 @@ struct __pair_base<_T1, _T2, true> , second() {} + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__implicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base() noexcept( @@ -179,10 +184,13 @@ struct __pair_base<_T1, _T2, true> , second() {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_HIDE_FROM_ABI constexpr __pair_base(const __pair_base&) = default; - _CCCL_HIDE_FROM_ABI constexpr __pair_base(__pair_base&&) = default; + 
_CCCL_EXEC_CHECK_DISABLE + _CCCL_HIDE_FROM_ABI constexpr __pair_base(__pair_base&&) = default; // We need to ensure that a reference type, which would inhibit the implicit copy assignment still works + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __pair_base& operator=( conditional_t<_CCCL_TRAIT(is_copy_assignable, _T1) && _CCCL_TRAIT(is_copy_assignable, _T2), __pair_base, __nat> const& __p) noexcept(_CCCL_TRAIT(is_nothrow_copy_assignable, _T1) && _CCCL_TRAIT(is_nothrow_copy_assignable, _T2)) @@ -193,6 +201,7 @@ struct __pair_base<_T1, _T2, true> } // We need to ensure that a reference type, which would inhibit the implicit move assignment still works + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __pair_base& operator=( conditional_t<_CCCL_TRAIT(is_move_assignable, _T1) && _CCCL_TRAIT(is_move_assignable, _T2), __pair_base, __nat>&& __p) noexcept(_CCCL_TRAIT(is_nothrow_move_assignable, _T1) && _CCCL_TRAIT(is_nothrow_move_assignable, _T2)) @@ -202,6 +211,7 @@ struct __pair_base<_T1, _T2, true> return *this; } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base(_U1&& __t1, _U2&& __t2) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _T1, _U1) && _CCCL_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -532,6 +542,7 @@ _CCCL_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>; // [pairs.spec], specialized algorithms +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { @@ -540,6 +551,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const pair<_T1, #ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr common_comparison_category_t<__synth_three_way_result<_T1>, __synth_three_way_result<_T2>> @@ -554,30 +566,35 @@ operator<=>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) #else // 
_LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return !(__x == __y); } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return __x.first < __y.first || (!(__y.first < __x.first) && __x.second < __y.second); } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return __y < __x; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return !(__x < __y); } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/optional b/libcudacxx/include/cuda/std/detail/libcxx/include/optional index 04f056c91d3..d61ce254f4d 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/optional +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/optional @@ -296,12 +296,14 @@ struct __optional_destruct_base<_Tp, false> , __engaged_(false) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args) : __val_(_CUDA_VSTD::forward<_Args>(__args)...) , __engaged_(true) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __optional_destruct_base( __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args) @@ -338,12 +340,14 @@ struct __optional_destruct_base<_Tp, true> , __engaged_(false) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __optional_destruct_base(in_place_t, _Args&&... 
__args) : __val_(_CUDA_VSTD::forward<_Args>(__args)...) , __engaged_(true) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __optional_destruct_base( __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args) @@ -389,6 +393,7 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> return _CUDA_VSTD::move(this->__val_); } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void __construct(_Args&&... __args) { @@ -410,6 +415,7 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> } } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr void __assign_from(_That&& __opt) { @@ -811,6 +817,7 @@ public: return this->__get(); } + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI constexpr void swap(optional& __opt) noexcept( _CCCL_TRAIT(is_nothrow_move_constructible, value_type) && _CCCL_TRAIT(is_nothrow_swappable, value_type)) { @@ -1088,6 +1095,7 @@ _CCCL_HOST_DEVICE optional(_Tp) -> optional<_Tp>; # endif // _CCCL_NO_DEDUCTION_GUIDES // Comparisons between optionals +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() == declval()), bool), @@ -1105,6 +1113,7 @@ operator==(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x == *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() != declval()), bool), @@ -1122,6 +1131,7 @@ operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x != *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() < declval()), bool), @@ -1139,6 +1149,7 @@ operator<(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x < *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() > 
declval()), bool), @@ -1156,6 +1167,7 @@ operator>(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x > *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() <= declval()), bool), @@ -1173,6 +1185,7 @@ operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x <= *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() >= declval()), bool), @@ -1264,6 +1277,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator>=(nullopt_t, const optional<_T } // Comparisons with T +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() == declval()), bool), @@ -1273,6 +1287,7 @@ operator==(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x == __v : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() == declval()), bool), @@ -1282,6 +1297,7 @@ operator==(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v == *__x : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() != declval()), bool), @@ -1291,6 +1307,7 @@ operator!=(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x != __v : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() != declval()), bool), @@ -1300,6 +1317,7 @@ operator!=(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? 
__v != *__x : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() < declval()), bool), @@ -1309,6 +1327,7 @@ operator<(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x < __v : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() < declval()), bool), @@ -1318,6 +1337,7 @@ operator<(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v < *__x : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() <= declval()), bool), @@ -1327,6 +1347,7 @@ operator<=(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x <= __v : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() <= declval()), bool), @@ -1336,6 +1357,7 @@ operator<=(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v <= *__x : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() > declval()), bool), @@ -1345,6 +1367,7 @@ operator>(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x > __v : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() > declval()), bool), @@ -1354,6 +1377,7 @@ operator>(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v > *__x : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() >= declval()), bool), @@ -1363,6 +1387,7 @@ operator>=(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? 
*__x >= __v : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() >= declval()), bool), diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple index aa2fdeaa368..6ff1039e61b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple @@ -1124,6 +1124,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple<_Tp&&...> forward_as_tuple template struct __tuple_equal { + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator()(const _Tp& __x, const _Up& __y) { @@ -1157,6 +1158,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const tuple<_Tp. template struct __tuple_less { + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator()(const _Tp& __x, const _Up& __y) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/variant b/libcudacxx/include/cuda/std/detail/libcxx/include/variant index 0f6ec9d29fc..af1f7ba85ad 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/variant +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/variant @@ -255,6 +255,7 @@ C++20 #include #include #include +#include #include #include #include @@ -744,10 +745,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __alt { using __value_type = _Tp; + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __alt(in_place_t, _Args&&... __args) : __value(_CUDA_VSTD::forward<_Args>(__args)...) 
{} + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt(const __alt&) = default; + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt(__alt&&) = default; + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt& operator=(const __alt&) = default; + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt& operator=(__alt&&) = default; + + _CCCL_EXEC_CHECK_DISABLE + ~__alt() = default; __value_type __value; }; @@ -906,6 +919,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT __dtor<__traits<_Types...>, _Trait::_Availab { struct __visitor { + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI void operator()(_Alt& __alt) const noexcept { @@ -1148,6 +1162,7 @@ public: } protected: + _CCCL_EXEC_CHECK_DISABLE template < size_t _Ip, class _Tp, @@ -1166,6 +1181,7 @@ protected: } } + _CCCL_EXEC_CHECK_DISABLE template < size_t _Ip, class _Tp, @@ -1896,7 +1912,11 @@ private: return __op(_CUDA_VSTD::get<0>(__lhs), _CUDA_VSTD::get<0>(__rhs)); } // We already checked that every variant has a value, so we should never reach this line +# if _CCCL_COMPILER(MSVC) // MSVC needs this to be wrapped in a function or it will error + _CUDA_VSTD::unreachable(); +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv _CCCL_UNREACHABLE(); +# endif // !_CCCL_COMPILER(MSVC) } }; diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp new file mode 100644 index 00000000000..ba972e02d3a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// We cannot suppress execution checks in cuda::std::construct_at +// XFAIL: c++20 && !nvrtc && nvcc && !msvc +// UNSUPPORTED: clang-14 + +#include +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using expected = cuda::std::expected; + { // default construction + expected default_constructed{}; + assert(default_constructed.has_value()); + assert(*default_constructed == 0); + } + + { // in_place zero initialization + expected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + expected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.has_value()); + assert(*init_list_initialization == 42); + } + + { // unexpect zero initialization + expected in_place_zero_initialization{cuda::std::unexpect}; + assert(!in_place_zero_initialization.has_value()); + assert(in_place_zero_initialization.error() == 0); + } + + { // unexpect initialization + expected in_place_initialization{cuda::std::unexpect, 42}; + assert(!in_place_initialization.has_value()); + assert(in_place_initialization.error() == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::unexpect, cuda::std::initializer_list{}, 42}; + assert(!init_list_initialization.has_value()); + assert(init_list_initialization.error() == 42); + } + + { // value initialization + expected value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + expected input{42}; + 
expected dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + expected input{42}; + expected dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + expected input{42}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + expected input{42}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + expected input{}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, empty to empty + expected input{}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, error to value + expected input{cuda::std::unexpect, 42}; + expected dest{1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // assignment, value to error + expected input{42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, error to error + expected input{cuda::std::unexpect, 42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // comparison with expected with value + expected lhs{42}; + expected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with expected with error + expected lhs{cuda::std::unexpect, 42}; + expected rhs{cuda::std::unexpect, 1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with type and value + expected expect{42}; + assert(expect == device_only_type{42}); + assert(device_only_type{42} == expect); + assert(expect != device_only_type{1337}); + assert(device_only_type{1337} != expect); + } + + { // comparison with type and error + expected 
expect{cuda::std::unexpect, 42}; + assert(expect == cuda::std::unexpected{42}); + assert(cuda::std::unexpected{42} == expect); + assert(expect != cuda::std::unexpected{1337}); + assert(cuda::std::unexpected{1337} != expect); + } + + { // swap + expected lhs{42}; + expected rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } + + { // swap cross error + expected lhs{42}; + expected rhs{cuda::std::unexpect, 1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp new file mode 100644 index 00000000000..282288b7be8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp @@ -0,0 +1,199 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using expected = cuda::std::expected; + { // default construction + expected default_constructed{}; + assert(default_constructed.has_value()); + assert(*default_constructed == 0); + } + + { // in_place zero initialization + expected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + expected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.has_value()); + assert(*init_list_initialization == 42); + } + + { // unexpect zero initialization + expected in_place_zero_initialization{cuda::std::unexpect}; + assert(!in_place_zero_initialization.has_value()); + assert(in_place_zero_initialization.error() == 0); + } + + { // unexpect initialization + expected in_place_initialization{cuda::std::unexpect, 42}; + assert(!in_place_initialization.has_value()); + assert(in_place_initialization.error() == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::unexpect, cuda::std::initializer_list{}, 42}; + assert(!init_list_initialization.has_value()); + assert(init_list_initialization.error() == 42); + } + + { // value initialization + expected value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + expected input{42}; + expected dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + expected input{42}; + 
expected dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + expected input{42}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + expected input{42}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + expected input{}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, empty to empty + expected input{}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, error to value + expected input{cuda::std::unexpect, 42}; + expected dest{1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // assignment, value to error + expected input{42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, error to error + expected input{cuda::std::unexpect, 42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // comparison with expected with value + expected lhs{42}; + expected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with expected with error + expected lhs{cuda::std::unexpect, 42}; + expected rhs{cuda::std::unexpect, 1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with type and value + expected expect{42}; + assert(expect == host_only_type{42}); + assert(host_only_type{42} == expect); + assert(expect != host_only_type{1337}); + assert(host_only_type{1337} != expect); + } + + { // comparison with type and error + expected expect{cuda::std::unexpect, 42}; + assert(expect == cuda::std::unexpected{42}); + assert(cuda::std::unexpected{42} == expect); + assert(expect != 
cuda::std::unexpected{1337}); + assert(cuda::std::unexpected{1337} != expect); + } + + { // swap + expected lhs{42}; + expected rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } + + { // swap cross error + expected lhs{42}; + expected rhs{cuda::std::unexpect, 1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp new file mode 100644 index 00000000000..766b6ae821c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// We cannot suppress execution checks in cuda::std::construct_at +// XFAIL: c++20 && !nvrtc && nvcc && !msvc +// UNSUPPORTED: clang-14 + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using optional = cuda::std::optional; + { // default construction + optional default_constructed{}; + assert(!default_constructed.has_value()); + } + + { // in_place zero initialization + optional in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + optional in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // value initialization + optional value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + optional input{42}; + optional dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + optional input{42}; + optional dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + optional input{42}; + optional dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + optional input{42}; + optional dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + optional input{}; + optional dest{1337}; + dest = input; + assert(!dest.has_value()); + } + + { // assignment, empty to empty + optional input{}; + optional dest{}; + dest = input; + assert(!dest.has_value()); + } + + { // comparison with optional + optional lhs{42}; + optional rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + 
assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // comparison with type + optional opt{42}; + assert(opt == device_only_type{42}); + assert(device_only_type{42} == opt); + assert(opt != device_only_type{1337}); + assert(device_only_type{1337} != opt); + + assert(opt < device_only_type{1337}); + assert(device_only_type{7} < opt); + assert(opt <= device_only_type{1337}); + assert(device_only_type{7} <= opt); + + assert(opt > device_only_type{7}); + assert(device_only_type{1337} > opt); + assert(opt >= device_only_type{7}); + assert(device_only_type{1337} >= opt); + } + + { // swap + optional lhs{42}; + optional rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp new file mode 100644 index 00000000000..3bf26d0fb2e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using optional = cuda::std::optional; + { // default construction + optional default_constructed{}; + assert(!default_constructed.has_value()); + } + + { // in_place zero initialization + optional in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + optional in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // value initialization + optional value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + optional input{42}; + optional dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + optional input{42}; + optional dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + optional input{42}; + optional dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + optional input{42}; + optional dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + optional input{}; + optional dest{1337}; + dest = input; + assert(!dest.has_value()); + } + + { // assignment, empty to empty + optional input{}; + optional dest{}; + dest = input; + assert(!dest.has_value()); + } + + { // comparison with optional + optional lhs{42}; + optional rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // comparison with type + optional opt{42}; + 
assert(opt == host_only_type{42}); + assert(host_only_type{42} == opt); + assert(opt != host_only_type{1337}); + assert(host_only_type{1337} != opt); + + assert(opt < host_only_type{1337}); + assert(host_only_type{7} < opt); + assert(opt <= host_only_type{1337}); + assert(host_only_type{7} <= opt); + + assert(opt > host_only_type{7}); + assert(host_only_type{1337} > opt); + assert(opt >= host_only_type{7}); + assert(host_only_type{1337} >= opt); + } + + { // swap + optional lhs{42}; + optional rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp new file mode 100644 index 00000000000..d8820409d10 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using tuple = cuda::std::tuple; + { // default construction + tuple default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + tuple value_initialization{device_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + tuple value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // copy construction + tuple input{42}; + tuple dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + tuple input{42}; + tuple dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + tuple input{42}; + tuple dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // comparison with tuple + tuple lhs{42}; + tuple rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + tuple lhs{42}; + tuple rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/forward_as_tuple_interop.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/forward_as_tuple_interop.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/forward_as_tuple_interop.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/forward_as_tuple_interop.pass.cpp diff --git 
a/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp new file mode 100644 index 00000000000..4942d051b1c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using tuple = cuda::std::tuple; + { // default construction + tuple default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + tuple value_initialization{host_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + tuple value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // copy construction + tuple input{42}; + tuple dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + tuple input{42}; + tuple dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + tuple input{42}; + tuple dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to empty + tuple input{42}; + tuple dest{}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // comparison with tuple + tuple lhs{42}; + tuple rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > 
rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + tuple lhs{42}; + tuple rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_get.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_get.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_get.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_get.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_structured_bindings.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_structured_bindings.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_structured_bindings.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_structured_bindings.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_element.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_element.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_element.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_element.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_size.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_size.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_size.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_size.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp 
b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp new file mode 100644 index 00000000000..f36e86c2c3f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using unexpected = cuda::std::unexpected; + { // in_place zero initialization + unexpected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.error() == 0); + } + + { // in_place initialization + unexpected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.error() == 42); + } + + { // value initialization + unexpected value_initialization{42}; + assert(value_initialization.error() == 42); + } + + { // initializer_list initialization + unexpected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.error() == 42); + } + + { // copy construction + unexpected input{42}; + unexpected dest{input}; + assert(dest.error() == 42); + } + + { // move construction + unexpected input{42}; + unexpected dest{cuda::std::move(input)}; + assert(dest.error() == 42); + } + + { // assignment + unexpected input{42}; + unexpected dest{1337}; + dest = input; + assert(dest.error() == 42); + } + + { // comparison with unexpected + unexpected lhs{42}; + unexpected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // swap + unexpected 
lhs{42}; + unexpected rhs{1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(rhs.error() == 42); + + swap(lhs, rhs); + assert(lhs.error() == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp new file mode 100644 index 00000000000..ca12494418c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using unexpected = cuda::std::unexpected; + { // in_place zero initialization + unexpected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.error() == 0); + } + + { // in_place initialization + unexpected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.error() == 42); + } + + { // value initialization + unexpected value_initialization{42}; + assert(value_initialization.error() == 42); + } + + { // initializer_list initialization + unexpected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.error() == 42); + } + + { // copy construction + unexpected input{42}; + unexpected dest{input}; + assert(dest.error() == 42); + } + + { // 
move construction + unexpected input{42}; + unexpected dest{cuda::std::move(input)}; + assert(dest.error() == 42); + } + + { // assignment + unexpected input{42}; + unexpected dest{1337}; + dest = input; + assert(dest.error() == 42); + } + + { // comparison with unexpected + unexpected lhs{42}; + unexpected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // swap + unexpected lhs{42}; + unexpected rhs{1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(rhs.error() == 42); + + swap(lhs, rhs); + assert(lhs.error() == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp new file mode 100644 index 00000000000..aebdd6e12ea --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using pair = cuda::std::pair; + { // default construction + pair default_constructed{}; + assert(default_constructed.first == 0); + assert(default_constructed.second == 0); + } + + { // value initialization + pair value_initialization{device_only_type{42}, device_only_type{1337}}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // value initialization + pair value_initialization{42, 1337}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // copy construction + pair input{42, 1337}; + pair dest{input}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // move construction + pair input{42, 1337}; + pair dest{cuda::std::move(input)}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // assignment, value to value + pair input{42, 1337}; + pair dest{1337, 42}; + dest = input; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // comparison with pair + pair lhs{42, 1337}; + pair rhs{1337, 42}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + pair lhs{42, 1337}; + pair rhs{1337, 42}; + lhs.swap(rhs); + assert(lhs.first == 1337); + assert(lhs.second == 42); + assert(rhs.first == 42); + assert(rhs.second == 1337); + + swap(lhs, rhs); + assert(lhs.first == 42); + assert(lhs.second == 1337); + assert(rhs.first == 1337); + assert(rhs.second == 42); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp 
b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp new file mode 100644 index 00000000000..cf1195f204d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using pair = cuda::std::pair; + { // default construction + pair default_constructed{}; + assert(default_constructed.first == 0); + assert(default_constructed.second == 0); + } + + { // value initialization + pair value_initialization{host_only_type{42}, host_only_type{1337}}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // value initialization + pair value_initialization{42, 1337}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // copy construction + pair input{42, 1337}; + pair dest{input}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // move construction + pair input{42, 1337}; + pair dest{cuda::std::move(input)}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // assignment, value to value + pair input{42, 1337}; + pair dest{1337, 42}; + dest = input; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // comparison with pair + pair lhs{42, 1337}; + pair rhs{1337, 42}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs 
> rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + pair lhs{42, 1337}; + pair rhs{1337, 42}; + lhs.swap(rhs); + assert(lhs.first == 1337); + assert(lhs.second == 42); + assert(rhs.first == 42); + assert(rhs.second == 1337); + + swap(lhs, rhs); + assert(lhs.first == 42); + assert(lhs.second == 1337); + assert(rhs.first == 1337); + assert(rhs.second == 42); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.assign.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.assign.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.assign.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.assign.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.cons.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.cons.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.cons.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.cons.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.conv.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.conv.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.conv.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.conv.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp new file mode 100644 index 00000000000..38ee416a8fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp @@ -0,0 +1,120 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License 
v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using variant = cuda::std::variant; + { // default construction + variant default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + variant value_initialization{device_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + variant value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // in_place_type_t initialization + variant in_place_initialization{cuda::std::in_place_type_t{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_index_t initialization + variant in_place_initialization{cuda::std::in_place_index_t<0>{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{ + cuda::std::in_place_type_t{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{cuda::std::in_place_index_t<0>{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // copy construction + variant input{42}; + variant dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + variant input{42}; + variant dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + variant input{42}; + variant dest{1337}; + dest = 
input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // emplace + variant var{42}; + var.emplace(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace + variant var{42}; + var.emplace<0>(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace init list + variant var{42}; + var.emplace(cuda::std::initializer_list{}, 42); + assert(cuda::std::get<0>(var) == 42); + } + + { // comparison with variant + variant lhs{42}; + variant rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + variant lhs{42}; + variant rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp new file mode 100644 index 00000000000..5f12da6074b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using variant = cuda::std::variant; + { // default construction + variant default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + variant value_initialization{host_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + variant value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // in_place_type_t initialization + variant in_place_initialization{cuda::std::in_place_type_t{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_index_t initialization + variant in_place_initialization{cuda::std::in_place_index_t<0>{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{ + cuda::std::in_place_type_t{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{cuda::std::in_place_index_t<0>{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // copy construction + variant input{42}; + variant dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + variant input{42}; + variant dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + variant input{42}; + variant dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to empty + variant input{42}; + variant dest{}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // emplace + variant 
var{42}; + var.emplace(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace + variant var{42}; + var.emplace<0>(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace init list + variant var{42}; + var.emplace(cuda::std::initializer_list{}, 42); + assert(cuda::std::get<0>(var) == 42); + } + + { // comparison with variant + variant lhs{42}; + variant rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + variant lhs{42}; + variant rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/support/host_device_types.h b/libcudacxx/test/support/host_device_types.h new file mode 100644 index 00000000000..e8fa21b85b9 --- /dev/null +++ b/libcudacxx/test/support/host_device_types.h @@ -0,0 +1,148 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TEST_SUPPORT_HOST_DEVICE_TYPES +#define TEST_SUPPORT_HOST_DEVICE_TYPES + +#include +#include + +#if !_CCCL_COMPILER(NVRTC) +struct host_only_type +{ + int val_; + + host_only_type(const int val = 0) noexcept + : val_(val) + {} + host_only_type(cuda::std::initializer_list, const int val) noexcept + : val_(val) + {} + + host_only_type(const host_only_type& other) noexcept + : val_(other.val_) + {} + host_only_type(host_only_type&& other) noexcept + : val_(cuda::std::exchange(other.val_, -1)) + {} + + host_only_type& operator=(const host_only_type& other) noexcept + { + val_ = other.val_; + return *this; + } + + host_only_type& operator=(host_only_type&& other) noexcept + + { + val_ = cuda::std::exchange(other.val_, -1); + return *this; + } + + ~host_only_type() noexcept {} + + _CCCL_NODISCARD_FRIEND bool operator==(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ == rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator!=(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ != rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator<(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ < rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator<=(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ <= rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator>(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ > rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator>=(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ >= rhs.val_; + } + + void swap(host_only_type& other) noexcept + { + cuda::std::swap(val_, other.val_); + } +}; +#endif // !_CCCL_COMPILER(NVRTC) + +#if _CCCL_HAS_CUDA_COMPILER +struct device_only_type +{ + int val_; + + __device__ device_only_type(const int val = 0) noexcept + : 
val_(val) + {} + __device__ device_only_type(cuda::std::initializer_list, const int val) noexcept + : val_(val) + {} + + __device__ device_only_type(const device_only_type& other) noexcept + : val_(other.val_) + {} + __device__ device_only_type(device_only_type&& other) noexcept + : val_(cuda::std::exchange(other.val_, -1)) + {} + + __device__ device_only_type& operator=(const device_only_type& other) noexcept + { + val_ = other.val_; + return *this; + } + + __device__ device_only_type& operator=(device_only_type&& other) noexcept + + { + val_ = cuda::std::exchange(other.val_, -1); + return *this; + } + + __device__ ~device_only_type() noexcept {} + + __device__ _CCCL_NODISCARD_FRIEND bool operator==(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ == rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator!=(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ != rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator<(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ < rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator<=(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ <= rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator>(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ > rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator>=(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ >= rhs.val_; + } + + __device__ void swap(device_only_type& other) noexcept + { + cuda::std::swap(val_, other.val_); + } +}; +#endif // _CCCL_HAS_CUDA_COMPILER + +#endif // TEST_SUPPORT_HOST_DEVICE_TYPES From 8615f321e6305a1dbbd72b8050c47e4e6b27790f Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Jan 2025 00:09:17 -0800 Subject: [PATCH 05/15] [nv/target] Add 
sm_120 macros. (#3550) Co-authored-by: Bernhard Manfred Gruber --- libcudacxx/include/nv/detail/__target_macros | 21 ++++++++++++++++++++ libcudacxx/include/nv/target | 9 +++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/nv/detail/__target_macros b/libcudacxx/include/nv/detail/__target_macros index 85df652c7d4..2de10fc8ec4 100644 --- a/libcudacxx/include/nv/detail/__target_macros +++ b/libcudacxx/include/nv/detail/__target_macros @@ -35,6 +35,7 @@ #define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 #define _NV_TARGET_ARCH_TO_SELECTOR_1000 nv::target::sm_100 #define _NV_TARGET_ARCH_TO_SELECTOR_1010 nv::target::sm_101 +#define _NV_TARGET_ARCH_TO_SELECTOR_1200 nv::target::sm_120 #define _NV_TARGET_ARCH_TO_SM_350 35 #define _NV_TARGET_ARCH_TO_SM_370 37 @@ -54,6 +55,7 @@ #define _NV_TARGET_ARCH_TO_SM_900 90 #define _NV_TARGET_ARCH_TO_SM_1000 100 #define _NV_TARGET_ARCH_TO_SM_1010 101 +#define _NV_TARGET_ARCH_TO_SM_1200 120 // Only enable when compiling for CUDA/stdpar #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA) @@ -76,6 +78,7 @@ # define _NV_TARGET_VAL_SM_90 nv::target::sm_90 # define _NV_TARGET_VAL_SM_100 nv::target::sm_100 # define _NV_TARGET_VAL_SM_101 nv::target::sm_101 +# define _NV_TARGET_VAL_SM_120 nv::target::sm_120 # define _NV_TARGET___NV_IS_HOST nv::target::is_host # define _NV_TARGET___NV_IS_DEVICE nv::target::is_device @@ -112,6 +115,7 @@ # define _NV_TARGET_VAL_SM_90 900 # define _NV_TARGET_VAL_SM_100 1000 # define _NV_TARGET_VAL_SM_101 1010 +# define _NV_TARGET_VAL_SM_120 1200 # if defined(__CUDA_ARCH__) # define _NV_TARGET_VAL __CUDA_ARCH__ @@ -160,6 +164,7 @@ # define _NV_TARGET_VAL_SM_90 900 # define _NV_TARGET_VAL_SM_100 1000 # define _NV_TARGET_VAL_SM_101 1010 +# define _NV_TARGET_VAL_SM_120 1200 # define _NV_TARGET_VAL 0 @@ -191,6 +196,7 @@ #define _NV_TARGET___NV_PROVIDES_SM_90 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_90)) #define _NV_TARGET___NV_PROVIDES_SM_100 
(_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_100)) #define _NV_TARGET___NV_PROVIDES_SM_101 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_101)) +#define _NV_TARGET___NV_PROVIDES_SM_120 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_120)) #define _NV_TARGET___NV_IS_EXACTLY_SM_35 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_35)) #define _NV_TARGET___NV_IS_EXACTLY_SM_37 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_37)) @@ -210,6 +216,7 @@ #define _NV_TARGET___NV_IS_EXACTLY_SM_90 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90)) #define _NV_TARGET___NV_IS_EXACTLY_SM_100 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_100)) #define _NV_TARGET___NV_IS_EXACTLY_SM_101 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_101)) +#define _NV_TARGET___NV_IS_EXACTLY_SM_120 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_120)) #define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 #define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 @@ -229,6 +236,7 @@ #define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 #define NV_PROVIDES_SM_100 __NV_PROVIDES_SM_100 #define NV_PROVIDES_SM_101 __NV_PROVIDES_SM_101 +#define NV_PROVIDES_SM_120 __NV_PROVIDES_SM_120 #define NV_IS_EXACTLY_SM_35 __NV_IS_EXACTLY_SM_35 #define NV_IS_EXACTLY_SM_37 __NV_IS_EXACTLY_SM_37 @@ -248,6 +256,7 @@ #define NV_IS_EXACTLY_SM_90 __NV_IS_EXACTLY_SM_90 #define NV_IS_EXACTLY_SM_100 __NV_IS_EXACTLY_SM_100 #define NV_IS_EXACTLY_SM_101 __NV_IS_EXACTLY_SM_101 +#define NV_IS_EXACTLY_SM_120 __NV_IS_EXACTLY_SM_120 // Disable SM_90a support on non-supporting compilers. // Will re-enable for nvcc below. @@ -381,6 +390,12 @@ # define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_101 0 # endif +# if (_NV_TARGET___NV_IS_EXACTLY_SM_120) +# define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_120 1 +# else +# define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_120 0 +# endif + // Re-enable sm_90a support in nvcc. 
# undef NV_HAS_FEATURE_SM_90a # define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a @@ -529,6 +544,12 @@ # define _NV_TARGET_BOOL___NV_PROVIDES_SM_101 0 # endif +# if (_NV_TARGET___NV_PROVIDES_SM_120) +# define _NV_TARGET_BOOL___NV_PROVIDES_SM_120 1 +# else +# define _NV_TARGET_BOOL___NV_PROVIDES_SM_120 0 +# endif + # define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond # define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) diff --git a/libcudacxx/include/nv/target b/libcudacxx/include/nv/target index d8617220c84..4b77011243f 100644 --- a/libcudacxx/include/nv/target +++ b/libcudacxx/include/nv/target @@ -68,9 +68,10 @@ constexpr base_int_t sm_89_bit = 1 << 15; constexpr base_int_t sm_90_bit = 1 << 16; constexpr base_int_t sm_100_bit = 1 << 17; constexpr base_int_t sm_101_bit = 1 << 18; +constexpr base_int_t sm_120_bit = 1 << 19; constexpr base_int_t all_devices = sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit - | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit | sm_101_bit; + | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit | sm_101_bit | sm_120_bit; // Store a set of targets as a set of bits struct _NV_BITSET_ATTRIBUTE target_description @@ -103,6 +104,7 @@ enum class sm_selector : base_int_t sm_90 = 90, sm_100 = 100, sm_101 = 101, + sm_120 = 120, }; constexpr base_int_t toint(sm_selector a) @@ -130,12 +132,14 @@ constexpr base_int_t bitexact(sm_selector a) : toint(a) == 90 ? sm_90_bit : toint(a) == 100 ? sm_100_bit : toint(a) == 101 ? sm_101_bit + : toint(a) == 120 ? sm_120_bit : 0; } constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 101 ? sm_101_bit + return toint(a) >= 120 ? sm_120_bit + : toint(a) >= 101 ? sm_101_bit : toint(a) >= 100 ? sm_100_bit : toint(a) >= 90 ? sm_90_bit : toint(a) >= 89 ? 
sm_89_bit @@ -214,6 +218,7 @@ constexpr sm_selector sm_89 = sm_selector::sm_89; constexpr sm_selector sm_90 = sm_selector::sm_90; constexpr sm_selector sm_100 = sm_selector::sm_100; constexpr sm_selector sm_101 = sm_selector::sm_101; +constexpr sm_selector sm_120 = sm_selector::sm_120; using detail::is_exactly; using detail::provides; From 3e888d8fd7953d595af016eacd89af610fb624e6 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 09:10:00 +0100 Subject: [PATCH 06/15] PTX: Remove internal instructions (#3583) * barrier.cluster.aligned: Remove This is not supposed to be exposed in CCCL. * elect.sync: Remove Not ready for inclusion yet. This needs to handle the optional extra output mask as well. * mapa: Remove This has compiler bugs. We should use intrinsics instead. Co-authored-by: Allard Hendriksen --- .../generated/barrier_cluster_aligned.rst | 63 --------- .../ptx/instructions/generated/elect_sync.rst | 11 -- .../ptx/instructions/generated/mapa.rst | 14 -- .../generated/barrier_cluster_aligned.h | 130 ------------------ .../__ptx/instructions/generated/elect_sync.h | 36 ----- .../cuda/__ptx/instructions/generated/mapa.h | 33 ----- .../ptx/generated/barrier_cluster_aligned.h | 61 -------- .../cuda/ptx/generated/elect_sync.h | 26 ---- .../test/libcudacxx/cuda/ptx/generated/mapa.h | 27 ---- 9 files changed, 401 deletions(-) delete mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst delete mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst delete mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h delete mode 100644 
libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst deleted file mode 100644 index a24093ac7b6..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst +++ /dev/null @@ -1,63 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -barrier.cluster.arrive.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::dot_aligned_t); - -barrier.cluster.wait.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_wait( - cuda::ptx::dot_aligned_t); - -barrier.cluster.arrive.release.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .release } - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::dot_aligned_t); - -barrier.cluster.arrive.relaxed.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .relaxed } - // .aligned = { .aligned } - // Marked volatile - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t, - cuda::ptx::dot_aligned_t); - -barrier.cluster.wait.acquire.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .acquire } - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::dot_aligned_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst deleted file mode 100644 index bc909c54319..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -elect.sync -^^^^^^^^^^ -.. code:: cuda - - // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 - template - __device__ static inline bool elect_sync( - const uint32_t& membermask); diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst deleted file mode 100644 index 4ffc70d85d9..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/mapa.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -mapa.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline Tp* mapa( - cuda::ptx::space_cluster_t, - const Tp* addr, - uint32_t target_cta); diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h deleted file mode 100644 index 80fe3796e69..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h +++ /dev/null @@ -1,130 +0,0 @@ -// This file was automatically generated. Do not edit. 
- -#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ -#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ - -/* -// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t) -{ -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t) -{ -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.wait.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .release } -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void 
barrier_cluster_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t) -{ -// __sem == sem_release (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// .aligned = { .aligned } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t) -{ -// __sem == sem_relaxed (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t, - 
cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t) -{ -// __sem == sem_acquire (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h deleted file mode 100644 index e8691178f14..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h +++ /dev/null @@ -1,36 +0,0 @@ -// This file was automatically generated. Do not edit. 
- -#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_ -#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_ - -/* -// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 -template -__device__ static inline bool elect_sync( - const uint32_t& membermask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask) -{ -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - _CUDA_VSTD::uint32_t __is_elected; - asm volatile( - "{\n\t .reg .pred P_OUT; \n\t" - "elect.sync _|P_OUT, %1;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__is_elected) - : "r"(__membermask) - :); - return static_cast(__is_elected); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); - return false; -# endif -} -#endif // __cccl_ptx_isa >= 800 - -#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h deleted file mode 100644 index f93c8a62157..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h +++ /dev/null @@ -1,33 +0,0 @@ -// This file was automatically generated. Do not edit. 
- -#ifndef _CUDA_PTX_GENERATED_MAPA_H_ -#define _CUDA_PTX_GENERATED_MAPA_H_ - -/* -// mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 -// .space = { .shared::cluster } -template -__device__ static inline Tp* mapa( - cuda::ptx::space_cluster_t, - const Tp* addr, - uint32_t target_cta); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta) -{ -// __space == space_cluster (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - _CUDA_VSTD::uint32_t __dest; - asm("mapa.shared::cluster.u32 %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :); - return __from_ptr_dsmem<_Tp>(__dest); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mapa_is_not_supported_before_SM_90__(); - return __from_ptr_dsmem<_Tp>(0); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -#endif // _CUDA_PTX_GENERATED_MAPA_H_ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h deleted file mode 100644 index 6f5a022dbc8..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h +++ /dev/null @@ -1,61 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. 
-// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. - -__global__ void test_barrier_cluster_aligned(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.aligned; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.aligned; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.release.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.relaxed.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.acquire.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h deleted file mode 100644 index 298225881d1..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h +++ /dev/null @@ -1,26 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. 
-// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. - -__global__ void test_elect_sync(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // elect.sync _|is_elected, membermask; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::elect_sync));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h deleted file mode 100644 index 9160be1fe2d..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h +++ /dev/null @@ -1,27 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. 
- -__global__ void test_mapa(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mapa.shared::cluster.u32 dest, addr, target_cta; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mapa));)); -#endif // __cccl_ptx_isa >= 780 -} From 15a011658172b1b63bfac8a96fb49fec6d6af92a Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 30 Jan 2025 03:03:11 -0600 Subject: [PATCH 07/15] Add dynamic CUB dispatch for merge_sort (#3525) * Add `dependent_launch` parameter to `TripleChevronFactory` * Add `ItemsPerTile()` method to `PolicyWrapper` * Add `MergeSortPolicyWrapper` * Add `KernelSource` and use `launcher_factory` to launch `merge_sort` kernels * Move the vsmem_helper to kernel source and read `BlockThreads` from there instead of the policy directly * Make `BlockThreads` templated on the policy type * Obtain `ItemsPerTile` from the kernel source through vsmem helper * Change vsmem indirection so that it is its own template parameter passed to `DispatchMergeSort` * Use `_CCCL_HOST_DEVICE` for RTC --- cub/cub/detail/launcher/cuda_runtime.cuh | 6 +- .../device/dispatch/dispatch_merge_sort.cuh | 214 +++++++++++------- .../device/dispatch/kernels/merge_sort.cuh | 22 ++ .../dispatch/tuning/tuning_merge_sort.cuh | 32 ++- cub/cub/util_device.cuh | 5 + 5 files changed, 189 insertions(+), 90 deletions(-) diff --git a/cub/cub/detail/launcher/cuda_runtime.cuh b/cub/cub/detail/launcher/cuda_runtime.cuh index 81ef450f424..f59c26d7fbb 100644 --- a/cub/cub/detail/launcher/cuda_runtime.cuh +++ b/cub/cub/detail/launcher/cuda_runtime.cuh @@ -21,10 +21,10 @@ namespace detail struct TripleChevronFactory { - CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron - operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const + CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron operator()( + dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream, bool 
dependent_launch = false) const { - return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream); + return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream, dependent_launch); } CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version) diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 056522e162d..98a4b40e8f8 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -46,7 +46,6 @@ #include #include -#include #include #include @@ -54,24 +53,89 @@ CUB_NAMESPACE_BEGIN -/******************************************************************************* - * Policy - ******************************************************************************/ - -template > + typename CompareOpT> +struct DeviceMergeSortKernelSource +{ + using KeyT = cub::detail::value_t; + using ValueT = cub::detail::value_t; + + CUB_DEFINE_KERNEL_GETTER( + MergeSortBlockSortKernel, + DeviceMergeSortBlockSortKernel< + MaxPolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>); + + CUB_DEFINE_KERNEL_GETTER(MergeSortPartitionKernel, + DeviceMergeSortPartitionKernel); + + CUB_DEFINE_KERNEL_GETTER( + MergeSortMergeKernel, + DeviceMergeSortMergeKernel); +}; + +} // namespace detail::merge_sort + +/******************************************************************************* + * Policy + ******************************************************************************/ + +template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyIteratorT, + typename ValueIteratorT, + typename OffsetT, + typename CompareOpT, + typename PolicyHub = detail::merge_sort::policy_hub, + typename KernelSource = detail::merge_sort::DeviceMergeSortKernelSource< + typename PolicyHub::MaxPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + 
KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT>, + typename KernelLauncherFactory = detail::TripleChevronFactory, + typename VSMemHelperPolicyT = detail::merge_sort::merge_sort_vsmem_helper_t< + typename PolicyHub::MaxPolicy::MergeSortPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + cub::detail::value_t, + cub::detail::value_t>> struct DispatchMergeSort { using KeyT = cub::detail::value_t; using ValueT = cub::detail::value_t; /// Whether or not there are values to be trucked along with keys - static constexpr bool KEYS_ONLY = std::is_same::value; + static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v; // Problem state @@ -106,6 +170,12 @@ struct DispatchMergeSort int ptx_version; + KernelSource kernel_source; + + KernelLauncherFactory launcher_factory; + + VSMemHelperPolicyT vsmem_helper; + // Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchMergeSort( void* d_temp_storage, @@ -117,7 +187,10 @@ struct DispatchMergeSort OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, - int ptx_version) + int ptx_version, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + VSMemHelperPolicyT vsmem_helper = {}) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input_keys(d_input_keys) @@ -128,28 +201,15 @@ struct DispatchMergeSort , compare_op(compare_op) , stream(stream) , ptx_version(ptx_version) + , kernel_source(kernel_source) + , launcher_factory(launcher_factory) + , vsmem_helper(vsmem_helper) {} // Invocation template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT policy = {}) { - using MergePolicyT = typename ActivePolicyT::MergeSortPolicy; - - using merge_sort_helper_t = detail::merge_sort::merge_sort_vsmem_helper_t< - MergePolicyT, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - 
CompareOpT, - KeyT, - ValueT>; - - using BlockSortVSmemHelperT = detail::vsmem_helper_impl; - using MergeAgentVSmemHelperT = detail::vsmem_helper_impl; - cudaError error = cudaSuccess; if (num_items == 0) @@ -163,8 +223,9 @@ struct DispatchMergeSort do { - constexpr auto tile_size = merge_sort_helper_t::policy_t::ITEMS_PER_TILE; - const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); + auto wrapped_policy = detail::merge_sort::MakeMergeSortPolicyWrapper(policy); + const auto tile_size = vsmem_helper.ItemsPerTile(wrapped_policy.MergeSort()); + const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); @@ -174,8 +235,8 @@ struct DispatchMergeSort * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases, * merge sort allocates virtual shared memory that resides in global memory. 
*/ - const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; - const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; + const std::size_t block_sort_smem_size = num_tiles * vsmem_helper.block_sort_vsmem_per_block(); + const std::size_t merge_smem_size = num_tiles * vsmem_helper.merge_vsmem_per_block(); const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; @@ -214,29 +275,19 @@ struct DispatchMergeSort auto items_buffer = static_cast(allocations[2]); // Invoke DeviceMergeSortBlockSortKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true) - .doit( - detail::merge_sort::DeviceMergeSortBlockSortKernel< - typename PolicyHub::MaxPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>, - ping, - d_input_keys, - d_input_items, - d_output_keys, - d_output_items, - num_items, - keys_buffer, - items_buffer, - compare_op, - cub::detail::vsmem_t{allocations[3]}); + launcher_factory( + static_cast(num_tiles), vsmem_helper.BlockThreads(wrapped_policy.MergeSort()), 0, stream, true) + .doit(kernel_source.MergeSortBlockSortKernel(), + ping, + d_input_keys, + d_input_items, + d_output_keys, + d_output_items, + num_items, + keys_buffer, + items_buffer, + compare_op, + cub::detail::vsmem_t{allocations[3]}); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) @@ -273,9 +324,8 @@ struct DispatchMergeSort const OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - partition_grid_size, threads_per_partition_block, 0, stream, true) - .doit(detail::merge_sort::DeviceMergeSortPartitionKernel, + launcher_factory(partition_grid_size, 
threads_per_partition_block, 0, stream, true) + .doit(kernel_source.MergeSortPartitionKernel(), ping, d_output_keys, keys_buffer, @@ -300,29 +350,19 @@ struct DispatchMergeSort } // Merge - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true) - .doit( - detail::merge_sort::DeviceMergeSortMergeKernel< - typename PolicyHub::MaxPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>, - ping, - d_output_keys, - d_output_items, - num_items, - keys_buffer, - items_buffer, - compare_op, - merge_partitions, - target_merged_tiles_number, - cub::detail::vsmem_t{allocations[3]}); + launcher_factory( + static_cast(num_tiles), vsmem_helper.BlockThreads(wrapped_policy.MergeSort()), 0, stream, true) + .doit(kernel_source.MergeSortMergeKernel(), + ping, + d_output_keys, + d_output_items, + num_items, + keys_buffer, + items_buffer, + compare_op, + merge_partitions, + target_merged_tiles_number, + cub::detail::vsmem_t{allocations[3]}); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) @@ -342,6 +382,7 @@ struct DispatchMergeSort return error; } + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -351,7 +392,11 @@ struct DispatchMergeSort ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, - cudaStream_t stream) + cudaStream_t stream, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + MaxPolicyT max_policy = {}, + VSMemHelperPolicyT vsmem_helper = {}) { cudaError error = cudaSuccess; do @@ -375,10 +420,13 @@ struct DispatchMergeSort num_items, compare_op, stream, - ptx_version); + ptx_version, + kernel_source, + launcher_factory, + vsmem_helper); // Dispatch to chained policy - error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, 
dispatch)); + error = CubDebug(max_policy.Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh index 1065313c20d..c9a8a61395a 100644 --- a/cub/cub/device/dispatch/kernels/merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh @@ -116,6 +116,28 @@ public: using block_sort_agent_t = ::cuda::std::_If; using merge_agent_t = ::cuda::std::_If; + + _CCCL_HOST_DEVICE static constexpr ::cuda::std::size_t block_sort_vsmem_per_block() + { + return detail::vsmem_helper_impl::vsmem_per_block; + } + + _CCCL_HOST_DEVICE static constexpr ::cuda::std::size_t merge_vsmem_per_block() + { + return detail::vsmem_helper_impl::vsmem_per_block; + } + + template + _CCCL_HOST_DEVICE static constexpr int BlockThreads(PolicyT /*policy*/) + { + return policy_t::BLOCK_THREADS; + } + + template + _CCCL_HOST_DEVICE static constexpr int ItemsPerTile(PolicyT /*policy*/) + { + return policy_t::ITEMS_PER_TILE; + } }; template +struct MergeSortPolicyWrapper : PolicyT +{ + CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper(PolicyT base) + : PolicyT(base) + {} +}; + +template +struct MergeSortPolicyWrapper> + : StaticPolicyT +{ + CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper(StaticPolicyT base) + : StaticPolicyT(base) + {} + + CUB_DEFINE_SUB_POLICY_GETTER(MergeSort); +}; + +template +CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper MakeMergeSortPolicyWrapper(PolicyT policy) { + return MergeSortPolicyWrapper{policy}; +} + template struct policy_hub { @@ -88,8 +112,8 @@ struct policy_hub using MaxPolicy = Policy600; }; -} // namespace merge_sort -} // namespace detail + +} // namespace detail::merge_sort template using DeviceMergeSortPolicy CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and it will be " diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index fd356b8f9e5..ca365b531fc 100644 --- a/cub/cub/util_device.cuh +++ 
b/cub/cub/util_device.cuh @@ -553,6 +553,11 @@ struct PolicyWrapper< { return StaticPolicyT::ITEMS_PER_THREAD; } + + CUB_RUNTIME_FUNCTION static constexpr int ItemsPerTile() + { + return StaticPolicyT::ITEMS_PER_TILE; + } }; template From 5ce5d28f0572d34126e00f0765977d8c54391e8e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 10:52:13 +0100 Subject: [PATCH 08/15] PTX: Update existing instructions (#3584) * mbarrier.expect_tx: Add missing source and test It was already documented(!) * cp.async.bulk.tensor: Add .{gather,scatter}4 * fence: Add .sync_restrict, .proxy.async.sync_restrict Co-authored-by: Allard Hendriksen --- .../ptx/instructions/cp_async_bulk_tensor.rst | 5 +++ docs/libcudacxx/ptx/instructions/fence.rst | 10 +++++ .../__ptx/instructions/cp_async_bulk_tensor.h | 1 + .../include/cuda/__ptx/instructions/fence.h | 2 + .../__ptx/instructions/mbarrier_expect_tx.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 1 + .../cuda/ptx/ptx.fence.compile.pass.cpp | 2 + .../ptx.mbarrier.expect_tx.compile.pass.cpp | 22 +++++++++++ 9 files changed, 81 insertions(+) create mode 100644 libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst index bde3488bac9..8dc9def989b 100644 --- a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst @@ -21,3 +21,8 @@ Multicast --------- .. include:: generated/cp_async_bulk_tensor_multicast.rst + +Scatter / Gather +---------------- + +.. 
include:: generated/cp_async_bulk_tensor_gather_scatter.rst diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst index 82de170f63b..4d9126be62f 100644 --- a/docs/libcudacxx/ptx/instructions/fence.rst +++ b/docs/libcudacxx/ptx/instructions/fence.rst @@ -13,6 +13,11 @@ fence .. include:: generated/fence.rst +fence.sync_restrict +------------------- + +.. include:: generated/fence_sync_restrict.rst + fence.mbarrier_init ------------------- @@ -29,6 +34,11 @@ fence.proxy.async .. include:: generated/fence_proxy_async.rst +fence.proxy.async.sync_restrict +------------------------------- + +.. include:: generated/fence_proxy_async_generic_sync_restrict.rst + fence.proxy.tensormap --------------------- diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 7de5b41b744..f99c0c6f73b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -33,6 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. 
Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor #include +#include #include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index a8dccf979c2..3c123840797 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -36,7 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX #include #include #include +#include #include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h new file mode 100644 index 00000000000..886bfe64d75 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MBARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_MBARRIER_EXPECT_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MBARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 44edb20c98e..4798973df77 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 42bc5b8e355..efd66a8fa4e 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -17,6 +17,7 @@ #include "nvrtc_workaround.h" // above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor.h" +#include "generated/cp_async_bulk_tensor_gather_scatter.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index c439720b8f8..aa2c9ec6152 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -20,7 +20,9 @@ #include "generated/fence_mbarrier_init.h" #include "generated/fence_proxy_alias.h" #include "generated/fence_proxy_async.h" 
+#include "generated/fence_proxy_async_generic_sync_restrict.h" #include "generated/fence_proxy_tensormap_generic.h" +#include "generated/fence_sync_restrict.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp new file mode 100644 index 00000000000..f4d06bdb8ba --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/mbarrier_expect_tx.h" + +int main(int, char**) +{ + return 0; +} From a1a73a8708eac531498762c22999d0a5aea076d0 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 11:26:03 +0100 Subject: [PATCH 09/15] Internalize cuda/detail/core/util.h (#3505) --- cub/cub/agent/agent_adjacent_difference.cuh | 4 +- cub/cub/agent/agent_merge.cuh | 8 +- cub/cub/agent/agent_merge_sort.cuh | 15 ++-- cub/cub/agent/agent_sub_warp_merge_sort.cuh | 4 +- cub/cub/device/dispatch/dispatch_merge.cuh | 2 +- .../device/dispatch/kernels/merge_sort.cuh | 19 ++--- .../system/cuda/detail/core/agent_launcher.h | 46 +++--------- .../system/cuda/detail/core/load_iterator.h | 4 +- .../cuda/detail/core/make_load_iterator.h | 4 +- thrust/thrust/system/cuda/detail/core/util.h | 51 +------------ thrust/thrust/system/cuda/detail/extrema.h | 18 ++--- thrust/thrust/system/cuda/detail/reduce.h | 40 
+++++----- .../thrust/system/cuda/detail/reduce_by_key.h | 48 ++++++------ .../system/cuda/detail/set_operations.h | 73 +++++++++---------- thrust/thrust/system/cuda/detail/sort.h | 6 +- thrust/thrust/system/cuda/detail/unique.h | 43 +++++------ 16 files changed, 156 insertions(+), 229 deletions(-) diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index c19cb90079a..8617c78193b 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -79,7 +79,7 @@ template struct AgentDifference { - using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using BlockLoad = typename cub::BlockLoadType::type; using BlockStore = typename cub::BlockStoreType::type; @@ -119,7 +119,7 @@ struct AgentDifference OffsetT num_items) : temp_storage(temp_storage.Alias()) , input_it(input_it) - , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it)) + , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it)) , first_tile_previous(first_tile_previous) , result(result) , difference_op(difference_op) diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index 9ae14c3e42e..5c7d5322456 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -64,10 +64,10 @@ struct agent_t using key_type = typename ::cuda::std::iterator_traits::value_type; using item_type = typename ::cuda::std::iterator_traits::value_type; - using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using 
keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using block_load_keys1 = typename BlockLoadType::type; using block_load_keys2 = typename BlockLoadType::type; diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index bf4984f7256..1ec952187a7 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -91,8 +91,10 @@ struct AgentBlockSort using BlockMergeSortT = BlockMergeSort; - using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using BlockLoadKeys = typename cub::BlockLoadType::type; using BlockLoadItems = typename cub::BlockLoadType::type; @@ -438,10 +440,11 @@ struct AgentMerge //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- - using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadPingIt = + typename 
THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using KeysOutputPongIt = KeyIteratorT; using ItemsOutputPongIt = ValueIteratorT; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index b10f1cda3ea..9f98ac42e3b 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -183,8 +183,8 @@ public: using WarpMergeSortT = WarpMergeSort; - using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using WarpLoadKeysT = cub::WarpLoad; using WarpLoadItemsT = diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index b3d0c8ab2ca..c4df61fd29a 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -138,7 +138,7 @@ __launch_bounds__( CompareOp>::type; using MergePolicy = typename MergeAgent::policy; - using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator; + using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator; using vsmem_helper_t = vsmem_helper_impl; __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage; auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage); diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh index c9a8a61395a..79f7c6bbe40 100644 --- a/cub/cub/device/dispatch/kernels/merge_sort.cuh +++ 
b/cub/cub/device/dispatch/kernels/merge_sort.cuh @@ -19,12 +19,13 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { // We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code template -typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it); -} // namespace cuda_cub::core +typename detail::LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE +make_load_iterator(PtxPlan const&, It it); +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END @@ -196,8 +197,8 @@ __launch_bounds__( AgentBlockSortT agent( ping, temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in), keys_count, keys_out, items_out, @@ -302,10 +303,10 @@ __launch_bounds__( AgentMergeT agent( ping, temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong), keys_count, keys_pong, items_pong, diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h 
b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index fb7c1ef22d6..d9baeb47593 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -62,7 +62,8 @@ namespace cuda_cub { namespace core { - +namespace detail +{ # ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES # define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES # endif @@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args) template struct AgentLauncher : Agent { - core::AgentPlan plan; + AgentPlan plan; size_t count; cudaStream_t stream; char const* name; @@ -121,7 +122,7 @@ struct AgentLauncher : Agent , name(name_) , grid(static_cast((count + plan.items_per_tile - 1) / plan.items_per_tile)) , vshmem(nullptr) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(count > 0); @@ -136,7 +137,7 @@ struct AgentLauncher : Agent , name(name_) , grid(static_cast((count + plan.items_per_tile - 1) / plan.items_per_tile)) , vshmem(vshmem) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(count > 0); @@ -149,7 +150,7 @@ struct AgentLauncher : Agent , name(name_) , grid(plan.grid_size) , vshmem(nullptr) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? 
plan.shared_memory_size : 0) { assert(plan.grid_size > 0); @@ -162,43 +163,19 @@ struct AgentLauncher : Agent , name(name_) , grid(plan.grid_size) , vshmem(vshmem) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(plan.grid_size > 0); } -# if 0 - THRUST_RUNTIME_FUNCTION - AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0) - { - // in separable compilation mode, we have no choice - // but to call kernel to get agent_plan - // otherwise the risk is something may fail - // if user mix & match ptx versions in a separably compiled function - // http://nvbugs/1772071 - // XXX may be it is too string of a requirements, consider relaxing it in - // the future -# ifdef __CUDACC_RDC__ - return core::get_agent_plan(s, d_ptr); -# else - return get_agent_plan(core::get_ptx_version()); -# endif - } - THRUST_RUNTIME_FUNCTION - AgentPlan static get_plan_default() - { - return get_agent_plan(sm_arch<0>::type::ver); - } -# endif - - THRUST_RUNTIME_FUNCTION typename core::get_plan::type static get_plan(cudaStream_t, void* d_ptr = 0) + THRUST_RUNTIME_FUNCTION typename get_plan::type static get_plan(cudaStream_t, void* d_ptr = 0) { THRUST_UNUSED_VAR(d_ptr); - return get_agent_plan(core::get_ptx_version()); + return get_agent_plan(get_ptx_version()); } - THRUST_RUNTIME_FUNCTION typename core::get_plan::type static get_plan() + THRUST_RUNTIME_FUNCTION typename detail::get_plan::type static get_plan() { return get_agent_plan(lowest_supported_sm_arch::ver); } @@ -227,7 +204,7 @@ struct AgentLauncher : Agent { # if THRUST_DEBUG_SYNC_FLAG cuda_optional occ = max_sm_occupancy(k); - const int ptx_version = core::get_ptx_version(); + const int ptx_version = get_ptx_version(); if (count > 0) { _CubLog( @@ -305,6 +282,7 @@ struct AgentLauncher : Agent } }; +} // namespace detail } // 
namespace core } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h index 07c5eba0eaa..6f2c118b151 100644 --- a/thrust/thrust/system/cuda/detail/core/load_iterator.h +++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h @@ -34,7 +34,7 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { // LoadIterator @@ -52,6 +52,6 @@ struct LoadIterator cub::CacheModifiedInputIterator, It>; }; // struct Iterator -} // namespace cuda_cub::core +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h index 28c65c813ea..9497ccacca9 100644 --- a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h +++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h @@ -33,7 +33,7 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { template typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE @@ -55,6 +55,6 @@ typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa return make_load_iterator_impl(it, typename is_contiguous_iterator::type()); } -} // namespace cuda_cub::core +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 94a7e750aeb..b3bdcf1f086 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -78,6 +78,8 @@ namespace core # endif #endif +namespace detail +{ /// Typelist - a container of types template struct typelist; @@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block() return static_cast(i32value); } -THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block) -{ - size_t max_shmem_per_block = 
core::get_max_shared_memory_per_block(); - if (shmem_per_block > max_shmem_per_block) - { - return shmem_per_block; - } - else - { - return 0; - } -} - THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks) { - size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); + size_t max_shmem_per_block = get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) { return shmem_per_block * num_blocks; @@ -509,22 +498,6 @@ struct BlockLoad get_arch::type::ver>; }; -// BlockStore -// ----------- -// a helper metaprogram that returns type of a block loader -template ::value_type> -struct BlockStore -{ - using type = - cub::BlockStore::type::ver>; -}; - // cuda_optional // -------------- // used for function that return cudaError_t along with the result @@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version() return ptx_version; } -THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream) -{ - return cub::SyncStream(stream); -} - -inline void _CCCL_DEVICE sync_threadblock() -{ - __syncthreads(); -} - // Deprecated [Since 2.8] #define CUDA_CUB_RET_IF_FAIL(e) \ { \ @@ -719,11 +682,6 @@ struct uninitialized_array } }; -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align) -{ - return ((n + align - 1) / align) * align; -} - namespace host { inline cuda_optional get_max_shared_memory_per_block() @@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage( return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes); } +} // namespace detail } // namespace core -using core::sm52; -using core::sm60; } // namespace cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h index 617eb8bbc79..b2124323424 100644 --- a/thrust/thrust/system/cuda/detail/extrema.h +++ b/thrust/thrust/system/cuda/detail/extrema.h @@ -184,10 +184,10 @@ cudaError_t 
THRUST_RUNTIME_FUNCTION doit_step( OutputIt output_it, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::cuda_optional; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::cuda_optional; + using core::detail::get_agent_plan; using UnsignedSize = typename detail::make_unsigned_special::type; @@ -204,7 +204,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( if (num_items <= reduce_plan.items_per_tile) { - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == nullptr) @@ -221,7 +221,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( else { // regular size - cuda_optional sm_count = core::get_sm_count(); + cuda_optional sm_count = core::detail::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested @@ -245,7 +245,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // we will launch at most "max_blocks" blocks in a grid // so preallocate virtual shared memory storage for this if required // - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void* allocations[3] = {nullptr, nullptr, nullptr}; @@ -331,14 +331,14 @@ extrema(execution_policy& policy, InputIt first, Size num_items, Binary void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage"); // Allocate temporary storage. 
thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage"); T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h index 3787ab62367..61ec2086adf 100644 --- a/thrust/thrust/system/cuda/detail/reduce.h +++ b/thrust/thrust/system/cuda/detail/reduce.h @@ -109,7 +109,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -155,7 +155,7 @@ struct ReduceAgent using tuning = Tuning; using Vector = typename cub::CubVector; - using LoadIt = typename core::LoadIterator::type; + using LoadIt = typename core::detail::LoadIterator::type; using BlockReduce = cub::BlockReduce; using VectorLoadIt = cub::CacheModifiedInputIterator; @@ -175,7 +175,7 @@ struct ReduceAgent // Other algorithms, e.g. merge, may not need additional information, // and may use AgentPlan directly, instead of defining their own Plan type. 
// - struct Plan : core::AgentPlan + struct Plan : core::detail::AgentPlan { cub::GridMappingStrategy grid_mapping; @@ -183,7 +183,7 @@ struct ReduceAgent template THRUST_RUNTIME_FUNCTION Plan(P) - : core::AgentPlan(P()) + : core::detail::AgentPlan(P()) , grid_mapping(P::GRID_MAPPING) {} }; @@ -192,7 +192,7 @@ struct ReduceAgent // ptx_plan type *must* only be used from device code // Its use from host code will result in *undefined behaviour* // - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using TempStorage = typename ptx_plan::TempStorage; using Vector = typename ptx_plan::Vector; @@ -230,7 +230,7 @@ struct ReduceAgent THRUST_DEVICE_FUNCTION impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_) : storage(storage_) , input_it(input_it_) - , load_it(core::make_load_iterator(ptx_plan(), input_it)) + , load_it(core::detail::make_load_iterator(ptx_plan(), input_it)) , reduction_op(reduction_op_) {} @@ -428,8 +428,6 @@ struct ReduceAgent THRUST_DEVICE_FUNCTION T consume_tiles_impl(Size num_items, cub::GridQueue queue, CAN_VECTORIZE can_vectorize) { - using core::sync_threadblock; - // We give each thread block at least one tile of input. 
T thread_aggregate; Size block_offset = blockIdx.x * ITEMS_PER_TILE; @@ -454,7 +452,7 @@ struct ReduceAgent storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; } - sync_threadblock(); + __syncthreads(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; @@ -465,7 +463,7 @@ struct ReduceAgent consume_tile( thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize); - sync_threadblock(); + __syncthreads(); // Dequeue a tile of items if (threadIdx.x == 0) @@ -473,7 +471,7 @@ struct ReduceAgent storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; } - sync_threadblock(); + __syncthreads(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; @@ -586,7 +584,7 @@ struct DrainAgent template struct PtxPlan : PtxPolicy<1> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -609,10 +607,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( OutputIt output_it, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::cuda_optional; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::cuda_optional; + using core::detail::get_agent_plan; using UnsignedSize = typename detail::make_unsigned_special::type; @@ -629,7 +627,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( if (num_items <= reduce_plan.items_per_tile) { - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == nullptr) @@ -646,7 +644,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( else { // regular size - cuda_optional sm_count = core::get_sm_count(); + 
cuda_optional sm_count = core::detail::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested @@ -670,7 +668,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // we will launch at most "max_blocks" blocks in a grid // so preallocate virtual shared memory storage for this if required // - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void* allocations[3] = {nullptr, nullptr, nullptr}; @@ -755,14 +753,14 @@ reduce(execution_policy& policy, InputIt first, Size num_items, T init, void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage"); // Allocate temporary storage. 
thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage"); T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index ae1f0ffab96..8c1db436085 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -115,7 +115,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -163,11 +163,11 @@ struct ReduceByKeyAgent { using tuning = Tuning; - using KeysLoadIt = typename core::LoadIterator::type; - using ValuesLoadIt = typename core::LoadIterator::type; + using KeysLoadIt = typename core::detail::LoadIterator::type; + using ValuesLoadIt = typename core::detail::LoadIterator::type; - using BlockLoadKeys = typename core::BlockLoad::type; - using BlockLoadValues = typename core::BlockLoad::type; + using BlockLoadKeys = typename core::detail::BlockLoad::type; + using BlockLoadValues = typename core::detail::BlockLoad::type; using BlockDiscontinuityKeys = cub::BlockDiscontinuity; @@ -188,11 +188,11 @@ struct ReduceByKeyAgent typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadValues::TempStorage load_values; - core::uninitialized_array raw_exchange; + core::detail::uninitialized_array raw_exchange; }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using KeysLoadIt = typename ptx_plan::KeysLoadIt; using ValuesLoadIt = typename ptx_plan::ValuesLoadIt; @@ -360,9 +360,7 @@ struct ReduceByKeyAgent size_type num_tile_segments, 
size_type num_tile_segments_prefix) { - using core::sync_threadblock; - - sync_threadblock(); + __syncthreads(); // Compact and scatter keys # pragma unroll @@ -375,7 +373,7 @@ struct ReduceByKeyAgent } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -445,8 +443,6 @@ struct ReduceByKeyAgent template THRUST_DEVICE_FUNCTION void consume_first_tile(Size num_remaining, Size tile_offset, ScanTileState& tile_state) { - using core::sync_threadblock; - key_type keys[ITEMS_PER_THREAD]; // Tile keys key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor) value_type values[ITEMS_PER_THREAD]; // Tile values @@ -468,7 +464,7 @@ struct ReduceByKeyAgent BlockLoadKeys(storage.load_keys).Load(keys_load_it + tile_offset, keys); } - sync_threadblock(); + __syncthreads(); // Load values (last tile repeats final element) if (IS_LAST_TILE) @@ -481,7 +477,7 @@ struct ReduceByKeyAgent BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values); } - sync_threadblock(); + __syncthreads(); // Set head segment_flags. // First tile sets the first flag for the first item @@ -540,8 +536,6 @@ struct ReduceByKeyAgent THRUST_DEVICE_FUNCTION void consume_subsequent_tile(Size num_remaining, int tile_idx, Size tile_offset, ScanTileState& tile_state) { - using core::sync_threadblock; - key_type keys[ITEMS_PER_THREAD]; // Tile keys key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor) value_type values[ITEMS_PER_THREAD]; // Tile values @@ -563,7 +557,7 @@ struct ReduceByKeyAgent key_type tile_pred_key = (threadIdx.x == 0) ? 
key_type(keys_load_it[tile_offset - 1]) : key_type(); - sync_threadblock(); + __syncthreads(); // Load values (last tile repeats final element) if (IS_LAST_TILE) @@ -576,7 +570,7 @@ struct ReduceByKeyAgent BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values); } - sync_threadblock(); + __syncthreads(); // Set head segment_flags BlockDiscontinuityKeys(storage.scan_storage.discontinuity) @@ -635,8 +629,8 @@ struct ReduceByKeyAgent int /*num_tiles*/, ScanTileState& tile_state) : storage(storage_) - , keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_)) - , values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_)) + , keys_load_it(core::detail::make_load_iterator(ptx_plan(), keys_input_it_)) + , values_load_it(core::detail::make_load_iterator(ptx_plan(), values_input_it_)) , keys_output_it(keys_output_it_) , values_output_it(values_output_it_) , num_runs_output_it(num_runs_output_it_) @@ -703,7 +697,7 @@ struct InitAgent template struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -740,8 +734,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step( Size num_items, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; cudaError_t status = cudaSuccess; if (num_items == 0) @@ -762,7 +756,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step( int tile_size = reduce_by_key_plan.items_per_tile; Size num_tiles = ::cuda::ceil_div(num_items, tile_size); - size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); + size_t vshmem_size = core::detail::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); size_t allocation_sizes[2] = {9, vshmem_size}; status = ScanTileState::AllocationSize(static_cast(num_tiles), allocation_sizes[0]); @@ 
-848,14 +842,14 @@ THRUST_RUNTIME_FUNCTION pair reduce_by_key_dispatc void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage"); Size* d_num_runs_out = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h index 7a267080bf8..b336f8e55fa 100644 --- a/thrust/thrust/system/cuda/detail/set_operations.h +++ b/thrust/thrust/system/cuda/detail/set_operations.h @@ -222,7 +222,7 @@ struct Tuning; namespace mpl = thrust::detail::mpl::math; template -struct Tuning +struct Tuning { enum { @@ -243,7 +243,7 @@ struct Tuning }; // tuning sm52 template -struct Tuning +struct Tuning { enum { @@ -290,15 +290,15 @@ struct SetOpAgent { using tuning = Tuning; - using KeysLoadIt1 = typename core::LoadIterator::type; - using KeysLoadIt2 = typename core::LoadIterator::type; - using ValuesLoadIt1 = typename core::LoadIterator::type; - using ValuesLoadIt2 = typename core::LoadIterator::type; + using KeysLoadIt1 = typename core::detail::LoadIterator::type; + using KeysLoadIt2 = typename core::detail::LoadIterator::type; + using ValuesLoadIt1 = typename core::detail::LoadIterator::type; + using ValuesLoadIt2 = typename core::detail::LoadIterator::type; - using BlockLoadKeys1 = typename core::BlockLoad::type; - using BlockLoadKeys2 = typename 
core::BlockLoad::type; - using BlockLoadValues1 = typename core::BlockLoad::type; - using BlockLoadValues2 = typename core::BlockLoad::type; + using BlockLoadKeys1 = typename core::detail::BlockLoad::type; + using BlockLoadKeys2 = typename core::detail::BlockLoad::type; + using BlockLoadValues1 = typename core::detail::BlockLoad::type; + using BlockLoadValues2 = typename core::detail::BlockLoad::type; using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; @@ -316,7 +316,7 @@ struct SetOpAgent struct LoadStorage { - core::uninitialized_array offset; + core::detail::uninitialized_array offset; union { // FIXME These don't appear to be used anywhere? @@ -328,15 +328,15 @@ struct SetOpAgent // Allocate extra shmem than truly necessary // This will permit to avoid range checks in // serial set operations, e.g. serial_set_difference - core::uninitialized_array keys_shared; + core::detail::uninitialized_array keys_shared; - core::uninitialized_array values_shared; + core::detail::uninitialized_array values_shared; }; // anon union } load_storage; // struct LoadStorage }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using KeysLoadIt1 = typename ptx_plan::KeysLoadIt1; using KeysLoadIt2 = typename ptx_plan::KeysLoadIt2; @@ -441,8 +441,6 @@ struct SetOpAgent Size tile_output_prefix, int tile_output_count) { - using core::sync_threadblock; - int local_scatter_idx = thread_output_prefix - tile_output_prefix; # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -452,7 +450,7 @@ struct SetOpAgent shared[local_scatter_idx++] = input[ITEM]; } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < tile_output_count; item += BLOCK_THREADS) { @@ -483,8 +481,7 @@ struct SetOpAgent template void THRUST_DEVICE_FUNCTION consume_tile(Size tile_idx) { - using 
core::sync_threadblock; - using core::uninitialized_array; + using core::detail::uninitialized_array; pair partition_beg = partitions[tile_idx + 0]; pair partition_end = partitions[tile_idx + 1]; @@ -506,7 +503,7 @@ struct SetOpAgent reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc); - sync_threadblock(); + __syncthreads(); int diag_loc = min(ITEMS_PER_THREAD * threadIdx.x, num_keys1 + num_keys2); @@ -529,7 +526,7 @@ struct SetOpAgent int dst = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1; storage.load_storage.offset[dst] = value; - core::sync_threadblock(); + __syncthreads(); pair partition1_loc = thrust::make_pair( storage.load_storage.offset[threadIdx.x] >> 16, storage.load_storage.offset[threadIdx.x] & 0xFFFF); @@ -554,7 +551,7 @@ struct SetOpAgent indices, compare_op, set_op); - sync_threadblock(); + __syncthreads(); # if 0 if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2) active_mask = 0; @@ -588,7 +585,7 @@ struct SetOpAgent tile_output_prefix = prefix_cb.GetExclusivePrefix(); } - sync_threadblock(); + __syncthreads(); // scatter results // @@ -605,11 +602,11 @@ struct SetOpAgent value_type values_loc[ITEMS_PER_THREAD]; gmem_to_reg(values_loc, values1_in + keys1_beg, values2_in + keys2_beg, num_keys1, num_keys2); - sync_threadblock(); + __syncthreads(); reg_to_shared(&storage.load_storage.values_shared[0], values_loc); - sync_threadblock(); + __syncthreads(); // gather items from shared mem // @@ -622,7 +619,7 @@ struct SetOpAgent } } - sync_threadblock(); + __syncthreads(); scatter(values_out, values_loc, @@ -660,10 +657,10 @@ struct SetOpAgent std::size_t* output_count_) : storage(storage_) , tile_state(tile_state_) - , keys1_in(core::make_load_iterator(ptx_plan(), keys1_)) - , keys2_in(core::make_load_iterator(ptx_plan(), keys2_)) - , values1_in(core::make_load_iterator(ptx_plan(), values1_)) - , values2_in(core::make_load_iterator(ptx_plan(), values2_)) + , keys1_in(core::detail::make_load_iterator(ptx_plan(), keys1_)) + , 
keys2_in(core::detail::make_load_iterator(ptx_plan(), keys2_)) + , values1_in(core::detail::make_load_iterator(ptx_plan(), values1_)) + , values2_in(core::detail::make_load_iterator(ptx_plan(), values2_)) , keys1_count(keys1_count_) , keys2_count(keys2_count_) , keys_out(keys_out_) @@ -733,7 +730,7 @@ struct PartitionAgent struct PtxPlan : PtxPolicy<256> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -767,7 +764,7 @@ struct InitAgent struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -1058,8 +1055,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( cudaError_t status = cudaSuccess; - using core::AgentLauncher; - using core::AgentPlan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; using set_op_agent = AgentLauncher< SetOpAgent>; @@ -1080,13 +1077,13 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( status = ScanTileState::AllocationSize(static_cast(num_tiles), tile_agent_storage); CUDA_CUB_RET_IF_FAIL(status); - size_t vshmem_storage = core::vshmem_size(set_op_plan.shared_memory_size, num_tiles); + size_t vshmem_storage = core::detail::vshmem_size(set_op_plan.shared_memory_size, num_tiles); size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2; void* allocations[3] = {nullptr, nullptr, nullptr}; size_t allocation_sizes[3] = {tile_agent_storage, partition_agent_storage, vshmem_storage}; - status = core::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes); CUDA_CUB_RET_IF_FAIL(status); if (d_temp_storage == nullptr) @@ -1192,14 +1189,14 @@ THRUST_RUNTIME_FUNCTION pair set_operations( 
size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage"); std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h index 2c3ef85202d..7ad67fd4e0c 100644 --- a/thrust/thrust/system/cuda/detail/sort.h +++ b/thrust/thrust/system/cuda/detail/sort.h @@ -58,6 +58,8 @@ # include # include +# include + # include # if defined(_CCCL_HAS_NVFP16) @@ -277,8 +279,8 @@ THRUST_RUNTIME_FUNCTION void radix_sort(execution_policy& policy, Key* dispatch::doit(nullptr, temp_storage_bytes, keys_buffer, items_buffer, keys_count, stream); cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step"); - size_t keys_temp_storage = core::align_to(sizeof(Key) * keys_count, 128); - size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128); + size_t keys_temp_storage = ::cuda::round_up(sizeof(Key) * keys_count, 128); + size_t items_temp_storage = ::cuda::round_up(sizeof(Item) * items_count, 128); size_t storage_size = keys_temp_storage + items_temp_storage + temp_storage_bytes; diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h index ac94017758b..1d39b161866 100644 --- a/thrust/thrust/system/cuda/detail/unique.h +++ b/thrust/thrust/system/cuda/detail/unique.h @@ -123,7 +123,7 @@ struct items_per_thread 
}; template -struct Tuning +struct Tuning { const static int INPUT_SIZE = sizeof(T); enum @@ -149,16 +149,16 @@ struct UniqueAgent { using tuning = Tuning; - using ItemsLoadIt = typename core::LoadIterator::type; + using ItemsLoadIt = typename core::detail::LoadIterator::type; - using BlockLoadItems = typename core::BlockLoad::type; + using BlockLoadItems = typename core::detail::BlockLoad::type; using BlockDiscontinuityItems = cub::BlockDiscontinuity; using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; using BlockScan = cub::BlockScan; - using shared_items_t = core::uninitialized_array; + using shared_items_t = core::detail::uninitialized_array; union TempStorage { @@ -175,7 +175,7 @@ struct UniqueAgent }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using ItemsLoadIt = typename ptx_plan::ItemsLoadIt; using BlockLoadItems = typename ptx_plan::BlockLoadItems; @@ -224,8 +224,6 @@ struct UniqueAgent Size num_selections_prefix, Size /*num_selections*/) { - using core::sync_threadblock; - # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { @@ -236,14 +234,14 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { items_out[num_selections_prefix + item] = get_shared()[item]; } - sync_threadblock(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -253,8 +251,7 @@ struct UniqueAgent template Size THRUST_DEVICE_FUNCTION consume_tile_impl(int num_tile_items, int tile_idx, Size tile_base) { - using core::sync_threadblock; - using core::uninitialized_array; + using core::detail::uninitialized_array; item_type items_loc[ITEMS_PER_THREAD]; Size selection_flags[ITEMS_PER_THREAD]; @@ -270,7 +267,7 @@ struct UniqueAgent 
BlockLoadItems(temp_storage.load_items).Load(items_in + tile_base, items_loc); } - sync_threadblock(); + __syncthreads(); if (IS_FIRST_TILE) { @@ -294,7 +291,7 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); Size num_tile_selections = 0; Size num_selections = 0; @@ -337,7 +334,7 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); scatter(items_loc, selection_flags, @@ -420,7 +417,7 @@ struct UniqueAgent impl(storage, tile_state, - core::make_load_iterator(ptx_plan(), items_in), + core::detail::make_load_iterator(ptx_plan(), items_in), items_out, binary_pred, num_items, @@ -435,7 +432,7 @@ struct InitAgent template struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -463,9 +460,9 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step( Size num_items, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::get_agent_plan; using unique_agent = AgentLauncher>; @@ -473,14 +470,14 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step( using init_agent = AgentLauncher>; - using core::get_plan; + using core::detail::get_plan; typename get_plan::type init_plan = init_agent::get_plan(); typename get_plan::type unique_plan = unique_agent::get_plan(stream); int tile_size = unique_plan.items_per_tile; size_t num_tiles = ::cuda::ceil_div(num_items, tile_size); - size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles); + size_t vshmem_size = core::detail::vshmem_size(unique_plan.shared_memory_size, num_tiles); cudaError_t status = cudaSuccess; size_t allocation_sizes[2] = {0, vshmem_size}; @@ -550,14 +547,14 @@ THRUST_RUNTIME_FUNCTION ItemsOutputIt unique( void* allocations[2] = {nullptr, nullptr}; size_t 
storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "unique: failed on 1st step"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "unique: failed on 2nd step"); size_type* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[0]); From 9a27ba3ba2da14dd9b8bd22c04ea057d9a7f493b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:07:45 +0100 Subject: [PATCH 10/15] PTX: Add clusterlaunchcontrol (#3589) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 1 + .../ptx/instructions/clusterlaunchcontrol.rst | 11 ++++++ .../__ptx/instructions/clusterlaunchcontrol.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + .../ptx.clusterlaunchcontrol.compile.pass.cpp | 22 +++++++++++ 5 files changed, 72 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index f0776974eec..32db843c28d 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -7,6 +7,7 @@ PTX Instructions :maxdepth: 1 instructions/barrier_cluster + instructions/clusterlaunchcontrol instructions/cp_async_bulk instructions/cp_async_bulk_commit_group instructions/cp_async_bulk_wait_group diff --git 
a/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst b/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst new file mode 100644 index 00000000000..75fe44f6f22 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst @@ -0,0 +1,11 @@ +.. _libcudacxx-ptx-instructions-clusterlaunchcontrol: + +clusterlaunchcontrol +==================== + +- PTX ISA: + `clusterlaunchcontrol.try_cancel `__ +- PTX ISA: + `clusterlaunchcontrol.query_cancel `__ + +.. include:: generated/clusterlaunchcontrol.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h b/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h new file mode 100644 index 00000000000..b15cfddf4a0 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_ +#define _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 4798973df77..7087dd97d2a 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -70,6 +70,7 @@ */ #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp new file mode 100644 index 00000000000..212414c4535 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/clusterlaunchcontrol.h" + +int main(int, char**) +{ + return 0; +} From b1f2e63dafcb8d1379819e80375b1cd33393f449 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:10:21 +0100 Subject: [PATCH 11/15] PTX: Add st.bulk (#3604) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 1 + docs/libcudacxx/ptx/instructions/st_bulk.rst | 9 +++++ .../include/cuda/__ptx/instructions/st_bulk.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + .../cuda/ptx/ptx.st.bulk.compile.pass.cpp | 22 +++++++++++ 5 files changed, 70 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/st_bulk.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/st_bulk.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 32db843c28d..ebf6e31f716 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -24,6 +24,7 @@ PTX Instructions instructions/mbarrier_try_wait instructions/red_async instructions/st_async + instructions/st_bulk instructions/tensormap_replace instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/st_bulk.rst b/docs/libcudacxx/ptx/instructions/st_bulk.rst new file mode 100644 index 00000000000..64886598909 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/st_bulk.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-st-bulk: + +st.bulk +======= + +- PTX ISA: + `st.bulk `__ + +.. 
include:: generated/st_bulk.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h new file mode 100644 index 00000000000..686e0ecf166 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_ST_BULK_H_ +#define _CUDA_PTX_ST_BULK_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_ST_BULK_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 7087dd97d2a..db9e70ab7e6 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -86,6 +86,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp new file mode 100644 index 00000000000..951e1a9f513 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of 
libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/st_bulk.h" + +int main(int, char**) +{ + return 0; +} From afa2ca25d00fc9bd8037b3b2ca064f2c18708bfc Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:10:35 +0100 Subject: [PATCH 12/15] PTX: Add multimem instructions (#3603) * Add multimem.ld_reduce * Add multimem.red * Add multimem.st Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 3 ++ .../ptx/instructions/multimem_ld_reduce.rst | 9 +++++ .../ptx/instructions/multimem_red.rst | 9 +++++ .../ptx/instructions/multimem_st.rst | 9 +++++ .../__ptx/instructions/multimem_ld_reduce.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/multimem_red.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/multimem_st.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 3 ++ .../ptx.multimem.ld_reduce.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.multimem.red.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.multimem.st.compile.pass.cpp | 22 +++++++++++ 11 files changed, 210 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst create mode 100644 docs/libcudacxx/ptx/instructions/multimem_red.rst create mode 100644 docs/libcudacxx/ptx/instructions/multimem_st.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_red.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_st.h create mode 100644 
libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index ebf6e31f716..797e26d9911 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -22,6 +22,9 @@ PTX Instructions instructions/mbarrier_expect_tx instructions/mbarrier_test_wait instructions/mbarrier_try_wait + instructions/multimem_ld_reduce + instructions/multimem_red + instructions/multimem_st instructions/red_async instructions/st_async instructions/st_bulk diff --git a/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst b/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst new file mode 100644 index 00000000000..e9f5212131b --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-multimem-ld_reduce: + +multimem.ld_reduce +================== + +- PTX ISA: + `multimem.ld_reduce `__ + +.. include:: generated/multimem_ld_reduce.rst diff --git a/docs/libcudacxx/ptx/instructions/multimem_red.rst b/docs/libcudacxx/ptx/instructions/multimem_red.rst new file mode 100644 index 00000000000..0a6511b78d1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/multimem_red.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-multimem-red: + +multimem.red +============ + +- PTX ISA: + `multimem.red `__ + +.. include:: generated/multimem_red.rst diff --git a/docs/libcudacxx/ptx/instructions/multimem_st.rst b/docs/libcudacxx/ptx/instructions/multimem_st.rst new file mode 100644 index 00000000000..75197f440c6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/multimem_st.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-multimem-st: + +multimem.st +=========== + +- PTX ISA: + `multimem.st `__ + +.. 
include:: generated/multimem_st.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h new file mode 100644 index 00000000000..29081e6107e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MULTIMEM_LD_REDUCE_H_ +#define _CUDA_PTX_MULTIMEM_LD_REDUCE_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MULTIMEM_LD_REDUCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h new file mode 100644 index 00000000000..f0fc4e4d0e5 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MULTIMEM_RED_H_ +#define _CUDA_PTX_MULTIMEM_RED_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MULTIMEM_RED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h new file mode 100644 index 00000000000..608402f0131 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MULTIMEM_ST_H_ +#define _CUDA_PTX_MULTIMEM_ST_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MULTIMEM_ST_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index db9e70ab7e6..d11659ac6fb 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -84,6 +84,9 @@ #include #include #include +#include +#include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp new file mode 100644 index 00000000000..cbe0ba81971 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/multimem_ld_reduce.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp new file mode 100644 index 00000000000..b4aefa3b338 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/multimem_red.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp new file mode 100644 index 00000000000..4998c854382 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/multimem_st.h" + +int main(int, char**) +{ + return 0; +} From 0f52dd50c8a049372dfba62950f490813c2217ea Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:38:22 +0100 Subject: [PATCH 13/15] PTX: Add cp.async.mbarrier.arrive{.noinc} (#3602) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 1 + .../instructions/cp_async_mbarrier_arrive.rst | 10 +++++ .../instructions/cp_async_mbarrier_arrive.h | 38 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + ....cp.async.mbarrier.arrive.compile.pass.cpp | 23 +++++++++++ 5 files changed, 73 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 797e26d9911..87ccc82b5b1 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -12,6 +12,7 @@ PTX Instructions instructions/cp_async_bulk_commit_group instructions/cp_async_bulk_wait_group instructions/cp_async_bulk_tensor + instructions/cp_async_mbarrier_arrive instructions/cp_reduce_async_bulk instructions/cp_reduce_async_bulk_tensor instructions/fence diff --git a/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst new file mode 100644 index 00000000000..f2ff2ff5ee7 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst @@ -0,0 +1,10 @@ +.. 
_libcudacxx-ptx-instructions-cp-async-mbarrier-arrive: + +cp.async.mbarrier.arrive +======================== + +- PTX ISA: + `cp.async.mbarrier.arrive `__ + +.. include:: generated/cp_async_mbarrier_arrive.rst +.. include:: generated/cp_async_mbarrier_arrive_noinc.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..c19a09e2922 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h @@ -0,0 +1,38 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index d11659ac6fb..0d699b2e2ca 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp new file mode 100644 index 00000000000..97623078198 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/cp_async_mbarrier_arrive.h" +#include "generated/cp_async_mbarrier_arrive_noinc.h" + +int main(int, char**) +{ + return 0; +} From 38983ebc42de5683e212562c931aa0789c6eefe7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 16:40:27 +0100 Subject: [PATCH 14/15] PTX: Add tcgen05 instructions (#3607) * ptx: Add tcgen05.alloc * ptx: Add tcgen05.commit * ptx: Add tcgen05.cp * ptx: Add tcgen05.fence * ptx: Add tcgen05.ld * ptx: Add tcgen05.mma * ptx: Add tcgen05.mma.ws * ptx: Add tcgen05.shift * ptx: Add tcgen05.st * ptx: Add tcgen05.wait * fix docs --------- Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 10 +++++ .../ptx/instructions/tcgen05_alloc.rst | 9 +++++ .../ptx/instructions/tcgen05_commit.rst | 9 +++++ .../ptx/instructions/tcgen05_cp.rst | 9 +++++ .../ptx/instructions/tcgen05_fence.rst | 9 +++++ .../ptx/instructions/tcgen05_ld.rst | 9 +++++ .../ptx/instructions/tcgen05_mma.rst | 9 +++++ .../ptx/instructions/tcgen05_mma_ws.rst | 9 +++++ .../ptx/instructions/tcgen05_shift.rst | 9 +++++ .../ptx/instructions/tcgen05_st.rst | 9 +++++ .../ptx/instructions/tcgen05_wait.rst | 9 +++++ .../cuda/__ptx/instructions/tcgen05_alloc.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_commit.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_cp.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_fence.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_ld.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_mma.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_mma_ws.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_shift.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_st.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_wait.h | 37 
+++++++++++++++++++ libcudacxx/include/cuda/ptx | 10 +++++ .../ptx/ptx.tcgen05.alloc.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.commit.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.fence.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.mma.ws.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.shift.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.st.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.wait.compile.pass.cpp | 22 +++++++++++ 32 files changed, 700 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_commit.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_cp.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_ld.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_mma.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_shift.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_wait.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h create mode 100644 
libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 87ccc82b5b1..136dfb81fc3 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -29,6 +29,16 @@ PTX Instructions instructions/red_async instructions/st_async instructions/st_bulk + instructions/tcgen05_alloc + instructions/tcgen05_commit + instructions/tcgen05_cp + instructions/tcgen05_fence + instructions/tcgen05_ld + instructions/tcgen05_mma + instructions/tcgen05_mma_ws + instructions/tcgen05_shift + instructions/tcgen05_st + instructions/tcgen05_wait instructions/tensormap_replace instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst b/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst new file mode 100644 index 00000000000..a30f2a2560c --- /dev/null +++ 
b/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-alloc: + +tcgen05.alloc +============= + +- PTX ISA: + `tcgen05.alloc `__ + +.. include:: generated/tcgen05_alloc.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst b/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst new file mode 100644 index 00000000000..a431350dea8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-commit: + +tcgen05.commit +============== + +- PTX ISA: + `tcgen05.commit `__ + +.. include:: generated/tcgen05_commit.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst b/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst new file mode 100644 index 00000000000..5a220536d6e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-cp: + +tcgen05.cp +========== + +- PTX ISA: + `tcgen05.cp `__ + +.. include:: generated/tcgen05_cp.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst b/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst new file mode 100644 index 00000000000..6635131f707 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-fence: + +tcgen05.fence +============= + +- PTX ISA: + `tcgen05.fence `__ + +.. include:: generated/tcgen05_fence.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst b/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst new file mode 100644 index 00000000000..165b8eb935a --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-ld: + +tcgen05.ld +========== + +- PTX ISA: + `tcgen05.ld `__ + +.. 
include:: generated/tcgen05_ld.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst b/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst new file mode 100644 index 00000000000..9672ae0d0a1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-mma: + +tcgen05.mma +=========== + +- PTX ISA: + `tcgen05.mma `__ + +.. include:: generated/tcgen05_mma.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst b/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst new file mode 100644 index 00000000000..e22066298ac --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-mma-ws: + +tcgen05.mma.ws +============== + +- PTX ISA: + `tcgen05.mma.ws `__ + +.. include:: generated/tcgen05_mma_ws.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst b/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst new file mode 100644 index 00000000000..eef04ae4d5e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-shift: + +tcgen05.shift +============= + +- PTX ISA: + `tcgen05.shift `__ + +.. include:: generated/tcgen05_shift.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_st.rst b/docs/libcudacxx/ptx/instructions/tcgen05_st.rst new file mode 100644 index 00000000000..f101149481f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_st.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-st: + +tcgen05.st +========== + +- PTX ISA: + `tcgen05.st `__ + +.. include:: generated/tcgen05_st.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst b/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst new file mode 100644 index 00000000000..cb149e5c9a1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst @@ -0,0 +1,9 @@ +.. 
_libcudacxx-ptx-instructions-tcgen05-wait: + +tcgen05.wait +============ + +- PTX ISA: + `tcgen05.wait `__ + +.. include:: generated/tcgen05_wait.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h new file mode 100644 index 00000000000..743ee4306ee --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_ALLOC_H_ +#define _CUDA_PTX_TCGEN05_ALLOC_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_ALLOC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h new file mode 100644 index 00000000000..ca06ec6b97d --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_COMMIT_H_ +#define _CUDA_PTX_TCGEN05_COMMIT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_COMMIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h new file mode 100644 index 00000000000..e0c6ebf74ad --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_CP_H_ +#define _CUDA_PTX_TCGEN05_CP_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_CP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h new file mode 100644 index 00000000000..a36847cd0f3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_FENCE_H_ +#define _CUDA_PTX_TCGEN05_FENCE_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h new file mode 100644 index 00000000000..782ba20e804 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_LD_H_ +#define _CUDA_PTX_TCGEN05_LD_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_LD_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h new file mode 100644 index 00000000000..ff9d159930b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_MMA_H_ +#define _CUDA_PTX_TCGEN05_MMA_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_MMA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h new file mode 100644 index 00000000000..5d0bd5b8b5a --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_MMA_WS_H_ +#define _CUDA_PTX_TCGEN05_MMA_WS_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_MMA_WS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h new file mode 100644 index 00000000000..aab5cbe27b8 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_SHIFT_H_ +#define _CUDA_PTX_TCGEN05_SHIFT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_SHIFT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h new file mode 100644 index 00000000000..94c86614b1e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_ST_H_ +#define _CUDA_PTX_TCGEN05_ST_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h new file mode 100644 index 00000000000..1684d9afd65 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_WAIT_H_ +#define _CUDA_PTX_TCGEN05_WAIT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_WAIT_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 0d699b2e2ca..971288b456c 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -91,6 +91,16 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp new file mode 100644 index 00000000000..49f9df928e9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_alloc.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp new file mode 100644 index 00000000000..73ea1851bec --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_commit.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp new file mode 100644 index 00000000000..85ddc17efe4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_cp.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp new file mode 100644 index 00000000000..fda57b348de --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_fence.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp new file mode 100644 index 00000000000..8da8e54f18d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_ld.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp new file mode 100644 index 00000000000..098cbbfa896 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_mma.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp new file mode 100644 index 00000000000..350c964d749 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_mma_ws.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp new file mode 100644 index 00000000000..5ecfff7ff3b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_shift.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp new file mode 100644 index 00000000000..92a49224f0e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_st.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp new file mode 100644 index 00000000000..4bb3156ed12 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_wait.h" + +int main(int, char**) +{ + return 0; +} From cea61a3410fdea796154dcd9157e010659aab837 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 30 Jan 2025 16:48:09 +0100 Subject: [PATCH 15/15] Use a different implementation for `tuple_of_iterator_references` to tuple conversion (#3609) --- .../include/cuda/std/detail/libcxx/include/tuple | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple index 6ff1039e61b..47f8b16222b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple @@ -891,10 +891,19 @@ public: enable_if_t<__is_tuple_of_iterator_references<_TupleOfIteratorReferences>::value, int> = 0, 
enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(_Tp)), int> = 0> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t) - : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t).template __to_tuple<_Tp...>( - __make_tuple_indices_t())) + : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t), + typename __make_tuple_indices::type{}) {} +private: + template ::value, int> = 0> + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t, __tuple_indices<_Indices...>) + : tuple(_CUDA_VSTD::get<_Indices>(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t))...) + {} + +public: template , enable_if_t::value, int> = 0,