From ffb0b1e4f99fdfa0e70181a83fb5cb2a0f3b07d7 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:50:03 +0100 Subject: [PATCH] implement C++23 `byteswap` (#3093) --- docs/libcudacxx/standard_api.rst | 2 + libcudacxx/include/cuda/std/__bit/byteswap.h | 173 ++++++++++++++++++ libcudacxx/include/cuda/std/__cccl/builtin.h | 24 +++ libcudacxx/include/cuda/std/bit | 1 + libcudacxx/include/cuda/std/version | 2 +- .../std/numerics/bit/byteswap.pass.cpp | 143 +++++++++++++++ 6 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/include/cuda/std/__bit/byteswap.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/bit/byteswap.pass.cpp diff --git a/docs/libcudacxx/standard_api.rst b/docs/libcudacxx/standard_api.rst index 7897b5d3d1c..be806240615 100644 --- a/docs/libcudacxx/standard_api.rst +++ b/docs/libcudacxx/standard_api.rst @@ -97,6 +97,8 @@ Feature availability: - C++20 integer comparison functions in ```` are available in C++11. +- C++23 ``byteswap`` in ```` is available in C++11. + - C++23 ```` is available in C++14. - all features are available in C++14 diff --git a/libcudacxx/include/cuda/std/__bit/byteswap.h b/libcudacxx/include/cuda/std/__bit/byteswap.h new file mode 100644 index 00000000000..8bcad204338 --- /dev/null +++ b/libcudacxx/include/cuda/std/__bit/byteswap.h @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___BIT_BYTESWAP_H +#define _LIBCUDACXX___BIT_BYTESWAP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#if _CCCL_COMPILER(MSVC) +# include +#endif // _CCCL_COMPILER(MSVC) + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +class __byteswap_impl +{ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Full __impl_recursive(_Full __val) noexcept + { + static_assert(sizeof(_Full) == sizeof(_Half) * 2, "Invalid half type passed to __bytswap_impl"); + + return static_cast<_Full>(__impl(static_cast<_Half>(__val >> CHAR_BIT * sizeof(_Half)))) + | (static_cast<_Full>(__impl(static_cast<_Half>(__val))) << CHAR_BIT * sizeof(_Half)); + } + +#if _CCCL_HAS_CUDA_COMPILER + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static uint32_t __impl_device(uint32_t __val) noexcept + { + uint32_t __result; + asm("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(__result) : "r"(__val)); + return __result; + } + + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static uint64_t __impl_device(uint64_t __val) noexcept + { + uint32_t __hi; + uint32_t __lo; + asm("mov.b64 {%0, %1}, %2;" : "=r"(__hi), "=r"(__lo) : "l"(__val)); + asm("prmt.b32 %0, %0, 0, 0x0123;" : "+r"(__hi)); + asm("prmt.b32 %0, %0, 0, 0x0123;" : "+r"(__lo)); + + uint64_t __result; + asm("mov.b64 %0, {%1, %2};" : "=l"(__result) : "r"(__lo), "r"(__hi)); + + return __result; + } +#endif // _CCCL_HAS_CUDA_COMPILER + +public: + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static _CCCL_CONSTEXPR_CXX14 _Tp __impl(_Tp __val) noexcept + { + _Tp __result{}; + for (size_t __i{}; __i < sizeof(__val); ++__i) + { + __result <<= CHAR_BIT; + __result |= (__val >> (__i * CHAR_BIT)) & static_cast<_Tp>(UCHAR_MAX); + } + return __result; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr uint16_t __impl(uint16_t __val) noexcept + { +#if defined(_CCCL_BUILTIN_BSWAP16) + return _CCCL_BUILTIN_BSWAP16(__val); +#else // ^^^ _CCCL_BUILTIN_BSWAP16 ^^^ / vvv !_CCCL_BUILTIN_BSWAP16 vvv +# if _CCCL_STD_VER >= 2014 && _CCCL_COMPILER(MSVC) + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_HOST, return _byteswap_ushort(__val);) + } +# endif // _CCCL_STD_VER >= 2014 && _CCCL_COMPILER(MSVC) + return (__val << CHAR_BIT) | (__val >> CHAR_BIT); +#endif // !_CCCL_BUILTIN_BSWAP16 + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr uint32_t __impl(uint32_t __val) noexcept + { +#if defined(_CCCL_BUILTIN_BSWAP32) + return _CCCL_BUILTIN_BSWAP32(__val); +#else // ^^^ _CCCL_BUILTIN_BSWAP32 ^^^ / vvv !_CCCL_BUILTIN_BSWAP32 vvv +# if _CCCL_STD_VER >= 2014 + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { +# if _CCCL_COMPILER(MSVC) + NV_IF_TARGET(NV_IS_HOST, return _byteswap_ulong(__val);) +# endif // _CCCL_COMPILER(MSVC) + NV_IF_TARGET(NV_IS_DEVICE, return __impl_device(__val);) + } +# endif // _CCCL_STD_VER >= 2014 + return __impl_recursive(__val); +#endif // !_CCCL_BUILTIN_BSWAP32 + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr uint64_t __impl(uint64_t __val) noexcept + { +#if defined(_CCCL_BUILTIN_BSWAP64) + return _CCCL_BUILTIN_BSWAP64(__val); +#else // ^^^ _CCCL_BUILTIN_BSWAP64 ^^^ / vvv !_CCCL_BUILTIN_BSWAP64 vvv +# if _CCCL_STD_VER >= 2014 + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { +# if _CCCL_COMPILER(MSVC) + NV_IF_TARGET(NV_IS_HOST, return _byteswap_uint64(__val);) +# endif // _CCCL_COMPILER(MSVC) + NV_IF_TARGET(NV_IS_DEVICE, return __impl_device(__val);) + } +# endif // _CCCL_STD_VER >= 2014 + return __impl_recursive(__val); +#endif // !_CCCL_BUILTIN_BSWAP64 + } + +#if !defined(_LIBCUDACXX_HAS_NO_INT128) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __uint128_t __impl(__uint128_t __val) noexcept + { +# if defined(_CCCL_BUILTIN_BSWAP128) + return _CCCL_BUILTIN_BSWAP128(__val); +# else // ^^^ _CCCL_BUILTIN_BSWAP128 ^^^ / vvv !_CCCL_BUILTIN_BSWAP128 vvv + return __impl_recursive(__val); +# endif // !_CCCL_BUILTIN_BSWAP128 + } +#endif // !_LIBCUDACXX_HAS_NO_INT128 +}; + +_CCCL_TEMPLATE(class _Integer) +_CCCL_REQUIRES(_CCCL_TRAIT(is_integral, _Integer) _CCCL_AND(sizeof(_Integer) == 1)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Integer byteswap(_Integer __val) noexcept +{ + return __val; +} + +_CCCL_TEMPLATE(class _Integer) +_CCCL_REQUIRES(_CCCL_TRAIT(is_integral, _Integer) _CCCL_AND(sizeof(_Integer) > 1) + _CCCL_AND(has_single_bit(sizeof(_Integer)))) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Integer byteswap(_Integer __val) noexcept +{ + return static_cast<_Integer>(__byteswap_impl::__impl(_CUDA_VSTD::__to_unsigned_like(__val))); +} + +_CCCL_TEMPLATE(class _Integer) +_CCCL_REQUIRES(_CCCL_TRAIT(is_integral, _Integer) _CCCL_AND(sizeof(_Integer) > 1) + _CCCL_AND(!has_single_bit(sizeof(_Integer)))) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Integer byteswap(_Integer __val) noexcept +{ + return static_cast<_Integer>(__byteswap_impl::__impl(_CUDA_VSTD::__to_unsigned_like(__val))); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___BIT_BYTESWAP_H diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index 0ea34bdc55d..b19f7de4371 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -126,6 +126,30 @@ # undef _CCCL_BUILTIN_BIT_CAST #endif // clang < 10 || nvcc < 11.7 +#if _CCCL_CHECK_BUILTIN(builtin_bswap16) +# define _CCCL_BUILTIN_BSWAP16(...) __builtin_bswap16(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_bswap16) + +#if _CCCL_CHECK_BUILTIN(builtin_bswap32) +# define _CCCL_BUILTIN_BSWAP32(...) __builtin_bswap32(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_bswap32) + +#if _CCCL_CHECK_BUILTIN(builtin_bswap64) +# define _CCCL_BUILTIN_BSWAP64(...) __builtin_bswap64(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_bswap64) + +#if _CCCL_CHECK_BUILTIN(builtin_bswap128) +# define _CCCL_BUILTIN_BSWAP128(...) __builtin_bswap128(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_bswap128) + +// NVCC cannot handle builtins for bswap +#if _CCCL_CUDA_COMPILER(NVCC) +# undef _CCCL_BUILTIN_BSWAP16 +# undef _CCCL_BUILTIN_BSWAP32 +# undef _CCCL_BUILTIN_BSWAP64 +# undef _CCCL_BUILTIN_BSWAP128 +#endif // _CCCL_CUDA_COMPILER(NVCC) + #if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_COLUMN() __builtin_COLUMN() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_COLUMN) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_COLUMN) vvv diff --git a/libcudacxx/include/cuda/std/bit b/libcudacxx/include/cuda/std/bit index 72c98ff7bef..b0f33a48c22 100644 --- a/libcudacxx/include/cuda/std/bit +++ b/libcudacxx/include/cuda/std/bit @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 7f22ec2e5cf..557112fb064 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -33,6 +33,7 @@ #define __cccl_lib_bool_constant 201505L #define __cccl_lib_bounded_array_traits 201902L #define __cccl_lib_byte 201603L +#define __cccl_lib_byteswap 202110L #define __cccl_lib_clamp 201603L #define __cccl_lib_endian 201907L #define __cccl_lib_forward_like 202207L @@ -229,7 +230,6 @@ // # define __cccl_lib_allocate_at_least 202106L // # define __cccl_lib_associative_heterogeneous_erasure 202110L // # define __cccl_lib_bind_back 202202L -// # define __cccl_lib_byteswap 202110L // # define __cccl_lib_constexpr_bitset 202207L // # define __cccl_lib_constexpr_charconv 202207L // # define __cccl_lib_constexpr_cmath 202202L diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/byteswap.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/byteswap.pass.cpp new file mode 100644 index 00000000000..0016236371c --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/byteswap.pass.cpp @@ -0,0 +1,143 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +template +struct has_byteswap : cuda::std::false_type +{}; + +template +struct has_byteswap()))>> : cuda::std::true_type +{}; + +enum class Byte : cuda::std::uint8_t +{ +}; + +static_assert(!has_byteswap::value, ""); +static_assert(!has_byteswap::value, ""); +static_assert(!has_byteswap::value, ""); +static_assert(!has_byteswap::value, ""); + +template +struct MakeUnsigned +{ + using type = cuda::std::make_unsigned_t; +}; + +template <> +struct MakeUnsigned +{ + using type = bool; +}; + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_num(T in, T expected) +{ + using U = typename MakeUnsigned::type; + + assert(static_cast(cuda::std::byteswap(in)) == static_cast(expected)); + ASSERT_SAME_TYPE(decltype(cuda::std::byteswap(in)), decltype(in)); + ASSERT_NOEXCEPT(cuda::std::byteswap(in)); +} + +template +struct TestData +{ + T in; + T expected; +}; + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 TestData get_test_data() +{ + switch (sizeof(T)) + { + case 2: + return {static_cast(0x1234), static_cast(0x3412)}; + case 4: + return {static_cast(0x60AF8503), static_cast(0x0385AF60)}; + case 8: + return {static_cast(0xABCDFE9477936406), static_cast(0x0664937794FECDAB)}; + default: + assert(false); + cuda::std::unreachable(); + } +} + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 void test_implementation_defined_size() +{ + const auto test_data = get_test_data(); + test_num(test_data.in, test_data.expected); +} + +__host__ __device__ TEST_CONSTEXPR_CXX14 bool test() +{ + test_num(0xAB, 0xAB); + test_num(0xCDEF, 0xEFCD); + test_num(0x01234567, 0x67452301); + test_num(0x0123456789ABCDEF, 0xEFCDAB8967452301); + + test_num(static_cast(0xAB), static_cast(0xAB)); + test_num(static_cast(0xCDEF), static_cast(0xEFCD)); + test_num(0x01234567, 0x67452301); + // requires static_cast to silence integer conversion resulted in a change of sign warning + test_num( + static_cast(0x0123456789ABCDEF), static_cast(0xEFCDAB8967452301)); + +#if !defined(TEST_HAS_NO_INT128_T) + const auto in = static_cast<__uint128_t>(0x0123456789ABCDEF) << 64 | 0x13579BDF02468ACE; + const auto expected = static_cast<__uint128_t>(0xCE8A4602DF9B5713) << 64 | 0xEFCDAB8967452301; + test_num<__uint128_t>(in, expected); + test_num<__int128_t>(in, expected); +#endif // !defined(TEST_HAS_NO_INT128_T) + + test_num(true, true); + test_num(false, false); + test_num(static_cast(0xCD), static_cast(0xCD)); + test_num(0xEF, 0xEF); + test_num(0x45, 0x45); +#if TEST_STD_VER >= 2020 + test_num(0xAB, 0xAB); +#endif // TEST_STD_VER >= 2020 + test_num(0xABCD, 0xCDAB); + test_num(0xABCDEF01, 0x01EFCDAB); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test_implementation_defined_size(); +#endif + + test_implementation_defined_size(); + test_implementation_defined_size(); + test_implementation_defined_size(); + test_implementation_defined_size(); + test_implementation_defined_size(); + test_implementation_defined_size(); + test_implementation_defined_size(); + test_implementation_defined_size(); + return true; +} + +int main(int, char**) +{ + test(); +#if TEST_STD_VER >= 2014 + static_assert(test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +}