Skip to content

Commit

Permalink
implement C++23 byteswap (#3093)
Browse files Browse the repository at this point in the history
  • Loading branch information
davebayer authored Dec 16, 2024
1 parent c80fce9 commit ffb0b1e
Show file tree
Hide file tree
Showing 6 changed files with 344 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/libcudacxx/standard_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ Feature availability:

- C++20 integer comparison functions in ``<utility>`` are available in C++11.

- C++23 ``byteswap`` in ``<bit>`` is available in C++11.

- C++23 ``<expected>`` is available in C++14.

- all features are available in C++14
Expand Down
173 changes: 173 additions & 0 deletions libcudacxx/include/cuda/std/__bit/byteswap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX___BIT_BYTESWAP_H
#define _LIBCUDACXX___BIT_BYTESWAP_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__bit/has_single_bit.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/climits>
#include <cuda/std/cstdint>

#if _CCCL_COMPILER(MSVC)
# include <intrin.h>
#endif // _CCCL_COMPILER(MSVC)

_LIBCUDACXX_BEGIN_NAMESPACE_STD

class __byteswap_impl
{
template <class _Half, class _Full>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Full __impl_recursive(_Full __val) noexcept
{
static_assert(sizeof(_Full) == sizeof(_Half) * 2, "Invalid half type passed to __bytswap_impl");

return static_cast<_Full>(__impl(static_cast<_Half>(__val >> CHAR_BIT * sizeof(_Half))))
| (static_cast<_Full>(__impl(static_cast<_Half>(__val))) << CHAR_BIT * sizeof(_Half));
}

#if _CCCL_HAS_CUDA_COMPILER
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static uint32_t __impl_device(uint32_t __val) noexcept
{
uint32_t __result;
asm("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(__result) : "r"(__val));
return __result;
}

_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static uint64_t __impl_device(uint64_t __val) noexcept
{
uint32_t __hi;
uint32_t __lo;
asm("mov.b64 {%0, %1}, %2;" : "=r"(__hi), "=r"(__lo) : "l"(__val));
asm("prmt.b32 %0, %0, 0, 0x0123;" : "+r"(__hi));
asm("prmt.b32 %0, %0, 0, 0x0123;" : "+r"(__lo));

uint64_t __result;
asm("mov.b64 %0, {%1, %2};" : "=l"(__result) : "r"(__lo), "r"(__hi));

return __result;
}
#endif // _CCCL_HAS_CUDA_COMPILER

public:
template <class _Tp>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static _CCCL_CONSTEXPR_CXX14 _Tp __impl(_Tp __val) noexcept
{
_Tp __result{};
for (size_t __i{}; __i < sizeof(__val); ++__i)
{
__result <<= CHAR_BIT;
__result |= (__val >> (__i * CHAR_BIT)) & static_cast<_Tp>(UCHAR_MAX);
}
return __result;
}

_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr uint16_t __impl(uint16_t __val) noexcept
{
#if defined(_CCCL_BUILTIN_BSWAP16)
return _CCCL_BUILTIN_BSWAP16(__val);
#else // ^^^ _CCCL_BUILTIN_BSWAP16 ^^^ / vvv !_CCCL_BUILTIN_BSWAP16 vvv
# if _CCCL_STD_VER >= 2014 && _CCCL_COMPILER(MSVC)
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
NV_IF_TARGET(NV_IS_HOST, return _byteswap_ushort(__val);)
}
# endif // _CCCL_STD_VER >= 2014 && _CCCL_COMPILER(MSVC)
return (__val << CHAR_BIT) | (__val >> CHAR_BIT);
#endif // !_CCCL_BUILTIN_BSWAP16
}

_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr uint32_t __impl(uint32_t __val) noexcept
{
#if defined(_CCCL_BUILTIN_BSWAP32)
return _CCCL_BUILTIN_BSWAP32(__val);
#else // ^^^ _CCCL_BUILTIN_BSWAP32 ^^^ / vvv !_CCCL_BUILTIN_BSWAP32 vvv
# if _CCCL_STD_VER >= 2014
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
# if _CCCL_COMPILER(MSVC)
NV_IF_TARGET(NV_IS_HOST, return _byteswap_ulong(__val);)
# endif // _CCCL_COMPILER(MSVC)
NV_IF_TARGET(NV_IS_DEVICE, return __impl_device(__val);)
}
# endif // _CCCL_STD_VER >= 2014
return __impl_recursive<uint16_t>(__val);
#endif // !_CCCL_BUILTIN_BSWAP32
}

_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr uint64_t __impl(uint64_t __val) noexcept
{
#if defined(_CCCL_BUILTIN_BSWAP64)
return _CCCL_BUILTIN_BSWAP64(__val);
#else // ^^^ _CCCL_BUILTIN_BSWAP64 ^^^ / vvv !_CCCL_BUILTIN_BSWAP64 vvv
# if _CCCL_STD_VER >= 2014
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
# if _CCCL_COMPILER(MSVC)
NV_IF_TARGET(NV_IS_HOST, return _byteswap_uint64(__val);)
# endif // _CCCL_COMPILER(MSVC)
NV_IF_TARGET(NV_IS_DEVICE, return __impl_device(__val);)
}
# endif // _CCCL_STD_VER >= 2014
return __impl_recursive<uint32_t>(__val);
#endif // !_CCCL_BUILTIN_BSWAP64
}

#if !defined(_LIBCUDACXX_HAS_NO_INT128)
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __uint128_t __impl(__uint128_t __val) noexcept
{
# if defined(_CCCL_BUILTIN_BSWAP128)
return _CCCL_BUILTIN_BSWAP128(__val);
# else // ^^^ _CCCL_BUILTIN_BSWAP128 ^^^ / vvv !_CCCL_BUILTIN_BSWAP128 vvv
return __impl_recursive<uint64_t>(__val);
# endif // !_CCCL_BUILTIN_BSWAP128
}
#endif // !_LIBCUDACXX_HAS_NO_INT128
};

_CCCL_TEMPLATE(class _Integer)
_CCCL_REQUIRES(_CCCL_TRAIT(is_integral, _Integer) _CCCL_AND(sizeof(_Integer) == 1))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Integer byteswap(_Integer __val) noexcept
{
return __val;
}

_CCCL_TEMPLATE(class _Integer)
_CCCL_REQUIRES(_CCCL_TRAIT(is_integral, _Integer) _CCCL_AND(sizeof(_Integer) > 1)
_CCCL_AND(has_single_bit(sizeof(_Integer))))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Integer byteswap(_Integer __val) noexcept
{
return static_cast<_Integer>(__byteswap_impl::__impl(_CUDA_VSTD::__to_unsigned_like(__val)));
}

_CCCL_TEMPLATE(class _Integer)
_CCCL_REQUIRES(_CCCL_TRAIT(is_integral, _Integer) _CCCL_AND(sizeof(_Integer) > 1)
_CCCL_AND(!has_single_bit(sizeof(_Integer))))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Integer byteswap(_Integer __val) noexcept
{
return static_cast<_Integer>(__byteswap_impl::__impl(_CUDA_VSTD::__to_unsigned_like(__val)));
}

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX___BIT_BYTESWAP_H
24 changes: 24 additions & 0 deletions libcudacxx/include/cuda/std/__cccl/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,30 @@
# undef _CCCL_BUILTIN_BIT_CAST
#endif // clang < 10 || nvcc < 11.7

#if _CCCL_CHECK_BUILTIN(builtin_bswap16)
# define _CCCL_BUILTIN_BSWAP16(...) __builtin_bswap16(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_bswap16)

#if _CCCL_CHECK_BUILTIN(builtin_bswap32)
# define _CCCL_BUILTIN_BSWAP32(...) __builtin_bswap32(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_bswap32)

#if _CCCL_CHECK_BUILTIN(builtin_bswap64)
# define _CCCL_BUILTIN_BSWAP64(...) __builtin_bswap64(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_bswap64)

#if _CCCL_CHECK_BUILTIN(builtin_bswap128)
# define _CCCL_BUILTIN_BSWAP128(...) __builtin_bswap128(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_bswap128)

// NVCC cannot handle builtins for bswap
#if _CCCL_CUDA_COMPILER(NVCC)
# undef _CCCL_BUILTIN_BSWAP16
# undef _CCCL_BUILTIN_BSWAP32
# undef _CCCL_BUILTIN_BSWAP64
# undef _CCCL_BUILTIN_BSWAP128
#endif // _CCCL_CUDA_COMPILER(NVCC)

#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || _CCCL_COMPILER(MSVC, >=, 19, 27)
# define _CCCL_BUILTIN_COLUMN() __builtin_COLUMN()
#else // ^^^ _CCCL_HAS_BUILTIN(__builtin_COLUMN) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_COLUMN) vvv
Expand Down
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/bit
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#include <cuda/std/__bit/bit_cast.h>
#include <cuda/std/__bit/byteswap.h>
#include <cuda/std/__bit/countl.h>
#include <cuda/std/__bit/countr.h>
#include <cuda/std/__bit/endian.h>
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/include/cuda/std/version
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#define __cccl_lib_bool_constant 201505L
#define __cccl_lib_bounded_array_traits 201902L
#define __cccl_lib_byte 201603L
#define __cccl_lib_byteswap 202110L
#define __cccl_lib_clamp 201603L
#define __cccl_lib_endian 201907L
#define __cccl_lib_forward_like 202207L
Expand Down Expand Up @@ -229,7 +230,6 @@
// # define __cccl_lib_allocate_at_least 202106L
// # define __cccl_lib_associative_heterogeneous_erasure 202110L
// # define __cccl_lib_bind_back 202202L
// # define __cccl_lib_byteswap 202110L
// # define __cccl_lib_constexpr_bitset 202207L
// # define __cccl_lib_constexpr_charconv 202207L
// # define __cccl_lib_constexpr_cmath 202202L
Expand Down
143 changes: 143 additions & 0 deletions libcudacxx/test/libcudacxx/std/numerics/bit/byteswap.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/std/bit>
#include <cuda/std/cassert>
#include <cuda/std/cstddef>
#include <cuda/std/cstdint>
#include <cuda/std/utility>

#include "test_macros.h"

template <class T, class = void>
struct has_byteswap : cuda::std::false_type
{};

template <class T>
struct has_byteswap<T, cuda::std::void_t<decltype(cuda::std::byteswap(cuda::std::declval<T>()))>> : cuda::std::true_type
{};

enum class Byte : cuda::std::uint8_t
{
};

static_assert(!has_byteswap<void*>::value, "");
static_assert(!has_byteswap<float>::value, "");
static_assert(!has_byteswap<char[2]>::value, "");
static_assert(!has_byteswap<Byte>::value, "");

template <class T>
struct MakeUnsigned
{
using type = cuda::std::make_unsigned_t<T>;
};

template <>
struct MakeUnsigned<bool>
{
using type = bool;
};

template <class T>
__host__ __device__ TEST_CONSTEXPR_CXX14 void test_num(T in, T expected)
{
using U = typename MakeUnsigned<T>::type;

assert(static_cast<U>(cuda::std::byteswap(in)) == static_cast<U>(expected));
ASSERT_SAME_TYPE(decltype(cuda::std::byteswap(in)), decltype(in));
ASSERT_NOEXCEPT(cuda::std::byteswap(in));
}

template <class T>
struct TestData
{
T in;
T expected;
};

template <class T>
__host__ __device__ TEST_CONSTEXPR_CXX14 TestData<T> get_test_data()
{
switch (sizeof(T))
{
case 2:
return {static_cast<T>(0x1234), static_cast<T>(0x3412)};
case 4:
return {static_cast<T>(0x60AF8503), static_cast<T>(0x0385AF60)};
case 8:
return {static_cast<T>(0xABCDFE9477936406), static_cast<T>(0x0664937794FECDAB)};
default:
assert(false);
cuda::std::unreachable();
}
}

template <class T>
__host__ __device__ TEST_CONSTEXPR_CXX14 void test_implementation_defined_size()
{
const auto test_data = get_test_data<T>();
test_num<T>(test_data.in, test_data.expected);
}

__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
{
test_num<cuda::std::uint8_t>(0xAB, 0xAB);
test_num<cuda::std::uint16_t>(0xCDEF, 0xEFCD);
test_num<cuda::std::uint32_t>(0x01234567, 0x67452301);
test_num<cuda::std::uint64_t>(0x0123456789ABCDEF, 0xEFCDAB8967452301);

test_num<cuda::std::int8_t>(static_cast<cuda::std::int8_t>(0xAB), static_cast<cuda::std::int8_t>(0xAB));
test_num<cuda::std::int16_t>(static_cast<cuda::std::int16_t>(0xCDEF), static_cast<cuda::std::int16_t>(0xEFCD));
test_num<cuda::std::int32_t>(0x01234567, 0x67452301);
// requires static_cast to silence integer conversion resulted in a change of sign warning
test_num<cuda::std::int64_t>(
static_cast<cuda::std::int64_t>(0x0123456789ABCDEF), static_cast<cuda::std::int64_t>(0xEFCDAB8967452301));

#if !defined(TEST_HAS_NO_INT128_T)
const auto in = static_cast<__uint128_t>(0x0123456789ABCDEF) << 64 | 0x13579BDF02468ACE;
const auto expected = static_cast<__uint128_t>(0xCE8A4602DF9B5713) << 64 | 0xEFCDAB8967452301;
test_num<__uint128_t>(in, expected);
test_num<__int128_t>(in, expected);
#endif // !defined(TEST_HAS_NO_INT128_T)

test_num<bool>(true, true);
test_num<bool>(false, false);
test_num<char>(static_cast<char>(0xCD), static_cast<char>(0xCD));
test_num<unsigned char>(0xEF, 0xEF);
test_num<signed char>(0x45, 0x45);
#if TEST_STD_VER >= 2020
test_num<char8_t>(0xAB, 0xAB);
#endif // TEST_STD_VER >= 2020
test_num<char16_t>(0xABCD, 0xCDAB);
test_num<char32_t>(0xABCDEF01, 0x01EFCDAB);
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
test_implementation_defined_size<wchar_t>();
#endif

test_implementation_defined_size<short>();
test_implementation_defined_size<unsigned short>();
test_implementation_defined_size<int>();
test_implementation_defined_size<unsigned int>();
test_implementation_defined_size<long>();
test_implementation_defined_size<unsigned long>();
test_implementation_defined_size<long long>();
test_implementation_defined_size<unsigned long long>();
return true;
}

int main(int, char**)
{
test();
#if TEST_STD_VER >= 2014
static_assert(test(), "");
#endif // TEST_STD_VER >= 2014

return 0;
}

0 comments on commit ffb0b1e

Please sign in to comment.