From e8d57c3d0074fcb59f7bc2c2b209c58a01c03be0 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 4 Oct 2024 09:13:49 +0700 Subject: [PATCH 1/9] [FEA]: Validate cuda.parallel type matching in build and execution (#2429) * Brute force experiment: Which tests fail after adding an `assert False`? * Do not include our own string.h file (#2444) That might conflict with the host standard library * Add `_dtype_validation()` in python/cuda_parallel/cuda/parallel/experimental/__init__.py and fix bug in python/cuda_parallel/tests/test_reduce_api.py * Add `test_device_reduce_dtype_mismatch()`. Capture `dtype`s only in ctor (not entire arrays). * Undo change in .gitignore * Move `min_op()` back into `test_device_reduce_success()` to unbreak sphinx documentation. Also fix existing typo. * Move `test_device_reduce_dtype_mismatch()` from test_reduce_api.py to test_reduce.py * Add TODO POINTER vs ITERATOR --------- Co-authored-by: Michael Schellenberger Costa --- .../cuda/parallel/experimental/__init__.py | 13 ++++++++++++- python/cuda_parallel/tests/test_reduce.py | 17 +++++++++++++++++ python/cuda_parallel/tests/test_reduce_api.py | 8 ++++---- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py index 4a16fc1b67a..0fa2d09df11 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py +++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py @@ -184,8 +184,16 @@ class _CCCLDeviceReduceBuildResult(ctypes.Structure): ("reduction_kernel", ctypes.c_void_p)] +def _dtype_validation(dt1, dt2): + if dt1 != dt2: + raise TypeError(f"dtype mismatch: __init__={dt1}, __call__={dt2}") + + class _Reduce: def __init__(self, d_in, d_out, op, init): + self._ctor_d_in_dtype = d_in.dtype + self._ctor_d_out_dtype = d_out.dtype + self._ctor_init_dtype = init.dtype cc_major, cc_minor = cuda.get_current_device().compute_capability cub_path, thrust_path, libcudacxx_path, cuda_include_path = _get_paths() bindings = _get_bindings() @@ -212,7 +220,10 @@ def __init__(self, d_in, d_out, op, init): raise ValueError('Error building reduce') def __call__(self, temp_storage, d_in, d_out, init): - # TODO Assert that types match the ones used in the constructor + # TODO validate POINTER vs ITERATOR when iterator support is added + _dtype_validation(self._ctor_d_in_dtype, d_in.dtype) + _dtype_validation(self._ctor_d_out_dtype, d_out.dtype) + _dtype_validation(self._ctor_init_dtype, init.dtype) bindings = _get_bindings() if temp_storage is None: temp_storage_bytes = ctypes.c_size_t() diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index 9f59f8efcec..78c14b47931 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -66,3 +66,20 @@ def op(a, b): result = d_output.copy_to_host()[0] expected = numpy.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) + + +def test_device_reduce_dtype_mismatch(): + def min_op(a, b): + return a if a < b else b + + dtypes = [numpy.int32, numpy.int64] + h_inits = [numpy.array([], dt) for dt in dtypes] + h_inputs = [numpy.array([], dt) for dt in dtypes] + d_outputs = [cuda.device_array(1, dt) for dt in dtypes] + d_inputs = [cuda.to_device(h_inp) for h_inp in h_inputs] + + reduce_into = cudax.reduce_into(d_inputs[0], d_outputs[0], min_op, h_inits[0]) + + for ix in range(3): + with pytest.raises(TypeError, match=r"^dtype 
mismatch: __init__=int32, __call__=int64$"): + reduce_into(None, d_inputs[int(ix == 0)], d_outputs[int(ix == 1)], h_inits[int(ix == 2)]) diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py index 6ed35831218..8c63364559c 100644 --- a/python/cuda_parallel/tests/test_reduce_api.py +++ b/python/cuda_parallel/tests/test_reduce_api.py @@ -13,19 +13,19 @@ def test_device_reduce(): # example-begin reduce-min - def op(a, b): + def min_op(a, b): return a if a < b else b dtype = numpy.int32 h_init = numpy.array([42], dtype) - h_input = numpy.array([8, 6, 7, 5, 3, 0, 9]) + h_input = numpy.array([8, 6, 7, 5, 3, 0, 9], dtype) d_output = cuda.device_array(1, dtype) d_input = cuda.to_device(h_input) # Instantiate reduction for the given operator and initial value - reduce_into = cudax.reduce_into(d_output, d_output, op, h_init) + reduce_into = cudax.reduce_into(d_output, d_output, min_op, h_init) - # Deterrmine temporary device storage requirements + # Determine temporary device storage requirements temp_storage_size = reduce_into(None, d_input, d_output, h_init) # Allocate temporary storage From 583567bc90a3c3df6094f2ad5d64de451fc645c5 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Fri, 4 Oct 2024 12:49:39 -0700 Subject: [PATCH 2/9] avoid gcc optimizer bug by not force inlining part of `thrust::transform` (#2509) --- thrust/thrust/system/cuda/detail/transform.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 9e1d0b2a318..1926fb62473 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -178,8 +178,10 @@ struct binary_transform_f -OutputIt THRUST_FUNCTION unary( +OutputIt _CCCL_HOST_DEVICE inline unary( Policy& policy, InputIt items, OutputIt result, @@ -200,6 +202,8 @@ OutputIt THRUST_FUNCTION unary( return result + num_items; } +// EAN 2024-10-04: when force-inlined, gcc's optimizer will generate bad code +// for this function: template -OutputIt THRUST_FUNCTION binary( +OutputIt _CCCL_HOST_DEVICE inline binary( Policy& policy, InputIt1 items1, InputIt2 items2, From c86cacae9b0e0b189c963466d68fec0fe69c0a88 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Sat, 5 Oct 2024 12:37:19 +0200 Subject: [PATCH 3/9] Cleanup and modularize `` (#2443) --- .../include/cuda/__barrier/aligned_size.h | 45 + .../__barrier/async_contract_fulfillment.h | 35 + libcudacxx/include/cuda/__barrier/barrier.h | 62 + .../cuda/__barrier/barrier_arrive_tx.h | 94 ++ .../cuda/__barrier/barrier_block_scope.h | 465 ++++++ .../cuda/__barrier/barrier_expect_tx.h | 70 + .../cuda/__barrier/barrier_native_handle.h | 37 + .../cuda/__barrier/barrier_thread_scope.h | 57 + libcudacxx/include/cuda/__fwd/barrier.h | 34 + .../cuda/__fwd/barrier_native_handle.h | 38 + libcudacxx/include/cuda/__fwd/pipeline.h | 33 + .../__memcpy_async/completion_mechanism.h | 43 + .../cp_async_bulk_shared_global.h | 56 + .../cuda/__memcpy_async/cp_async_fallback.h | 68 + .../__memcpy_async/cp_async_shared_global.h | 102 ++ .../__memcpy_async/dispatch_memcpy_async.h | 157 ++ .../__memcpy_async/is_local_smem_barrier.h | 45 + .../cuda/__memcpy_async/memcpy_async.h | 166 +++ .../__memcpy_async/memcpy_async_barrier.h | 118 ++ .../cuda/__memcpy_async/memcpy_async_tx.h | 89 ++ .../cuda/__memcpy_async/memcpy_completion.h | 168 +++ .../__memcpy_async/try_get_barrier_handle.h | 54 + libcudacxx/include/cuda/barrier | 16 + 
.../include/cuda/std/__barrier/barrier.h | 228 +++ .../cuda/std/__barrier/empty_completion.h | 33 + .../include/cuda/std/__barrier/poll_tester.h | 80 + libcudacxx/include/cuda/std/__cuda/barrier.h | 1301 ----------------- libcudacxx/include/cuda/std/barrier | 31 +- .../cuda/std/detail/libcxx/include/__config | 4 - .../detail/libcxx/include/__threading_support | 6 - .../cuda/std/detail/libcxx/include/barrier | 459 ------ 31 files changed, 2415 insertions(+), 1779 deletions(-) create mode 100644 libcudacxx/include/cuda/__barrier/aligned_size.h create mode 100644 libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_block_scope.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_expect_tx.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_native_handle.h create mode 100644 libcudacxx/include/cuda/__barrier/barrier_thread_scope.h create mode 100644 libcudacxx/include/cuda/__fwd/barrier.h create mode 100644 libcudacxx/include/cuda/__fwd/barrier_native_handle.h create mode 100644 libcudacxx/include/cuda/__fwd/pipeline.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_async.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h create mode 100644 libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h create mode 100644 libcudacxx/include/cuda/std/__barrier/barrier.h create mode 100644 libcudacxx/include/cuda/std/__barrier/empty_completion.h create mode 100644 libcudacxx/include/cuda/std/__barrier/poll_tester.h delete mode 100644 libcudacxx/include/cuda/std/__cuda/barrier.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/barrier diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h new file mode 100644 index 00000000000..f0b863af76f --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/aligned_size.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H +#define _CUDA___BARRIER_ALIGNED_SIZE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template <_CUDA_VSTD::size_t _Alignment> +struct aligned_size_t +{ + static constexpr _CUDA_VSTD::size_t align = _Alignment; + _CUDA_VSTD::size_t value; + + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s) + : value(__s) + {} + _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const + { + return value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_ALIGNED_SIZE_H diff --git a/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h b/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h new file mode 100644 index 00000000000..57e54f0b692 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H +#define _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// Type only used for logging purpose +enum async_contract_fulfillment +{ + none, + async +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H diff --git a/libcudacxx/include/cuda/__barrier/barrier.h b/libcudacxx/include/cuda/__barrier/barrier.h new file mode 100644 index 00000000000..87bbff7ba50 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier.h @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_H +#define _CUDA___BARRIER_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> +{ +public: + _CCCL_HIDE_FROM_ABI barrier() = default; + + barrier(const barrier&) = delete; + barrier& operator=(const barrier&) = delete; + + _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected, + _CompletionF __completion = _CompletionF()) + : _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion) + {} + + _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected) + { + _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); + new (__b) barrier(__expected); + } + + _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion) + { + _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); + new (__b) barrier(__expected, __completion); + } +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_BARRIER_H diff --git a/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h b/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h new file mode 100644 index 00000000000..10fe5e1452c --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h @@ -0,0 +1,94 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_BARRIER_ARRIVE_TX_H_ +#define _CUDA_PTX_BARRIER_ARRIVE_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); +_CCCL_NODISCARD _CCCL_DEVICE inline barrier::arrival_token barrier_arrive_tx( + barrier& __b, + _CUDA_VSTD::ptrdiff_t __arrive_count_update, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) +{ + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one."); + _CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1."); + _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + barrier::arrival_token __token = {}; + // On architectures pre-sm90, arrive_tx is not supported. + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. 
+ NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + + auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); + if (__arrive_count_update == 1) { + __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( + _CUDA_VPTX::sem_release, + _CUDA_VPTX::scope_cta, + _CUDA_VPTX::space_shared, + __native_handle, + __transaction_count_update); + } else { + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, + _CUDA_VPTX::scope_cta, + _CUDA_VPTX::space_shared, + __native_handle, + __arrive_count_update); + }), + (__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();)); + return __token; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX_BARRIER_ARRIVE_TX_H_ diff --git a/libcudacxx/include/cuda/__barrier/barrier_block_scope.h b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h new file mode 100644 index 00000000000..e794b7046fa --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h @@ -0,0 +1,465 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_BLOCK_SCOPE_H +#define _CUDA___BARRIER_BARRIER_BLOCK_SCOPE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#if defined(_CCCL_CUDA_COMPILER) +# include +# include +# include +#endif // _CCCL_CUDA_COMPILER +#include +#include +#include +#include +#include +#include +#include + +#include + +#if defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type*) 0)->member)) +#else // ^^^ _CCCL_COMPILER_NVRTC ^^^ / vvv !_CCCL_COMPILER_NVRTC vvv +# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !offsetof(type, member) +#endif // _CCCL_COMPILER_NVRTC + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// Needed for pipeline.arrive_on +struct __block_scope_barrier_base +{}; + +template <> +class barrier : public __block_scope_barrier_base +{ + using __barrier_base = _CUDA_VSTD::__barrier_base<_CUDA_VSTD::__empty_completion, thread_scope_block>; + __barrier_base __barrier; + + _CCCL_DEVICE friend inline _CUDA_VSTD::uint64_t* + device::_LIBCUDACXX_ABI_NAMESPACE::barrier_native_handle(barrier& b); + + template + friend class _CUDA_VSTD::__barrier_poll_tester_phase; + template + friend class _CUDA_VSTD::__barrier_poll_tester_parity; + +public: + using arrival_token = typename __barrier_base::arrival_token; + _CCCL_HIDE_FROM_ABI barrier() = default; + + barrier(const barrier&) = delete; + barrier& operator=(const barrier&) = delete; + + _LIBCUDACXX_HIDE_FROM_ABI barrier(_CUDA_VSTD::ptrdiff_t __expected, + _CUDA_VSTD::__empty_completion __completion = 
_CUDA_VSTD::__empty_completion()) + { + static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier, __barrier), + "fatal error: bad barrier layout"); + init(this, __expected, __completion); + } + + _LIBCUDACXX_HIDE_FROM_ABI ~barrier() + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (__isShared(&__barrier)) { + asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory"); + } else if (__isClusterShared(&__barrier)) { __trap(); }), + NV_PROVIDES_SM_80, + (if (__isShared(&__barrier)) { + asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory"); + })) + } + + _LIBCUDACXX_HIDE_FROM_ABI friend void init( + barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CUDA_VSTD::__empty_completion = _CUDA_VSTD::__empty_completion()) + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (__isShared(&__b->__barrier)) { + asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( + static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) + : "memory"); + } else if (__isClusterShared(&__b->__barrier)) { __trap(); } else { + new (&__b->__barrier) __barrier_base(__expected); + }), + NV_PROVIDES_SM_80, + ( + if (__isShared(&__b->__barrier)) { + asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( + static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) + : "memory"); + } else { new (&__b->__barrier) __barrier_base(__expected); }), + NV_ANY_TARGET, + (new (&__b->__barrier) __barrier_base(__expected);)) + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(_CUDA_VSTD::ptrdiff_t __update = 1) + { + _CCCL_ASSERT(__update >= 0, "Arrival count update must be non-negative."); + arrival_token __token = {}; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (!__isClusterShared(&__barrier)) { return __barrier.arrive(__update); } else if (!__isShared(&__barrier)) { + __trap(); + } + // Cannot use cuda::device::barrier_native_handle here, as it is + // only defined for block-scope barriers. This barrier may be a + // non-block scoped barrier. 
+ auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);), + NV_PROVIDES_SM_80, + ( + if (!__isShared(&__barrier)) { + return __barrier.arrive(__update); + } auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + // Need 2 instructions, can't finish barrier with arrive > 1 + if (__update > 1) { _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } __token = + _CUDA_VPTX::mbarrier_arrive(__bh);), + NV_PROVIDES_SM_70, + ( + if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } + + unsigned int __mask = __activemask(); + unsigned int __activeA = __match_any_sync(__mask, __update); + unsigned int __activeB = __match_any_sync(__mask, reinterpret_cast<_CUDA_VSTD::uintptr_t>(&__barrier)); + unsigned int __active = __activeA & __activeB; + int __inc = __popc(__active) * __update; + + unsigned __laneid; + asm("mov.u32 %0, %%laneid;" + : "=r"(__laneid)); + int __leader = __ffs(__active) - 1; + // All threads in mask synchronize here, establishing cummulativity to the __leader: + __syncwarp(__mask); + if (__leader == static_cast(__laneid)) { + __token = __barrier.arrive(__inc); + } __token = __shfl_sync(__active, __token, __leader);), + NV_IS_HOST, + (__token = __barrier.arrive(__update);)) + return __token; + } + +private: + _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_sm_80(arrival_token __token) const + { + (void) __token; + int32_t __ready = 0; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_80, + (asm volatile("{\n\t" + ".reg .pred p;\n\t" + "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) + : "memory");)) + return __ready; + } + + // Document de drop > uint32_t for __nanosec on public for APIs + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token) const + { + (void) __token; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); + } else if (!__isShared(&__barrier)) { + __trap(); + } asm volatile("{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) + : "memory"); + return __ready;), + NV_PROVIDES_SM_80, + (if (!__isShared(&__barrier)) { + return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); + } return __test_wait_sm_80(__token);), + NV_ANY_TARGET, + (return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token));)) + } + + // Document de drop > uint32_t for __nanosec on public for APIs + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token, _CUDA_VSTD::chrono::nanoseconds __nanosec) const + { + if (__nanosec.count() < 1) + { + return __try_wait(_CUDA_VSTD::move(__token)); + } + + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + int32_t __ready = 0; + if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); + } else if (!__isShared(&__barrier)) { __trap(); } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + _CUDA_VSTD::chrono::nanoseconds __elapsed; + do { + const _CUDA_VSTD::uint32_t __wait_nsec = 
static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), + "l"(__token), + "r"(__wait_nsec) + : "memory"); + __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; + } while (!__ready && (__nanosec > __elapsed)); + return __ready;), + NV_PROVIDES_SM_80, + ( + bool __ready = 0; + if (!__isShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); + } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + do { + __ready = __test_wait_sm_80(__token); + } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); + return __ready;), + NV_ANY_TARGET, + (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), + _CUDA_VSTD::chrono::nanoseconds(__nanosec));)) + } + + _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_parity_sm_80(bool __phase_parity) const + { + (void) __phase_parity; + uint16_t __ready = 0; + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_80, + (asm volatile( + "{" + ".reg .pred %%p;" + "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;" + "selp.u16 %0, 1, 0, %%p;" + "}" + : "=h"(__ready) + : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__phase_parity)) + : "memory");)) + return __ready; + } + + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity) const + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); + } else if (!__isShared(&__barrier)) { __trap(); } int32_t __ready = 0; + + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.parity.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)) + :); + + return __ready;), + NV_PROVIDES_SM_80, + (if (!__isShared(&__barrier)) { return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); } + + return __test_wait_parity_sm_80(__phase_parity);), + NV_ANY_TARGET, + (return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity);)) + } + + _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity, _CUDA_VSTD::chrono::nanoseconds __nanosec) const + { + if (__nanosec.count() < 1) + { + return __try_wait_parity(__phase_parity); + } + + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + int32_t __ready = 0; + if (!__isClusterShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); + } else if (!__isShared(&__barrier)) { __trap(); } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + _CUDA_VSTD::chrono::nanoseconds __elapsed; + do { + const _CUDA_VSTD::uint32_t __wait_nsec = static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : 
"=r"(__ready) + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)), + "r"(__wait_nsec) + : "memory"); + __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; + } while (!__ready && (__nanosec > __elapsed)); + + return __ready;), + NV_PROVIDES_SM_80, + ( + bool __ready = 0; + if (!__isShared(&__barrier)) { + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); + } + + _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = + _CUDA_VSTD::chrono::high_resolution_clock::now(); + do { + __ready = __test_wait_parity_sm_80(__phase_parity); + } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); + + return __ready;), + NV_ANY_TARGET, + (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec);)) + } + +public: + _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const + { + _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__phase))); + } + + _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __phase_parity) const + { + _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity)); + } + + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() + { + wait(arrive()); + } + + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() + { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_90, + ( + if (!__isClusterShared(&__barrier)) { return __barrier.arrive_and_drop(); } else if (!__isShared(&__barrier)) { + __trap(); + } + + asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory");), + NV_PROVIDES_SM_80, + ( + // Fallback to slowpath on device + if (!__isShared(&__barrier)) { + __barrier.arrive_and_drop(); + return; + } + + asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(&__barrier))) + : "memory");), + NV_ANY_TARGET, + ( + // Fallback to slowpath on device + __barrier.arrive_and_drop();)) + } + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept + { + return (1 << 20) - 1; + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_for(arrival_token&& __token, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) + { + auto __nanosec = _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); + + return __try_wait(_CUDA_VSTD::move(__token), __nanosec); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_until(arrival_token&& __token, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) + { + return try_wait_for(_CUDA_VSTD::move(__token), (__time - _Clock::now())); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_parity_for(bool __phase_parity, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) + { + auto __nanosec = _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); + + return __try_wait_parity(__phase_parity, __nanosec); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool + try_wait_parity_until(bool __phase_parity, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) + { + return try_wait_parity_for(__phase_parity, (__time - _Clock::now())); + } +}; + 
+_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_BARRIER_BLOCK_SCOPE_H diff --git a/libcudacxx/include/cuda/__barrier/barrier_expect_tx.h b/libcudacxx/include/cuda/__barrier/barrier_expect_tx.h new file mode 100644 index 00000000000..e86b0e2d400 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_expect_tx.h @@ -0,0 +1,70 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_BARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_BARRIER_EXPECT_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); +_CCCL_DEVICE inline void +barrier_expect_tx(barrier& __b, _CUDA_VSTD::ptrdiff_t __transaction_count_update) +{ + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + // On architectures pre-sm90, arrive_tx is not supported. 
+ NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory");), + (__cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX_BARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__barrier/barrier_native_handle.h b/libcudacxx/include/cuda/__barrier/barrier_native_handle.h new file mode 100644 index 00000000000..29879c71edf --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_native_handle.h @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_NATIVE_HANDLE_H +#define _CUDA___BARRIER_BARRIER_NATIVE_HANDLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& __b) +{ + return reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__b.__barrier); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +#endif // _CUDA___BARRIER_BARRIER_NATIVE_HANDLE_H diff --git a/libcudacxx/include/cuda/__barrier/barrier_thread_scope.h b/libcudacxx/include/cuda/__barrier/barrier_thread_scope.h new file mode 100644 index 00000000000..aa87dfa4b94 --- /dev/null +++ b/libcudacxx/include/cuda/__barrier/barrier_thread_scope.h @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_BARRIER_THREAD_SCOPE_H +#define _CUDA___BARRIER_BARRIER_THREAD_SCOPE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template <> +class barrier : private barrier +{ + using __base = barrier; + +public: + using __base::__base; + + _LIBCUDACXX_HIDE_FROM_ABI friend void + init(barrier* __b, + _CUDA_VSTD::ptrdiff_t __expected, + _CUDA_VSTD::__empty_completion __completion = _CUDA_VSTD::__empty_completion()) + { + init(static_cast<__base*>(__b), __expected, __completion); + } + + using __base::arrive; + using __base::arrive_and_drop; + using __base::arrive_and_wait; + using __base::max; + using __base::wait; +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_BARRIER_THREAD_SCOPE_H diff --git a/libcudacxx/include/cuda/__fwd/barrier.h b/libcudacxx/include/cuda/__fwd/barrier.h new file mode 100644 index 00000000000..c2bc80929f9 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/barrier.h @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_BARRIER_H +#define _CUDA___FWD_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class barrier; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FWD_BARRIER_H diff --git a/libcudacxx/include/cuda/__fwd/barrier_native_handle.h b/libcudacxx/include/cuda/__fwd/barrier_native_handle.h new file mode 100644 index 00000000000..2b024f559ff --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/barrier_native_handle.h @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_BARRIER_NATIVE_HANDLE_H +#define _CUDA___FWD_BARRIER_NATIVE_HANDLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_CCCL_NV_DIAG_SUPPRESS(821) // extern inline function was referenced but not defined + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& __b); + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +_CCCL_NV_DIAG_DEFAULT(821) + +#endif // _CUDA___FWD_BARRIER_NATIVE_HANDLE_H diff --git a/libcudacxx/include/cuda/__fwd/pipeline.h b/libcudacxx/include/cuda/__fwd/pipeline.h new file mode 100644 index 00000000000..02ec295da44 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/pipeline.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_PIPELINE_H +#define _CUDA___FWD_PIPELINE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class pipeline; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FWD_PIPELINE_H diff --git a/libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h b/libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h new file mode 100644 index 00000000000..1564e00a092 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/completion_mechanism.h @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_COMPLETION_MECHANISM_H +#define _CUDA___BARRIER_COMPLETION_MECHANISM_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief __completion_mechanism allows memcpy_async to report back what completion +//! mechanism it used. This is necessary to determine in which way to synchronize +//! the memcpy_async with a sync object (barrier or pipeline). +// +//! In addition, we use this enum to create bit flags so that calling functions +//! 
can specify which completion mechanisms can be used (__sync is always +//! allowed). +enum class __completion_mechanism +{ + __sync = 0, + __mbarrier_complete_tx = 1 << 0, // Use powers of two here to support the + __async_group = 1 << 1, // bit flag use case + __async_bulk_group = 1 << 2, +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_COMPLETION_MECHANISM_H diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h new file mode 100644 index 00000000000..94f11bf76f8 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h @@ -0,0 +1,56 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_BULK_SHARED_GLOBAL_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_BULK_SHARED_GLOBAL_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include + +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); +template +inline __device__ void __cp_async_bulk_shared_global( + const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle) +{ + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + (if (__g.thread_rank() == 0) { + _CUDA_VPTX::cp_async_bulk( + _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __src, __size, __bar_handle); + }), + (__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_BULK_SHARED_GLOBAL_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h new file mode 100644 index 00000000000..1be497829cc --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_fallback.h @@ -0,0 +1,68 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_FALLBACK_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_FALLBACK_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template <_CUDA_VSTD::size_t _Copy_size> +struct __copy_chunk +{ + _CCCL_ALIGNAS(_Copy_size) char data[_Copy_size]; +}; + +template <_CUDA_VSTD::size_t _Alignment, typename _Group> +inline _CCCL_HOST_DEVICE void +__cp_async_fallback_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) +{ + // Maximal copy size is 16 bytes + constexpr _CUDA_VSTD::size_t __copy_size = (_Alignment > 16) ? 16 : _Alignment; + + using __chunk_t = __copy_chunk<__copy_size>; + + // "Group"-strided loop over memory + const _CUDA_VSTD::size_t __stride = __g.size() * __copy_size; + + // An unroll factor of 64 ought to be enough for anybody. This unroll pragma + // is mainly intended to place an upper bound on loop unrolling. The number + // is more than high enough for the intended use case: an unroll factor of + // 64 allows moving 4 * 64 * 256 = 64kb in one unrolled loop with 256 + // threads (copying ints). On the other hand, in the unfortunate case that + // we have to move 1024 bytes / thread with char width, then we prevent + // fully unrolling the loop to 1024 copy instructions. This prevents the + // compile times from increasing unreasonably, and also has neglibible + // impact on runtime performance. + _LIBCUDACXX_PRAGMA_UNROLL(64) + for (_CUDA_VSTD::size_t __offset = __g.thread_rank() * __copy_size; __offset < __size; __offset += __stride) + { + __chunk_t tmp = *reinterpret_cast(__src + __offset); + *reinterpret_cast<__chunk_t*>(__dest + __offset) = tmp; + } +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_FALLBACK_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h new file mode 100644 index 00000000000..2266d5c96cc --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h @@ -0,0 +1,102 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) + +# include +# include +# include + +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); +template +inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) +{ + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + + // If `if constexpr` is not available, this function gets instantiated even + // if is not called. Do not static_assert in that case. +# if _CCCL_STD_VER >= 2017 + static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, + "cp.async.shared.global requires a copy size of 4, 8, or 16."); +# endif // _CCCL_STD_VER >= 2017 + + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %2;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), + "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), + "n"(_Copy_size) + : "memory");), + (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); +} + +template <> +inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src) +{ + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + // When copying 16 bytes, it is possible to skip L1 cache (.cg). + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), + "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), + "n"(16) + : "memory");), + (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); +} + +template +inline __device__ void +__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) +{ + // If `if constexpr` is not available, this function gets instantiated even + // if is not called. Do not static_assert in that case. +# if _CCCL_STD_VER >= 2017 + static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment"); +# endif // _CCCL_STD_VER >= 2017 + + // Maximal copy size is 16. + constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment; + // We use an int offset here, because we are copying to shared memory, + // which is easily addressable using int. 
+ const int __group_size = __g.size(); + const int __group_rank = __g.thread_rank(); + const int __stride = __group_size * __copy_size; + for (int __offset = __group_rank * __copy_size; __offset < static_cast(__size); __offset += __stride) + { + __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); + } +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h new file mode 100644 index 00000000000..cb8fcb69083 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h @@ -0,0 +1,157 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_DISPATCH_MEMCPY_ASYNC_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_DISPATCH_MEMCPY_ASYNC_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +/*********************************************************************** + * cuda::memcpy_async dispatch + * + * The dispatch mechanism takes all the arguments and dispatches to the + * fastest asynchronous copy mechanism available. + * + * It returns a __completion_mechanism that indicates which completion mechanism + * was used by the copy mechanism. This value can be used by the sync object to + * further synchronize if necessary. 
+ * + ***********************************************************************/ + +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_any_to_any( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions, + _CUDA_VSTD::uint64_t* __bar_handle) +{ + __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); + return __completion_mechanism::__sync; +} + +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions, + _CUDA_VSTD::uint64_t* __bar_handle) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); + (void) __can_use_complete_tx; + _CCCL_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), + "Pass non-null bar_handle if and only if can_use_complete_tx."); + _CCCL_IF_CONSTEXPR (_Align >= 16) { + if (__can_use_complete_tx && __isShared(__bar_handle)) + { + __cp_async_bulk_shared_global(__group, __dest_char, __src_char, __size, __bar_handle); + return __completion_mechanism::__mbarrier_complete_tx; + } + } + // Fallthrough to SM 80.. + )); +#endif // __cccl_ptx_isa >= 800 + + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (_CCCL_IF_CONSTEXPR (_Align >= 4) { + const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); + if (__can_use_async_group) + { + __cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size); + return __completion_mechanism::__async_group; + } + } + // Fallthrough.. + )); + + __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); + return __completion_mechanism::__sync; +} + +// __dispatch_memcpy_async is the internal entry point for dispatching to the correct memcpy_async implementation. +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions, + _CUDA_VSTD::uint64_t* __bar_handle) +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, + ( + // Dispatch based on direction of the copy: global to shared, shared to + // global, etc. + + // CUDA compilers <= 12.2 may not propagate assumptions about the state space + // of pointers correctly. Therefore, we + // 1) put the code for each copy direction in a separate function, and + // 2) make sure none of the code paths can reach each other by "falling through". + // + // See nvbug 4074679 and also PR #478. 
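+      //
+      // Only the global -> shared direction currently has a dedicated fast
+      // path; every other direction takes the synchronous fallback in
+      // __dispatch_memcpy_async_any_to_any.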
+ if (__isGlobal(__src_char) && __isShared(__dest_char)) { + return __dispatch_memcpy_async_global_to_shared<_Align>( + __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); + } else { + return __dispatch_memcpy_async_any_to_any<_Align>( + __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); + }), + ( + // Host code path: + if (__group.thread_rank() == 0) { + memcpy(__dest_char, __src_char, __size); + } return __completion_mechanism::__sync;)); +} + +template <_CUDA_VSTD::size_t _Align, typename _Group> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( + _Group const& __group, + char* __dest_char, + char const* __src_char, + _CUDA_VSTD::size_t __size, + _CUDA_VSTD::uint32_t __allowed_completions) +{ + _CCCL_ASSERT(!(__allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx)), + "Cannot allow mbarrier_complete_tx completion mechanism when not passing a barrier. "); + return __dispatch_memcpy_async<_Align>(__group, __dest_char, __src_char, __size, __allowed_completions, nullptr); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA_PTX__MEMCPY_ASYNC_DISPATCH_MEMCPY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h b/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h new file mode 100644 index 00000000000..c130d8c6736 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_IS_LOCAL_SMEM_BARRIER_H +#define _CUDA___BARRIER_IS_LOCAL_SMEM_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief __is_local_smem_barrier returns true if barrier is (1) block-scoped and (2) located in shared memory. +template +_LIBCUDACXX_HIDE_FROM_ABI bool __is_local_smem_barrier(barrier<_Sco, _CompF>& __barrier) +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return _Is_mbarrier && __isShared(&__barrier);), (return false;)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_IS_LOCAL_SMEM_BARRIER_H diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h new file mode 100644 index 00000000000..3dc74bc5d96 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h @@ -0,0 +1,166 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +/*********************************************************************** + * memcpy_async code: + * + * A call to cuda::memcpy_async(dest, src, size, barrier) can dispatch to any of + * these PTX instructions: + * + * 1. normal synchronous copy (fallback) + * 2. cp.async: shared <- global + * 3. cp.async.bulk: shared <- global + * 4. TODO: cp.async.bulk: global <- shared + * 5. TODO: cp.async.bulk: cluster <- shared + * + * Which of these options is chosen, depends on: + * + * 1. The alignment of dest, src, and size; + * 2. The direction of the copy + * 3. The current compute capability + * 4. The requested completion mechanism + * + * PTX has 3 asynchronous completion mechanisms: + * + * 1. Async group - local to a thread. Used by cp.async + * 2. Bulk async group - local to a thread. Used by cp.async.bulk (shared -> global) + * 3. mbarrier::complete_tx - shared memory barier. Used by cp.async.bulk (other directions) + * + * The code is organized as follows: + * + * 1. Asynchronous copy mechanisms that wrap the PTX instructions + * + * 2. Device memcpy_async implementation per copy direction (global to shared, + * shared to global, etc). Dispatches to fastest mechanism based on requested + * completion mechanism(s), pointer alignment, and architecture. + * + * 3. Host and device memcpy_async implementations. Host implementation is + * basically a memcpy wrapper; device implementation dispatches based on the + * direction of the copy. + * + * 4. __memcpy_async_barrier: + * a) Sets the allowed completion mechanisms based on the barrier location + * b) Calls the host or device memcpy_async implementation + * c) If necessary, synchronizes with the barrier based on the returned + * completion mechanism. + * + * 5. The public memcpy_async function overloads. Call into + * __memcpy_async_barrier. + * + ***********************************************************************/ + +/*********************************************************************** + * Asynchronous copy mechanisms: + * + * 1. cp.async.bulk: shared <- global + * 2. TODO: cp.async.bulk: cluster <- shared + * 3. TODO: cp.async.bulk: global <- shared + * 4. cp.async: shared <- global + * 5. 
normal synchronous copy (fallback) + ***********************************************************************/ + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + _Tp* __destination, + _Tp const* __source, + aligned_size_t<_Alignment> __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment +memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + _Tp* __destination, + _Tp const* __source, + _CUDA_VSTD::size_t __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + void* __destination, + void const* __source, + _CUDA_VSTD::size_t __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( + _Group const& __group, + void* __destination, + void const* __source, + aligned_size_t<_Alignment> __size, + barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment +memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) +{ + return __memcpy_async_barrier( + __single_thread_group{}, + reinterpret_cast(__destination), + reinterpret_cast(__source), + __size, + __barrier); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h new file mode 100644 index 00000000000..ed9c68ad5a3 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h @@ -0,0 +1,118 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_BARRIER_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_BARRIER_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +/*********************************************************************** + * cuda::memcpy_async dispatch helper functions + * + * - __get_size_align struct to determine the alignment from a size type. + ***********************************************************************/ + +// The __get_size_align struct provides a way to query the guaranteed +// "alignment" of a provided size. In this case, an n-byte aligned size means +// that the size is a multiple of n. +// +// Use as follows: +// static_assert(__get_size_align::align == 1) +// static_assert(__get_size_align>::align == n) + +// Default impl: always returns 1. +template +struct __get_size_align +{ + static constexpr int align = 1; +}; + +// aligned_size_t overload: return n. +template +struct __get_size_align> +{ + static constexpr int align = T::align; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct __single_thread_group +{ + _LIBCUDACXX_HIDE_FROM_ABI void sync() const {} + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t size() const + { + return 1; + }; + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t thread_rank() const + { + return 0; + }; +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier( + _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) +{ + static_assert(_CCCL_TRAIT(_CUDA_VSTD::is_trivially_copyable, _Tp), "memcpy_async requires a trivially copyable type"); + + // 1. Determine which completion mechanisms can be used with the current + // barrier. A local shared memory barrier, i.e., block-scope barrier in local + // shared memory, supports the mbarrier_complete_tx mechanism in addition to + // the async group mechanism. + _CUDA_VSTD::uint32_t __allowed_completions = + __is_local_smem_barrier(__barrier) + ? (_CUDA_VSTD::uint32_t(__completion_mechanism::__async_group) + | _CUDA_VSTD::uint32_t(__completion_mechanism::__mbarrier_complete_tx)) + : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); + + // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. + constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; + constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); + // Cast to char pointers. We don't need the type for alignment anymore and + // erasing the types reduces the number of instantiations of down-stream + // functions. + char* __dest_char = reinterpret_cast(__destination); + char const* __src_char = reinterpret_cast(__source); + + // 2. Issue actual copy instructions. + auto __bh = __try_get_barrier_handle(__barrier); + auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions, __bh); + + // 3. 
Synchronize barrier with copy instructions. + return __memcpy_completion_impl::__defer(__cm, __group, __size, __barrier); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_BARRIER_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h new file mode 100644 index 00000000000..5f242b8cf1c --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h @@ -0,0 +1,89 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_TX_H_ +#define _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) +# if __cccl_ptx_isa >= 800 + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( + _Tp* __dest, + const _Tp* __src, + ::cuda::aligned_size_t<_Alignment> __size, + ::cuda::barrier<::cuda::thread_scope_block>& __b) +{ + // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are + // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with + // memcpy_async when compiling with GCC 4.8. + // FIXME: remove the #if once GCC 4.8 is no longer supported. +# if !defined(_CCCL_COMPILER_GCC) || _GNUC_VER > 408 + static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type"); +# endif + static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); + + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); + _CCCL_ASSERT(__isGlobal(__src), "src must point to global memory."); + + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + if (__isShared(__dest) && __isGlobal(__src)) { + _CUDA_VPTX::cp_async_bulk( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __src, + static_cast(__size), + barrier_native_handle(__b)); + } else { + // memcpy_async_tx only supports copying from global to shared + // or from shared to remote cluster dsmem. To copy to remote + // dsmem, we need to arrive on a cluster-scoped barrier, which + // is not yet implemented. So we trap in this case as well. 
+ _CCCL_UNREACHABLE(); + }), + (__cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();)); + + return async_contract_fulfillment::async; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE + +# endif // __cccl_ptx_isa >= 800 +#endif // _CCCL_CUDA_COMPILER + +#endif // _CUDA_PTX__MEMCPY_ASYNC_MEMCPY_ASYNC_TX_H_ diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h new file mode 100644 index 00000000000..9d9ea265da0 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h @@ -0,0 +1,168 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MEMCPY_ASYNC_MEMCPY_COMPLETION_H +#define _CUDA___MEMCPY_ASYNC_MEMCPY_COMPLETION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_CCCL_CUDA_COMPILER) +# include +# include +#endif // _CCCL_CUDA_COMPILER + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// This struct contains functions to defer the completion of a barrier phase +// or pipeline stage until a specific memcpy_async operation *initiated by +// this thread* has completed. + +// The user is still responsible for arriving and waiting on (or otherwise +// synchronizing with) the barrier or pipeline barrier to see the results of +// copies from other threads participating in the synchronization object. +struct __memcpy_completion_impl +{ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment + __defer(__completion_mechanism __cm, + _Group const& __group, + _CUDA_VSTD::size_t __size, + barrier<::cuda::thread_scope_block>& __barrier) + { + // In principle, this is the overload for shared memory barriers. However, a + // block-scope barrier may also be located in global memory. Therefore, we + // check if the barrier is a non-smem barrier and handle that separately. + if (!__is_local_smem_barrier(__barrier)) + { + return __defer_non_smem_barrier(__cm, __group, __size, __barrier); + } + + switch (__cm) + { + case __completion_mechanism::__async_group: + // Pre-SM80, the async_group mechanism is not available. + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // Non-Blocking: unbalance barrier by 1, barrier will be + // rebalanced when all thread-local cp.async instructions + // have completed writing to shared memory. + _CUDA_VSTD::uint64_t* __bh = __try_get_barrier_handle(__barrier); + + asm volatile("cp.async.mbarrier.arrive.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( + __cvta_generic_to_shared(__bh))) + : "memory");)); + return async_contract_fulfillment::async; + case __completion_mechanism::__async_bulk_group: + // This completion mechanism should not be used with a shared + // memory barrier. 
Or at least, we do not currently envision + // bulk group to be used with shared memory barriers. + _CCCL_UNREACHABLE(); + case __completion_mechanism::__mbarrier_complete_tx: +#if __cccl_ptx_isa >= 800 + // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // Only perform the expect_tx operation with the leader thread + if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); })); +#endif // __cccl_ptx_isa >= 800 + return async_contract_fulfillment::async; + case __completion_mechanism::__sync: + // sync: In this case, we do not need to do anything. The user will have + // to issue `bar.arrive_wait();` to see the effect of the transaction. + return async_contract_fulfillment::none; + default: + // Get rid of "control reaches end of non-void function": + _CCCL_UNREACHABLE(); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer( + __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) + { + return __defer_non_smem_barrier(__cm, __group, __size, __barrier); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer_non_smem_barrier( + __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) + { + // Overload for non-smem barriers. + switch (__cm) + { + case __completion_mechanism::__async_group: + // Pre-SM80, the async_group mechanism is not available. + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // Blocking: wait for all thread-local cp.async instructions to have + // completed writing to shared memory. + asm volatile("cp.async.wait_all;" :: + : "memory");)); + return async_contract_fulfillment::async; + case __completion_mechanism::__mbarrier_complete_tx: + // Non-smem barriers do not have an mbarrier_complete_tx mechanism.. + _CCCL_UNREACHABLE(); + case __completion_mechanism::__async_bulk_group: + // This completion mechanism is currently not expected to be used with barriers. + _CCCL_UNREACHABLE(); + case __completion_mechanism::__sync: + // sync: In this case, we do not need to do anything. 
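+        // The copy was already performed synchronously, so there is nothing
+        // left to defer to the barrier.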
+ return async_contract_fulfillment::none; + default: + // Get rid of "control reaches end of non-void function": + _CCCL_UNREACHABLE(); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment + __defer(__completion_mechanism __cm, _Group const&, _CUDA_VSTD::size_t, pipeline<_Sco>&) + { + switch (__cm) + { + case __completion_mechanism::__async_group: + return async_contract_fulfillment::async; + case __completion_mechanism::__async_bulk_group: + return async_contract_fulfillment::async; + case __completion_mechanism::__mbarrier_complete_tx: + return async_contract_fulfillment::async; + case __completion_mechanism::__sync: + return async_contract_fulfillment::none; + default: + // Get rid of "control reaches end of non-void function": + _CCCL_UNREACHABLE(); + } + } +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___MEMCPY_ASYNC_MEMCPY_COMPLETION_H diff --git a/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h b/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h new file mode 100644 index 00000000000..d2207faf91d --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BARRIER_TRY_GET_BARRIER_HANDLE_H +#define _CUDA___BARRIER_TRY_GET_BARRIER_HANDLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief __try_get_barrier_handle returns barrier handle of block-scoped barriers and a nullptr otherwise. +template +_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* __try_get_barrier_handle(barrier<_Sco, _CompF>& __barrier) +{ + return nullptr; +} + +template <> +_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* +__try_get_barrier_handle<::cuda::thread_scope_block, _CUDA_VSTD::__empty_completion>( + barrier& __barrier) +{ + (void) __barrier; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, (return ::cuda::device::barrier_native_handle(__barrier);), NV_ANY_TARGET, (return nullptr;)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BARRIER_TRY_GET_BARRIER_HANDLE_H diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index d10befdad26..0d65d4bf344 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -21,6 +21,22 @@ # pragma system_header #endif // no system header +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +# error "CUDA synchronization primitives are only supported for sm_70 and up." 
+#endif // __CUDA_ARCH__ < 700 + +#ifdef _LIBCUDACXX_HAS_NO_THREADS +# error is not supported on this single threaded system +#endif // _LIBCUDACXX_HAS_NO_THREADS + +#include +#include +#include +#include +#include +#include +#include +#include #include #include diff --git a/libcudacxx/include/cuda/std/__barrier/barrier.h b/libcudacxx/include/cuda/std/__barrier/barrier.h new file mode 100644 index 00000000000..5956a49d24e --- /dev/null +++ b/libcudacxx/include/cuda/std/__barrier/barrier.h @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___BARRIER_BARRIER_H +#define __LIBCUDACXX___BARRIER_BARRIER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#if _LIBCUDACXX_CUDA_ABI_VERSION < 3 +# define _LIBCUDACXX_BARRIER_ALIGNMENTS alignas(64) +#else // ^^^ _LIBCUDACXX_CUDA_ABI_VERSION < 3 ^^^ / vvv _LIBCUDACXX_CUDA_ABI_VERSION >= 3 vvv +# define _LIBCUDACXX_BARRIER_ALIGNMENTS +#endif // _LIBCUDACXX_CUDA_ABI_VERSION >= 3 + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class __barrier_base +{ + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __expected, __arrived; + _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase; + +public: + using arrival_token = bool; + +private: + template + friend class __barrier_poll_tester_phase; + template + friend class __barrier_poll_tester_parity; + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const + { + return __phase.load(memory_order_acquire) != __old; + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const + { + return __try_wait(__parity); + } + +public: + _CCCL_HIDE_FROM_ABI __barrier_base() = default; + + _LIBCUDACXX_HIDE_FROM_ABI __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) + : __expected(__expected) + , __arrived(__expected) + , __completion(__completion) + , __phase(false) + {} + + _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; + + __barrier_base(__barrier_base const&) = delete; + __barrier_base& operator=(__barrier_base const&) = delete; + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) + { + auto const __old_phase = __phase.load(memory_order_relaxed); + auto const __result = __arrived.fetch_sub(__update, memory_order_acq_rel) - __update; + auto const __new_expected = __expected.load(memory_order_relaxed); + + _CCCL_ASSERT(__result >= 0, ""); + + if (0 == __result) + { + __completion(); + __arrived.store(__new_expected, memory_order_relaxed); + 
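+      // Flip the phase and notify all waiters; wait() polls __phase, so the
+      // release store below is what publishes completion of this phase.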
__phase.store(!__old_phase, memory_order_release); + __atomic_notify_all(&__phase.__a, __scope_to_tag<_Sco>{}); + } + return __old_phase; + } + _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __old_phase) const + { + __phase.wait(__old_phase, memory_order_acquire); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() + { + wait(arrive()); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() + { + __expected.fetch_sub(1, memory_order_relaxed); + (void) arrive(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept + { + return numeric_limits::max(); + } +}; + +template +class __barrier_base<__empty_completion, _Sco> +{ + static constexpr uint64_t __expected_unit = 1ull; + static constexpr uint64_t __arrived_unit = 1ull << 32; + static constexpr uint64_t __expected_mask = __arrived_unit - 1; + static constexpr uint64_t __phase_bit = 1ull << 63; + static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; + + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase_arrived_expected; + +public: + using arrival_token = uint64_t; + +private: + template + friend class __barrier_poll_tester_phase; + template + friend class __barrier_poll_tester_parity; + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); + template + _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); + + static _LIBCUDACXX_HIDE_FROM_ABI constexpr uint64_t __init(ptrdiff_t __count) noexcept + { +#if _CCCL_STD_VER >= 2014 + // This debug assert is not supported in C++11 due to resulting in a + // multi-statement constexpr function. + _CCCL_ASSERT(__count >= 0, "Count must be non-negative."); +#endif // _CCCL_STD_VER >= 2014 + return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_phase(uint64_t __phase) const + { + uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); + return ((__current & __phase_bit) != __phase); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const + { + return __try_wait_phase(__old & __phase_bit); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const + { + return __try_wait_phase(__parity ? 
__phase_bit : 0); + } + +public: + _CCCL_HIDE_FROM_ABI __barrier_base() = default; + + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 + __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) + : __phase_arrived_expected(__init(__count)) + { + _CCCL_ASSERT(__count >= 0, ""); + } + + _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; + + __barrier_base(__barrier_base const&) = delete; + __barrier_base& operator=(__barrier_base const&) = delete; + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) + { + auto const __inc = __arrived_unit * __update; + auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); + if ((__old ^ (__old + __inc)) & __phase_bit) + { + __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); + __phase_arrived_expected.notify_all(); + } + return __old & __phase_bit; + } + _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const + { + __libcpp_thread_poll_with_backoff(__barrier_poll_tester_phase<__barrier_base>(this, _CUDA_VSTD::move(__phase))); + } + _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __parity) const + { + __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() + { + wait(arrive()); + } + _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() + { + __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); + (void) arrive(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept + { + return numeric_limits::max(); + } +}; + +template +class barrier : public __barrier_base<_CompletionF> +{ +public: + _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) + : __barrier_base<_CompletionF>(__count, __completion) + {} +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___BARRIER_BARRIER_H diff --git a/libcudacxx/include/cuda/std/__barrier/empty_completion.h b/libcudacxx/include/cuda/std/__barrier/empty_completion.h new file mode 100644 index 00000000000..7205748ccc7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__barrier/empty_completion.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___BARRIER_EMPTY_COMPLETION_H +#define __LIBCUDACXX___BARRIER_EMPTY_COMPLETION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +struct __empty_completion +{ + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void operator()() noexcept {} +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___BARRIER_EMPTY_COMPLETION_H diff --git a/libcudacxx/include/cuda/std/__barrier/poll_tester.h b/libcudacxx/include/cuda/std/__barrier/poll_tester.h new file mode 100644 index 00000000000..6bcdb17e9ea --- /dev/null +++ b/libcudacxx/include/cuda/std/__barrier/poll_tester.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___BARRIER_POLL_TESTER_H +#define __LIBCUDACXX___BARRIER_POLL_TESTER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class __barrier_poll_tester_phase +{ + _Barrier const* __this; + typename _Barrier::arrival_token __phase; + +public: + _LIBCUDACXX_HIDE_FROM_ABI + __barrier_poll_tester_phase(_Barrier const* __this_, typename _Barrier::arrival_token&& __phase_) + : __this(__this_) + , __phase(_CUDA_VSTD::move(__phase_)) + {} + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const + { + return __this->__try_wait(__phase); + } +}; + +template +class __barrier_poll_tester_parity +{ + _Barrier const* __this; + bool __parity; + +public: + _LIBCUDACXX_HIDE_FROM_ABI __barrier_poll_tester_parity(_Barrier const* __this_, bool __parity_) + : __this(__this_) + , __parity(__parity_) + {} + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const + { + return __this->__try_wait_parity(__parity); + } +}; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool +__call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase) +{ + return __b.__try_wait(_CUDA_VSTD::move(__phase)); +} + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __call_try_wait_parity(const _Barrier& __b, bool __parity) +{ + return __b.__try_wait_parity(__parity); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___BARRIER_POLL_TESTER_H diff --git a/libcudacxx/include/cuda/std/__cuda/barrier.h b/libcudacxx/include/cuda/std/__cuda/barrier.h deleted file mode 100644 index 5f77bec44cd..00000000000 --- a/libcudacxx/include/cuda/std/__cuda/barrier.h +++ /dev/null @@ -1,1301 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX___CUDA_BARRIER_H -#define _LIBCUDACXX___CUDA_BARRIER_H - -#include - -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 -# error "CUDA synchronization primitives are only supported for sm_70 and up." -#endif - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include // _CUDA_VSTD::void_t - -#if defined(_CCCL_CUDA_COMPILER) -# include // cuda::ptx::* -#endif // _CCCL_CUDA_COMPILER - -#if defined(_CCCL_COMPILER_NVRTC) -# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type*) 0)->member)) -#else -# define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !offsetof(type, member) -#endif - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -// foward declaration required for memcpy_async, pipeline "sync" defined here -template -class pipeline; - -template <_CUDA_VSTD::size_t _Alignment> -struct aligned_size_t -{ - static constexpr _CUDA_VSTD::size_t align = _Alignment; - _CUDA_VSTD::size_t value; - _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s) - : value(__s) - {} - _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const - { - return value; - } -}; - -// Type only used for logging purpose -enum async_contract_fulfillment -{ - none, - async -}; - -// __completion_mechanism allows memcpy_async to report back what completion -// mechanism it used. This is necessary to determine in which way to synchronize -// the memcpy_async with a sync object (barrier or pipeline). -// -// In addition, we use this enum to create bit flags so that calling functions -// can specify which completion mechanisms can be used (__sync is always -// allowed). 
-enum class __completion_mechanism -{ - __sync = 0, - __mbarrier_complete_tx = 1 << 0, // Use powers of two here to support the - __async_group = 1 << 1, // bit flag use case - __async_bulk_group = 1 << 2, -}; - -template -class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> -{ -public: - _CCCL_HIDE_FROM_ABI barrier() = default; - - barrier(const barrier&) = delete; - barrier& operator=(const barrier&) = delete; - - _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected, - _CompletionF __completion = _CompletionF()) - : _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion) - {} - - _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected) - { - _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); - new (__b) barrier(__expected); - } - - _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion) - { - _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); - new (__b) barrier(__expected, __completion); - } -}; - -struct __block_scope_barrier_base -{}; - -_LIBCUDACXX_END_NAMESPACE_CUDA - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - -_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& b); - -_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -template <> -class barrier : public __block_scope_barrier_base -{ - using __barrier_base = _CUDA_VSTD::__barrier_base<_CUDA_VSTD::__empty_completion, thread_scope_block>; - __barrier_base __barrier; - - _CCCL_DEVICE friend inline _CUDA_VSTD::uint64_t* - device::_LIBCUDACXX_ABI_NAMESPACE::barrier_native_handle(barrier& b); - - template - friend class _CUDA_VSTD::__barrier_poll_tester_phase; - template - friend class _CUDA_VSTD::__barrier_poll_tester_parity; - -public: - using arrival_token = typename __barrier_base::arrival_token; - _CCCL_HIDE_FROM_ABI barrier() = default; - - barrier(const barrier&) = delete; - barrier& operator=(const barrier&) = delete; - - _LIBCUDACXX_HIDE_FROM_ABI barrier(_CUDA_VSTD::ptrdiff_t __expected, - _CUDA_VSTD::__empty_completion __completion = _CUDA_VSTD::__empty_completion()) - { - static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier, __barrier), - "fatal error: bad barrier layout"); - init(this, __expected, __completion); - } - - _LIBCUDACXX_HIDE_FROM_ABI ~barrier() - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (__isShared(&__barrier)) { - asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory"); - } else if (__isClusterShared(&__barrier)) { __trap(); }), - NV_PROVIDES_SM_80, - (if (__isShared(&__barrier)) { - asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory"); - })) - } - - _LIBCUDACXX_HIDE_FROM_ABI friend void init( - barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CUDA_VSTD::__empty_completion = _CUDA_VSTD::__empty_completion()) - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (__isShared(&__b->__barrier)) { - asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( - static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) - : "memory"); - } else if (__isClusterShared(&__b->__barrier)) { __trap(); } else { - new (&__b->__barrier) __barrier_base(__expected); - }), - NV_PROVIDES_SM_80, - ( - if 
(__isShared(&__b->__barrier)) { - asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"( - static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__expected)) - : "memory"); - } else { new (&__b->__barrier) __barrier_base(__expected); }), - NV_ANY_TARGET, - (new (&__b->__barrier) __barrier_base(__expected);)) - } - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(_CUDA_VSTD::ptrdiff_t __update = 1) - { - _CCCL_ASSERT(__update >= 0, "Arrival count update must be non-negative."); - arrival_token __token = {}; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (!__isClusterShared(&__barrier)) { return __barrier.arrive(__update); } else if (!__isShared(&__barrier)) { - __trap(); - } - // Cannot use cuda::device::barrier_native_handle here, as it is - // only defined for block-scope barriers. This barrier may be a - // non-block scoped barrier. - auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); - __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);), - NV_PROVIDES_SM_80, - ( - if (!__isShared(&__barrier)) { - return __barrier.arrive(__update); - } auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); - // Need 2 instructions, can't finish barrier with arrive > 1 - if (__update > 1) { _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } __token = - _CUDA_VPTX::mbarrier_arrive(__bh);), - NV_IS_DEVICE, - ( - if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } - - unsigned int __mask = __activemask(); - unsigned int __activeA = __match_any_sync(__mask, __update); - unsigned int __activeB = __match_any_sync(__mask, reinterpret_cast<_CUDA_VSTD::uintptr_t>(&__barrier)); - unsigned int __active = __activeA & __activeB; - int __inc = __popc(__active) * __update; - - unsigned __laneid; - asm("mov.u32 %0, %%laneid;" - : "=r"(__laneid)); - int __leader = __ffs(__active) - 1; - // All threads in mask synchronize here, establishing cummulativity to the __leader: - __syncwarp(__mask); - if (__leader == static_cast(__laneid)) { - __token = __barrier.arrive(__inc); - } __token = __shfl_sync(__active, __token, __leader);), - NV_IS_HOST, - (__token = __barrier.arrive(__update);)) - return __token; - } - -private: - _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_sm_80(arrival_token __token) const - { - (void) __token; - int32_t __ready = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_80, - (asm volatile("{\n\t" - ".reg .pred p;\n\t" - "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) - : "memory");)) - return __ready; - } - - // Document de drop > uint32_t for __nanosec on public for APIs - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token) const - { - (void) __token; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); - } else if (!__isShared(&__barrier)) { - __trap(); - } asm volatile("{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__token) - : "memory"); - return __ready;), - NV_PROVIDES_SM_80, - (if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token)); - } return 
__test_wait_sm_80(__token);), - NV_ANY_TARGET, - (return _CUDA_VSTD::__call_try_wait(__barrier, _CUDA_VSTD::move(__token));)) - } - - // Document de drop > uint32_t for __nanosec on public for APIs - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __token, _CUDA_VSTD::chrono::nanoseconds __nanosec) const - { - if (__nanosec.count() < 1) - { - return __try_wait(_CUDA_VSTD::move(__token)); - } - - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - int32_t __ready = 0; - if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); - } else if (!__isShared(&__barrier)) { __trap(); } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - _CUDA_VSTD::chrono::nanoseconds __elapsed; - do { - const _CUDA_VSTD::uint32_t __wait_nsec = static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); - asm volatile( - "{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "l"(__token), - "r"(__wait_nsec) - : "memory"); - __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; - } while (!__ready && (__nanosec > __elapsed)); - return __ready;), - NV_PROVIDES_SM_80, - ( - bool __ready = 0; - if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); - } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - do { - __ready = __test_wait_sm_80(__token); - } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); - return __ready;), - NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), - _CUDA_VSTD::chrono::nanoseconds(__nanosec));)) - } - - _LIBCUDACXX_HIDE_FROM_ABI bool __test_wait_parity_sm_80(bool __phase_parity) const - { - (void) __phase_parity; - uint16_t __ready = 0; - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_80, - (asm volatile( - "{" - ".reg .pred %%p;" - "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;" - "selp.u16 %0, 1, 0, %%p;" - "}" - : "=h"(__ready) - : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__phase_parity)) - : "memory");)) - return __ready; - } - - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity) const - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); - } else if (!__isShared(&__barrier)) { __trap(); } int32_t __ready = 0; - - asm volatile( - "{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.parity.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)) - :); - - return __ready;), - NV_PROVIDES_SM_80, - (if (!__isShared(&__barrier)) { return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity); } - - return __test_wait_parity_sm_80(__phase_parity);), - NV_ANY_TARGET, - (return _CUDA_VSTD::__call_try_wait_parity(__barrier, __phase_parity);)) - } - - 
_LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __phase_parity, _CUDA_VSTD::chrono::nanoseconds __nanosec) const - { - if (__nanosec.count() < 1) - { - return __try_wait_parity(__phase_parity); - } - - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - int32_t __ready = 0; - if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); - } else if (!__isShared(&__barrier)) { __trap(); } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - _CUDA_VSTD::chrono::nanoseconds __elapsed; - do { - const _CUDA_VSTD::uint32_t __wait_nsec = static_cast<_CUDA_VSTD::uint32_t>((__nanosec - __elapsed).count()); - asm volatile( - "{\n\t" - ".reg .pred p;\n\t" - "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__phase_parity)), - "r"(__wait_nsec) - : "memory"); - __elapsed = _CUDA_VSTD::chrono::high_resolution_clock::now() - __start; - } while (!__ready && (__nanosec > __elapsed)); - - return __ready;), - NV_PROVIDES_SM_80, - ( - bool __ready = 0; - if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); - } - - _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = - _CUDA_VSTD::chrono::high_resolution_clock::now(); - do { - __ready = __test_wait_parity_sm_80(__phase_parity); - } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); - - return __ready;), - NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec);)) - } - -public: - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const - { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__phase))); - } - - _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __phase_parity) const - { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity)); - } - - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_90, - ( - if (!__isClusterShared(&__barrier)) { return __barrier.arrive_and_drop(); } else if (!__isShared(&__barrier)) { - __trap(); - } - - asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory");), - NV_PROVIDES_SM_80, - ( - // Fallback to slowpath on device - if (!__isShared(&__barrier)) { - __barrier.arrive_and_drop(); - return; - } - - asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(&__barrier))) - : "memory");), - NV_ANY_TARGET, - ( - // Fallback to slowpath on device - __barrier.arrive_and_drop();)) - } - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept - { - return (1 << 20) - 1; - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_for(arrival_token&& __token, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) - { - auto __nanosec = 
_CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); - - return __try_wait(_CUDA_VSTD::move(__token), __nanosec); - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_until(arrival_token&& __token, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) - { - return try_wait_for(_CUDA_VSTD::move(__token), (__time - _Clock::now())); - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_parity_for(bool __phase_parity, const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __dur) - { - auto __nanosec = _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__dur); - - return __try_wait_parity(__phase_parity, __nanosec); - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool - try_wait_parity_until(bool __phase_parity, const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time) - { - return try_wait_parity_for(__phase_parity, (__time - _Clock::now())); - } -}; - -_LIBCUDACXX_END_NAMESPACE_CUDA - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - -_CCCL_DEVICE inline _CUDA_VSTD::uint64_t* barrier_native_handle(barrier& b) -{ - return reinterpret_cast<_CUDA_VSTD::uint64_t*>(&b.__barrier); -} - -#if defined(_CCCL_CUDA_COMPILER) - -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); -_CCCL_NODISCARD _CCCL_DEVICE inline barrier::arrival_token barrier_arrive_tx( - barrier& __b, - _CUDA_VSTD::ptrdiff_t __arrive_count_update, - _CUDA_VSTD::ptrdiff_t __transaction_count_update) -{ - _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one."); - _CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1."); - _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - barrier::arrival_token __token = {}; - // On architectures pre-sm90, arrive_tx is not supported. - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. 
- NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - - auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); - if (__arrive_count_update == 1) { - __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( - _CUDA_VPTX::sem_release, - _CUDA_VPTX::scope_cta, - _CUDA_VPTX::space_shared, - __native_handle, - __transaction_count_update); - } else { - asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); - __token = _CUDA_VPTX::mbarrier_arrive( - _CUDA_VPTX::sem_release, - _CUDA_VPTX::scope_cta, - _CUDA_VPTX::space_shared, - __native_handle, - __arrive_count_update); - }), - (__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();)); - return __token; -} - -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__(); -_CCCL_DEVICE inline void -barrier_expect_tx(barrier& __b, _CUDA_VSTD::ptrdiff_t __transaction_count_update) -{ - _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object - _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); - - // We do not check for the statespace of the barrier here. This is - // on purpose. This allows debugging tools like memcheck/racecheck - // to detect that we are passing a pointer with the wrong state - // space to mbarrier.arrive. If we checked for the state space here, - // and __trap() if wrong, then those tools would not be able to help - // us in release builds. In debug builds, the error would be caught - // by the asserts at the top of this function. - // On architectures pre-sm90, arrive_tx is not supported. - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); - asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory");), - (__cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();)); -} - -extern "C" _CCCL_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( - _Tp* __dest, - const _Tp* __src, - ::cuda::aligned_size_t<_Alignment> __size, - ::cuda::barrier<::cuda::thread_scope_block>& __b) -{ - // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are - // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with - // memcpy_async when compiling with GCC 4.8. - // FIXME: remove the #if once GCC 4.8 is no longer supported. 
-# if !defined(_CCCL_COMPILER_GCC) || _GNUC_VER > 408 - static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type"); -# endif - static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); - - _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); - _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); - _CCCL_ASSERT(__isGlobal(__src), "src must point to global memory."); - - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - if (__isShared(__dest) && __isGlobal(__src)) { - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_cluster, - _CUDA_VPTX::space_global, - __dest, - __src, - static_cast(__size), - barrier_native_handle(__b)); - } else { - // memcpy_async_tx only supports copying from global to shared - // or from shared to remote cluster dsmem. To copy to remote - // dsmem, we need to arrive on a cluster-scoped barrier, which - // is not yet implemented. So we trap in this case as well. - _CCCL_UNREACHABLE(); - }), - (__cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();)); - - return async_contract_fulfillment::async; -} -# endif // __cccl_ptx_isa >= 800 -#endif // _CCCL_CUDA_COMPILER - -_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE - -#if defined(_CCCL_CUDA_COMPILER) - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -template <> -class barrier : private barrier -{ - using __base = barrier; - -public: - using __base::__base; - - _LIBCUDACXX_HIDE_FROM_ABI friend void - init(barrier* __b, - _CUDA_VSTD::ptrdiff_t __expected, - _CUDA_VSTD::__empty_completion __completion = _CUDA_VSTD::__empty_completion()) - { - init(static_cast<__base*>(__b), __expected, __completion); - } - - using __base::arrive; - using __base::arrive_and_drop; - using __base::arrive_and_wait; - using __base::max; - using __base::wait; -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __unused(_Ty...) -{ - return true; -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __unused(_Ty&) -{ - return true; -} - -// __is_local_smem_barrier returns true if barrier is (1) block-scoped and (2) located in shared memory. -template ::value> -_LIBCUDACXX_HIDE_FROM_ABI bool __is_local_smem_barrier(barrier<_Sco, _CompF>& __barrier) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return _Is_mbarrier && __isShared(&__barrier);), (return false;)); -} - -// __try_get_barrier_handle returns barrier handle of block-scoped barriers and a nullptr otherwise. -template -_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* __try_get_barrier_handle(barrier<_Sco, _CompF>& __barrier) -{ - return nullptr; -} - -template <> -_LIBCUDACXX_HIDE_FROM_ABI _CUDA_VSTD::uint64_t* -__try_get_barrier_handle<::cuda::thread_scope_block, _CUDA_VSTD::__empty_completion>( - barrier<::cuda::thread_scope_block>& __barrier) -{ - (void) __barrier; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, (return ::cuda::device::barrier_native_handle(__barrier);), NV_ANY_TARGET, (return nullptr;)); -} - -// This struct contains functions to defer the completion of a barrier phase -// or pipeline stage until a specific memcpy_async operation *initiated by -// this thread* has completed. - -// The user is still responsible for arriving and waiting on (or otherwise -// synchronizing with) the barrier or pipeline barrier to see the results of -// copies from other threads participating in the synchronization object. 
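// ---------------------------------------------------------------------------
// Illustration (usage sketch, not part of this header): how the device-side API
// above (barrier_arrive_tx, memcpy_async_tx) is typically driven, assuming
// sm_90 and roughly following the pattern of the libcudacxx documentation.
// The kernel and array names (example_kernel, gmem_x, smem_x) are placeholders.

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

__device__ alignas(16) int gmem_x[2048];

__global__ void example_kernel()
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int smem_x[1024];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    // One thread kicks off the bulk copy and tells the barrier how many bytes to expect.
    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
  }
  else
  {
    token = bar.arrive(1);
  }
  // Every thread must still wait on the barrier to observe the copied data.
  bar.wait(cuda::std::move(token));
  smem_x[threadIdx.x] += 1;
}
// ---------------------------------------------------------------------------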
-struct __memcpy_completion_impl -{ - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment - __defer(__completion_mechanism __cm, - _Group const& __group, - _CUDA_VSTD::size_t __size, - barrier<::cuda::thread_scope_block>& __barrier) - { - // In principle, this is the overload for shared memory barriers. However, a - // block-scope barrier may also be located in global memory. Therefore, we - // check if the barrier is a non-smem barrier and handle that separately. - if (!__is_local_smem_barrier(__barrier)) - { - return __defer_non_smem_barrier(__cm, __group, __size, __barrier); - } - - switch (__cm) - { - case __completion_mechanism::__async_group: - // Pre-SM80, the async_group mechanism is not available. - NV_IF_TARGET( - NV_PROVIDES_SM_80, - ( - // Non-Blocking: unbalance barrier by 1, barrier will be - // rebalanced when all thread-local cp.async instructions - // have completed writing to shared memory. - _CUDA_VSTD::uint64_t* __bh = __try_get_barrier_handle(__barrier); - - asm volatile("cp.async.mbarrier.arrive.shared.b64 [%0];" ::"r"(static_cast<_CUDA_VSTD::uint32_t>( - __cvta_generic_to_shared(__bh))) - : "memory");)); - return async_contract_fulfillment::async; - case __completion_mechanism::__async_bulk_group: - // This completion mechanism should not be used with a shared - // memory barrier. Or at least, we do not currently envision - // bulk group to be used with shared memory barriers. - _CCCL_UNREACHABLE(); - case __completion_mechanism::__mbarrier_complete_tx: -# if __cccl_ptx_isa >= 800 - // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available. - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // Only perform the expect_tx operation with the leader thread - if (__group.thread_rank() == 0) { ::cuda::device::barrier_expect_tx(__barrier, __size); })); -# endif // __cccl_ptx_isa >= 800 - return async_contract_fulfillment::async; - case __completion_mechanism::__sync: - // sync: In this case, we do not need to do anything. The user will have - // to issue `bar.arrive_wait();` to see the effect of the transaction. - return async_contract_fulfillment::none; - default: - // Get rid of "control reaches end of non-void function": - _CCCL_UNREACHABLE(); - } - } - - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer( - __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) - { - return __defer_non_smem_barrier(__cm, __group, __size, __barrier); - } - - template - _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment __defer_non_smem_barrier( - __completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) - { - // Overload for non-smem barriers. - - switch (__cm) - { - case __completion_mechanism::__async_group: - // Pre-SM80, the async_group mechanism is not available. - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // Blocking: wait for all thread-local cp.async instructions to have - // completed writing to shared memory. - asm volatile("cp.async.wait_all;" :: - : "memory");)); - return async_contract_fulfillment::async; - case __completion_mechanism::__mbarrier_complete_tx: - // Non-smem barriers do not have an mbarrier_complete_tx mechanism.. - _CCCL_UNREACHABLE(); - case __completion_mechanism::__async_bulk_group: - // This completion mechanism is currently not expected to be used with barriers. 
- _CCCL_UNREACHABLE(); - case __completion_mechanism::__sync: - // sync: In this case, we do not need to do anything. - return async_contract_fulfillment::none; - default: - // Get rid of "control reaches end of non-void function": - _CCCL_UNREACHABLE(); - } - } - - template - _LIBCUDACXX_HIDE_FROM_ABI static async_contract_fulfillment - __defer(__completion_mechanism __cm, _Group const& __group, _CUDA_VSTD::size_t __size, pipeline<_Sco>& __pipeline) - { - // pipeline does not sync on memcpy_async, defeat pipeline purpose otherwise - __unused(__pipeline); - __unused(__size); - __unused(__group); - - switch (__cm) - { - case __completion_mechanism::__async_group: - return async_contract_fulfillment::async; - case __completion_mechanism::__async_bulk_group: - return async_contract_fulfillment::async; - case __completion_mechanism::__mbarrier_complete_tx: - return async_contract_fulfillment::async; - case __completion_mechanism::__sync: - return async_contract_fulfillment::none; - default: - // Get rid of "control reaches end of non-void function": - _CCCL_UNREACHABLE(); - } - } -}; - -/*********************************************************************** - * memcpy_async code: - * - * A call to cuda::memcpy_async(dest, src, size, barrier) can dispatch to any of - * these PTX instructions: - * - * 1. normal synchronous copy (fallback) - * 2. cp.async: shared <- global - * 3. cp.async.bulk: shared <- global - * 4. TODO: cp.async.bulk: global <- shared - * 5. TODO: cp.async.bulk: cluster <- shared - * - * Which of these options is chosen, depends on: - * - * 1. The alignment of dest, src, and size; - * 2. The direction of the copy - * 3. The current compute capability - * 4. The requested completion mechanism - * - * PTX has 3 asynchronous completion mechanisms: - * - * 1. Async group - local to a thread. Used by cp.async - * 2. Bulk async group - local to a thread. Used by cp.async.bulk (shared -> global) - * 3. mbarrier::complete_tx - shared memory barier. Used by cp.async.bulk (other directions) - * - * The code is organized as follows: - * - * 1. Asynchronous copy mechanisms that wrap the PTX instructions - * - * 2. Device memcpy_async implementation per copy direction (global to shared, - * shared to global, etc). Dispatches to fastest mechanism based on requested - * completion mechanism(s), pointer alignment, and architecture. - * - * 3. Host and device memcpy_async implementations. Host implementation is - * basically a memcpy wrapper; device implementation dispatches based on the - * direction of the copy. - * - * 4. __memcpy_async_barrier: - * a) Sets the allowed completion mechanisms based on the barrier location - * b) Calls the host or device memcpy_async implementation - * c) If necessary, synchronizes with the barrier based on the returned - * completion mechanism. - * - * 5. The public memcpy_async function overloads. Call into - * __memcpy_async_barrier. - * - ***********************************************************************/ - -/*********************************************************************** - * Asynchronous copy mechanisms: - * - * 1. cp.async.bulk: shared <- global - * 2. TODO: cp.async.bulk: cluster <- shared - * 3. TODO: cp.async.bulk: global <- shared - * 4. cp.async: shared <- global - * 5. 
normal synchronous copy (fallback) - ***********************************************************************/ - -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); -template -inline __device__ void -__cp_async_bulk_shared_global(const _Group& __g, char* __dest, const char* __src, size_t __size, uint64_t* __bar_handle) -{ - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, - (if (__g.thread_rank() == 0) { - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __src, __size, __bar_handle); - }), - (__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); -template -inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) -{ - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async - - // If `if constexpr` is not available, this function gets instantiated even - // if is not called. Do not static_assert in that case. -# if _CCCL_STD_VER >= 2017 - static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, - "cp.async.shared.global requires a copy size of 4, 8, or 16."); -# endif // _CCCL_STD_VER >= 2017 - - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %2;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), - "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), - "n"(_Copy_size) - : "memory");), - (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); -} - -template <> -inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src) -{ - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async - // When copying 16 bytes, it is possible to skip L1 cache (.cg). - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" - : - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))), - "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))), - "n"(16) - : "memory");), - (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();)); -} - -template -inline __device__ void -__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) -{ - // If `if constexpr` is not available, this function gets instantiated even - // if is not called. Do not static_assert in that case. -# if _CCCL_STD_VER >= 2017 - static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment"); -# endif // _CCCL_STD_VER >= 2017 - - // Maximal copy size is 16. - constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment; - // We use an int offset here, because we are copying to shared memory, - // which is easily addressable using int. 
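// For intuition (example values added for illustration, not from this header):
// the loop below is a plain group-strided copy. With a 128-thread group and
// 16-byte alignment, __copy_size is 16, so on the first iteration thread r
// issues a cp.async for bytes [16*r, 16*r + 16) and then advances by
// __stride = 128 * 16 = 2048 bytes until __offset runs past __size.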
- const int __group_size = __g.size(); - const int __group_rank = __g.thread_rank(); - const int __stride = __group_size * __copy_size; - for (int __offset = __group_rank * __copy_size; __offset < static_cast(__size); __offset += __stride) - { - __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset); - } -} - -template -struct __copy_chunk -{ - _CCCL_ALIGNAS(_Copy_size) char data[_Copy_size]; -}; - -template -inline __host__ __device__ void -__cp_async_fallback_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) -{ - // Maximal copy size is 16 bytes - constexpr _CUDA_VSTD::size_t __copy_size = (_Alignment > 16) ? 16 : _Alignment; - using __chunk_t = __copy_chunk<__copy_size>; - - // "Group"-strided loop over memory - const size_t __stride = __g.size() * __copy_size; - - // An unroll factor of 64 ought to be enough for anybody. This unroll pragma - // is mainly intended to place an upper bound on loop unrolling. The number - // is more than high enough for the intended use case: an unroll factor of - // 64 allows moving 4 * 64 * 256 = 64kb in one unrolled loop with 256 - // threads (copying ints). On the other hand, in the unfortunate case that - // we have to move 1024 bytes / thread with char width, then we prevent - // fully unrolling the loop to 1024 copy instructions. This prevents the - // compile times from increasing unreasonably, and also has neglibible - // impact on runtime performance. - _LIBCUDACXX_PRAGMA_UNROLL(64) - for (_CUDA_VSTD::size_t __offset = __g.thread_rank() * __copy_size; __offset < __size; __offset += __stride) - { - __chunk_t tmp = *reinterpret_cast(__src + __offset); - *reinterpret_cast<__chunk_t*>(__dest + __offset) = tmp; - } -} - -/*********************************************************************** - * cuda::memcpy_async dispatch helper functions - * - * - __get_size_align struct to determine the alignment from a size type. - ***********************************************************************/ - -// The __get_size_align struct provides a way to query the guaranteed -// "alignment" of a provided size. In this case, an n-byte aligned size means -// that the size is a multiple of n. -// -// Use as follows: -// static_assert(__get_size_align::align == 1) -// static_assert(__get_size_align>::align == n) - -// Default impl: always returns 1. -template -struct __get_size_align -{ - static constexpr int align = 1; -}; - -// aligned_size_t overload: return n. -template -struct __get_size_align> -{ - static constexpr int align = T::align; -}; - -/*********************************************************************** - * cuda::memcpy_async dispatch - * - * The dispatch mechanism takes all the arguments and dispatches to the - * fastest asynchronous copy mechanism available. - * - * It returns a __completion_mechanism that indicates which completion mechanism - * was used by the copy mechanism. This value can be used by the sync object to - * further synchronize if necessary. 
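 * Illustration (not from this header; `block`, `smem_dest`, `gmem_src`,
 * `num_bytes`, `bar` are placeholder names): besides the pointers, the main
 * user-visible input to this dispatch is the size type. A plain size only
 * lets the dispatch assume alignof(T); wrapping it in cuda::aligned_size_t
 * additionally promises that the pointers and the size are aligned, which is
 * what makes the 16-byte cp.async.cg / cp.async.bulk paths eligible:
 *
 *   // int data, plain size: alignment deduced as alignof(int) == 4
 *   cuda::memcpy_async(block, smem_dest, gmem_src, num_bytes, bar);
 *
 *   // caller guarantees 16-byte alignment of pointers and size
 *   cuda::memcpy_async(block, smem_dest, gmem_src, cuda::aligned_size_t<16>(num_bytes), bar);
 *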
- * - ***********************************************************************/ - -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_any_to_any( - _Group const& __group, - char* __dest_char, - char const* __src_char, - _CUDA_VSTD::size_t __size, - uint32_t __allowed_completions, - uint64_t* __bar_handle) -{ - __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); - return __completion_mechanism::__sync; -} - -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _CCCL_DEVICE inline __completion_mechanism __dispatch_memcpy_async_global_to_shared( - _Group const& __group, - char* __dest_char, - char const* __src_char, - _CUDA_VSTD::size_t __size, - uint32_t __allowed_completions, - uint64_t* __bar_handle) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - (const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx); - _LIBCUDACXX_UNUSED_VAR(__can_use_complete_tx); - _CCCL_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), - "Pass non-null bar_handle if and only if can_use_complete_tx."); - _CCCL_IF_CONSTEXPR (_Align >= 16) { - if (__can_use_complete_tx && __isShared(__bar_handle)) - { - __cp_async_bulk_shared_global(__group, __dest_char, __src_char, __size, __bar_handle); - return __completion_mechanism::__mbarrier_complete_tx; - } - } - // Fallthrough to SM 80.. - )); -# endif // __cccl_ptx_isa >= 800 - - NV_IF_TARGET( - NV_PROVIDES_SM_80, - (_CCCL_IF_CONSTEXPR (_Align >= 4) { - const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); - if (__can_use_async_group) - { - __cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size); - return __completion_mechanism::__async_group; - } - } - // Fallthrough.. - )); - - __cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); - return __completion_mechanism::__sync; -} - -// __dispatch_memcpy_async is the internal entry point for dispatching to the correct memcpy_async implementation. -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( - _Group const& __group, - char* __dest_char, - char const* __src_char, - size_t __size, - _CUDA_VSTD::uint32_t __allowed_completions, - uint64_t* __bar_handle) -{ - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, - ( - // Dispatch based on direction of the copy: global to shared, shared to - // global, etc. - - // CUDA compilers <= 12.2 may not propagate assumptions about the state space - // of pointers correctly. Therefore, we - // 1) put the code for each copy direction in a separate function, and - // 2) make sure none of the code paths can reach each other by "falling through". - // - // See nvbug 4074679 and also PR #478. 
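// Summary added for illustration: the global-to-shared helper above tries, in
// order, cp.async.bulk (sm_90, 16-byte alignment, shared-memory barrier
// handle), then cp.async (sm_80, at least 4-byte alignment, async-group
// completion), and finally the synchronous element-wise fallback; the
// any-to-any helper always uses the synchronous fallback. The branch below
// picks between them based on the state spaces of the two pointers.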
- if (__isGlobal(__src_char) && __isShared(__dest_char)) { - return __dispatch_memcpy_async_global_to_shared<_Align>( - __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); - } else { - return __dispatch_memcpy_async_any_to_any<_Align>( - __group, __dest_char, __src_char, __size, __allowed_completions, __bar_handle); - }), - ( - // Host code path: - if (__group.thread_rank() == 0) { - memcpy(__dest_char, __src_char, __size); - } return __completion_mechanism::__sync;)); -} - -template <_CUDA_VSTD::size_t _Align, typename _Group> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memcpy_async( - _Group const& __group, - char* __dest_char, - char const* __src_char, - _CUDA_VSTD::size_t __size, - _CUDA_VSTD::uint32_t __allowed_completions) -{ - _CCCL_ASSERT(!(__allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx)), - "Cannot allow mbarrier_complete_tx completion mechanism when not passing a barrier. "); - return __dispatch_memcpy_async<_Align>(__group, __dest_char, __src_char, __size, __allowed_completions, nullptr); -} - -//////////////////////////////////////////////////////////////////////////////// - -struct __single_thread_group -{ - _LIBCUDACXX_HIDE_FROM_ABI void sync() const {} - _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t size() const - { - return 1; - }; - _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::size_t thread_rank() const - { - return 0; - }; -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier( - _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) -{ - static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type"); - - // 1. Determine which completion mechanisms can be used with the current - // barrier. A local shared memory barrier, i.e., block-scope barrier in local - // shared memory, supports the mbarrier_complete_tx mechanism in addition to - // the async group mechanism. - _CUDA_VSTD::uint32_t __allowed_completions = - __is_local_smem_barrier(__barrier) - ? (_CUDA_VSTD::uint32_t(__completion_mechanism::__async_group) - | _CUDA_VSTD::uint32_t(__completion_mechanism::__mbarrier_complete_tx)) - : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); - - // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; - constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); - // Cast to char pointers. We don't need the type for alignment anymore and - // erasing the types reduces the number of instantiations of down-stream - // functions. - char* __dest_char = reinterpret_cast(__destination); - char const* __src_char = reinterpret_cast(__source); - - // 2. Issue actual copy instructions. - auto __bh = __try_get_barrier_handle(__barrier); - auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions, __bh); - - // 3. Synchronize barrier with copy instructions. 
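// Illustration (not from this header; block, smem, gmem, bytes and bar are
// placeholder names): the async_contract_fulfillment produced by the __defer
// call below surfaces to users as the return value of cuda::memcpy_async, e.g.
//
//   auto acf = cuda::memcpy_async(block, smem, gmem, cuda::aligned_size_t<16>(bytes), bar);
//   bar.arrive_and_wait(); // required either way; acf only records whether the copy was issued asynchronously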
- return __memcpy_completion_impl::__defer(__cm, __group, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - _Tp* __destination, - _Tp const* __source, - aligned_size_t<_Alignment> __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment -memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - _Tp* __destination, - _Tp const* __source, - _CUDA_VSTD::size_t __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - void* __destination, - void const* __source, - _CUDA_VSTD::size_t __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier( - __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( - _Group const& __group, - void* __destination, - void const* __source, - aligned_size_t<_Alignment> __size, - barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier( - __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment -memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) -{ - return __memcpy_async_barrier( - __single_thread_group{}, - reinterpret_cast(__destination), - reinterpret_cast(__source), - __size, - __barrier); -} - -_LIBCUDACXX_END_NAMESPACE_CUDA - -#endif // _CCCL_CUDA_COMPILER - -#endif // _LIBCUDACXX___CUDA_BARRIER_H diff --git a/libcudacxx/include/cuda/std/barrier b/libcudacxx/include/cuda/std/barrier index 3eb61978768..f5b7f2c07e6 100644 --- a/libcudacxx/include/cuda/std/barrier +++ b/libcudacxx/include/cuda/std/barrier @@ -11,10 +11,6 @@ #ifndef _CUDA_STD_BARRIER #define _CUDA_STD_BARRIER -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 -# error "CUDA synchronization primitives are only supported for sm_70 and up." -#endif - #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ -25,10 +21,27 @@ # pragma system_header #endif // no system header -_CCCL_PUSH_MACROS - -#include - -_CCCL_POP_MACROS +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +# error "CUDA synchronization primitives are only supported for sm_70 and up." +#endif // __CUDA_ARCH__ < 700 + +#ifdef _LIBCUDACXX_HAS_NO_THREADS +# error is not supported on this single threaded system +#endif // _LIBCUDACXX_HAS_NO_THREADS + +#include +#include +#include + +//! 
TODO: Drop cuda only features +#include +#include +#include +#include +#include +#include +#include +#include +#include #endif // _CUDA_STD_BARRIER diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index 69610aae6b7..ee90af58e63 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -538,10 +538,6 @@ typedef __char32_t char32_t; # define _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE # endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -# ifndef _LIBCUDACXX_HAS_NO_TREE_BARRIER -# define _LIBCUDACXX_HAS_NO_TREE_BARRIER -# endif // _LIBCUDACXX_HAS_NO_TREE_BARRIER - # ifndef _LIBCUDACXX_HAS_NO_WCHAR_H # define _LIBCUDACXX_HAS_NO_WCHAR_H # endif // _LIBCUDACXX_HAS_NO_WCHAR_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support b/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support index a56bfa94b12..5240ff7702e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support @@ -685,12 +685,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __libcpp_contention_t* __libcpp_contention_state(void # endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -# if !defined(_LIBCUDACXX_HAS_NO_TREE_BARRIER) && !defined(_LIBCUDACXX_HAS_NO_THREAD_FAVORITE_BARRIER_INDEX) - -_CCCL_VISIBILITY_DEFAULT extern thread_local ptrdiff_t __libcpp_thread_favorite_barrier_index; - -# endif - # ifndef __cuda_std__ class _CCCL_TYPE_VISIBILITY_DEFAULT thread; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier deleted file mode 100644 index d7b3cda99af..00000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier +++ /dev/null @@ -1,459 +0,0 @@ -// -*- C++ -*- -//===--------------------------- barrier ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_BARRIER -#define _LIBCUDACXX_BARRIER - -/* - barrier synopsis - -namespace std -{ - - template - class barrier - { - public: - using arrival_token = see below; - - constexpr explicit barrier(ptrdiff_t phase_count, - CompletionFunction f = CompletionFunction()); - ~barrier(); - - barrier(const barrier&) = delete; - barrier& operator=(const barrier&) = delete; - - [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); - void wait(arrival_token&& arrival) const; - - void arrive_and_wait(); - void arrive_and_drop(); - - private: - CompletionFunction __completion; // exposition only - }; - -} - -*/ - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include - -_CCCL_PUSH_MACROS - -#ifdef _LIBCUDACXX_HAS_NO_THREADS -# error is not supported on this single threaded system -#endif - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -struct __empty_completion -{ - _LIBCUDACXX_HIDE_FROM_ABI void operator()() noexcept {} -}; - -#ifndef _LIBCUDACXX_HAS_NO_TREE_BARRIER - -template -class alignas(64) __barrier_base -{ - ptrdiff_t __expected; - __atomic_impl __expected_adjustment; - _CompletionF __completion; - - using __phase_t = uint8_t; - __atomic_impl<__phase_t, _Sco> __phase; - - struct alignas(64) __state_t - { - struct - { - __atomic_impl<__phase_t, _Sco> __phase = LIBCUDACXX_ATOMIC_VAR_INIT(0); - } __tickets[64]; - }; - ::std::vector<__state_t> __state; - - _LIBCUDACXX_HIDE_FROM_ABI bool __arrive(__phase_t const __old_phase) - { - __phase_t const __half_step = __old_phase + 1, __full_step = __old_phase + 2; -# ifndef _LIBCUDACXX_HAS_NO_THREAD_FAVORITE_BARRIER_INDEX - ptrdiff_t __current = __libcpp_thread_favorite_barrier_index, -# else - ptrdiff_t __current = 0, -# endif - __current_expected = __expected, __last_node = (__current_expected >> 1); - for (size_t __round = 0;; ++__round) - { - _CCCL_ASSERT(__round <= 63, ""); - if (__current_expected == 1) - { - return true; - } - for (;; ++__current) - { -# ifndef _LIBCUDACXX_HAS_NO_THREAD_FAVORITE_BARRIER_INDEX - if (0 == __round) - { - if (__current >= __current_expected) - { - __current = 0; - } - __libcpp_thread_favorite_barrier_index = __current; - } -# endif - _CCCL_ASSERT(__current <= __last_node, ""); - __phase_t expect = __old_phase; - if (__current == __last_node && (__current_expected & 1)) - { - if (__state[__current].__tickets[__round].__phase.compare_exchange_strong( - expect, __full_step, memory_order_acq_rel)) - { - break; // I'm 1 in 1, go to next __round - } - _CCCL_ASSERT(expect == __full_step, ""); - } - else if (__state[__current].__tickets[__round].__phase.compare_exchange_strong( - expect, __half_step, memory_order_acq_rel)) - { - return false; // I'm 1 in 2, done with arrival - } - else if (expect == __half_step) - { - if (__state[__current].__tickets[__round].__phase.compare_exchange_strong( - expect, __full_step, memory_order_acq_rel)) - { - break; // I'm 2 in 2, go to next __round - } - _CCCL_ASSERT(expect == __full_step, ""); - } - _CCCL_ASSERT(__round == 0 && expect == __full_step, ""); - } - __current_expected = (__current_expected >> 1) + (__current_expected & 1); - __current &= ~(1 << __round); - __last_node &= ~(1 
<< __round); - } - } - -public: - using arrival_token = __phase_t; - - _LIBCUDACXX_HIDE_FROM_ABI __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) - : __expected(__expected) - , __expected_adjustment(0) - , __completion(__completion) - , __phase(0) - , __state((__expected + 1) >> 1) - { - _CCCL_ASSERT(__expected >= 0, ""); - } - - _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; - - __barrier_base(__barrier_base const&) = delete; - __barrier_base& operator=(__barrier_base const&) = delete; - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t update = 1) - { - _CCCL_ASSERT(update > 0, ""); - auto __old_phase = __phase.load(memory_order_relaxed); - for (; update; --update) - { - if (__arrive(__old_phase)) - { - __completion(); - __expected += __expected_adjustment.load(memory_order_relaxed); - __expected_adjustment.store(0, memory_order_relaxed); - __phase.store(__old_phase + 2, memory_order_release); - } - } - return __old_phase; - } - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __old_phase) const - { - __libcpp_thread_poll_with_backoff([=]() -> bool { - return __phase.load(memory_order_acquire) != __old_phase; - }); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - __expected_adjustment.fetch_sub(1, memory_order_relaxed); - (void) arrive(); - } -}; - -#else - -# if _LIBCUDACXX_CUDA_ABI_VERSION < 3 -# define _LIBCUDACXX_BARRIER_ALIGNMENTS alignas(64) -# else -# define _LIBCUDACXX_BARRIER_ALIGNMENTS -# endif - -template -class __barrier_poll_tester_phase -{ - _Barrier const* __this; - typename _Barrier::arrival_token __phase; - -public: - _LIBCUDACXX_HIDE_FROM_ABI - __barrier_poll_tester_phase(_Barrier const* __this_, typename _Barrier::arrival_token&& __phase_) - : __this(__this_) - , __phase(_CUDA_VSTD::move(__phase_)) - {} - - _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const - { - return __this->__try_wait(__phase); - } -}; - -template -class __barrier_poll_tester_parity -{ - _Barrier const* __this; - bool __parity; - -public: - _LIBCUDACXX_HIDE_FROM_ABI __barrier_poll_tester_parity(_Barrier const* __this_, bool __parity_) - : __this(__this_) - , __parity(__parity_) - {} - - _LIBCUDACXX_HIDE_FROM_ABI bool operator()() const - { - return __this->__try_wait_parity(__parity); - } -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase) -{ - return __b.__try_wait(_CUDA_VSTD::move(__phase)); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI bool __call_try_wait_parity(const _Barrier& __b, bool __parity) -{ - return __b.__try_wait_parity(__parity); -} - -template -class __barrier_base -{ - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __expected, __arrived; - _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase; - -public: - using arrival_token = bool; - -private: - template - friend class __barrier_poll_tester_phase; - template - friend class __barrier_poll_tester_parity; - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); - - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const - { - return __phase.load(memory_order_acquire) != __old; - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const - { - 
return __try_wait(__parity); - } - -public: - _CCCL_HIDE_FROM_ABI __barrier_base() = default; - - _LIBCUDACXX_HIDE_FROM_ABI __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) - : __expected(__expected) - , __arrived(__expected) - , __completion(__completion) - , __phase(false) - {} - - _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; - - __barrier_base(__barrier_base const&) = delete; - __barrier_base& operator=(__barrier_base const&) = delete; - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) - { - auto const __old_phase = __phase.load(memory_order_relaxed); - auto const __result = __arrived.fetch_sub(__update, memory_order_acq_rel) - __update; - auto const __new_expected = __expected.load(memory_order_relaxed); - - _CCCL_ASSERT(__result >= 0, ""); - - if (0 == __result) - { - __completion(); - __arrived.store(__new_expected, memory_order_relaxed); - __phase.store(!__old_phase, memory_order_release); - __atomic_notify_all(&__phase.__a, __scope_to_tag<_Sco>{}); - } - return __old_phase; - } - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __old_phase) const - { - __phase.wait(__old_phase, memory_order_acquire); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - __expected.fetch_sub(1, memory_order_relaxed); - (void) arrive(); - } - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept - { - return numeric_limits::max(); - } -}; - -template -class __barrier_base<__empty_completion, _Sco> -{ - static constexpr uint64_t __expected_unit = 1ull; - static constexpr uint64_t __arrived_unit = 1ull << 32; - static constexpr uint64_t __expected_mask = __arrived_unit - 1; - static constexpr uint64_t __phase_bit = 1ull << 63; - static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; - - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase_arrived_expected; - -public: - using arrival_token = uint64_t; - -private: - template - friend class __barrier_poll_tester_phase; - template - friend class __barrier_poll_tester_parity; - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait(const _Barrier& __b, typename _Barrier::arrival_token&& __phase); - template - _LIBCUDACXX_HIDE_FROM_ABI friend bool __call_try_wait_parity(const _Barrier& __b, bool __parity); - - static _LIBCUDACXX_HIDE_FROM_ABI constexpr uint64_t __init(ptrdiff_t __count) noexcept - { -# if _CCCL_STD_VER > 2011 - // This debug assert is not supported in C++11 due to resulting in a - // multi-statement constexpr function. - _CCCL_ASSERT(__count >= 0, "Count must be non-negative."); -# endif // _CCCL_STD_VER > 2011 - return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_phase(uint64_t __phase) const - { - uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); - return ((__current & __phase_bit) != __phase); - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait(arrival_token __old) const - { - return __try_wait_phase(__old & __phase_bit); - } - _LIBCUDACXX_HIDE_FROM_ABI bool __try_wait_parity(bool __parity) const - { - return __try_wait_phase(__parity ? 
__phase_bit : 0); - } - -public: - _CCCL_HIDE_FROM_ABI __barrier_base() = default; - - _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 - __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) - : __phase_arrived_expected(__init(__count)) - { - _CCCL_ASSERT(__count >= 0, ""); - } - - _CCCL_HIDE_FROM_ABI ~__barrier_base() = default; - - __barrier_base(__barrier_base const&) = delete; - __barrier_base& operator=(__barrier_base const&) = delete; - - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) - { - auto const __inc = __arrived_unit * __update; - auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); - if ((__old ^ (__old + __inc)) & __phase_bit) - { - __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); - __phase_arrived_expected.notify_all(); - } - return __old & __phase_bit; - } - _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const - { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester_phase<__barrier_base>(this, _CUDA_VSTD::move(__phase))); - } - _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __parity) const - { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() - { - wait(arrive()); - } - _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_drop() - { - __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); - (void) arrive(); - } - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr ptrdiff_t max() noexcept - { - return numeric_limits::max(); - } -}; - -#endif //_LIBCUDACXX_HAS_NO_TREE_BARRIER - -template -class barrier : public __barrier_base<_CompletionF> -{ -public: - _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) - : __barrier_base<_CompletionF>(__count, __completion) - {} -}; - -_LIBCUDACXX_END_NAMESPACE_STD - -#include - -_CCCL_POP_MACROS - -#endif //_LIBCUDACXX_BARRIER From 8aaeb2959b77fd8e27172737b21dd9247904b511 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Mon, 7 Oct 2024 18:37:22 -0400 Subject: [PATCH 4/9] Consolidate header testing infra. 
(#2460) --- CMakeLists.txt | 1 + cmake/CCCLGenerateHeaderTests.cmake | 111 ++++++++++++++++++ .../header_test.in => cmake/header_test.cu.in | 59 +++++----- cub/cmake/CubHeaderTesting.cmake | 24 ++-- cub/cmake/header_test.in | 72 ------------ cudax/cmake/cudaxHeaderTesting.cmake | 38 +++--- cudax/cmake/header_test.in.cu | 20 +++- thrust/cmake/ThrustHeaderTesting.cmake | 94 +++++++-------- 8 files changed, 226 insertions(+), 193 deletions(-) create mode 100644 cmake/CCCLGenerateHeaderTests.cmake rename thrust/cmake/header_test.in => cmake/header_test.cu.in (52%) delete mode 100644 cub/cmake/header_test.in diff --git a/CMakeLists.txt b/CMakeLists.txt index f87ad146174..5e9f68c14eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,7 @@ if (CCCL_TOPLEVEL_PROJECT) include(cmake/CCCLBuildCompilerTargets.cmake) include(cmake/CCCLClangdCompileInfo.cmake) include(cmake/CCCLConfigureTarget.cmake) + include(cmake/CCCLGenerateHeaderTests.cmake) include(cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() diff --git a/cmake/CCCLGenerateHeaderTests.cmake b/cmake/CCCLGenerateHeaderTests.cmake new file mode 100644 index 00000000000..e483b194513 --- /dev/null +++ b/cmake/CCCLGenerateHeaderTests.cmake @@ -0,0 +1,111 @@ +# Usage: +# cccl_generate_header_tests( +# [cccl_configure_target options] +# [LANGUAGE ] +# [HEADER_TEMPLATE