PTX: Add cuda::ptx:barrier_cluster_{arrive,wait} (#1366)

ahendriksen · web-flow · commit 2f09e3d287db · 2024-02-27T15:41:24.000Z
* Add `cuda::ptx::barrier.cluster.{arrive,wait}`

* Add note about .aligned variants
diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md
@@ -876,7 +876,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(
 |-----------------------|-------------------------|
 | [`bar, barrier`]      | No                      |
 | [`bar.warp.sync`]     | No                      |
-| [`barrier.cluster`]   | No                      |
+| [`barrier.cluster`]   | CTK-FUTURE, CCCL v2.4.0 |
 | [`membar`]            | No                      |
 | [`fence`]             | CTK-FUTURE, CCCL v2.4.0 |
 | [`atom`]              | No                      |
@@ -892,7 +892,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(
 
 [`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
 [`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
-[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
+[`barrier.cluster`]: #barriercluster
 [`membar`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
 [`fence`]: #fence
 [`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
@@ -906,6 +906,52 @@ __device__ static inline void cp_async_bulk_wait_group_read(
 [`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
 [`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
 
+
+#### `barrier.cluster`
+
+- PTX ISA: [`barrier.cluster`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+
+Similar functionality is provided through the builtins
+`__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(),
+__cluster_barrier_wait()`, as well as the `cooperative_groups::cluster_group`
+[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cluster-group).
+
+The `.aligned` variants of the instructions are not exposed.
+
+**barrier_cluster**:
+```cuda
+// barrier.cluster.arrive; // PTX ISA 78, SM_90
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_arrive();
+
+// barrier.cluster.wait; // PTX ISA 78, SM_90
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_wait();
+
+// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
+// .sem       = { .release }
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_arrive(
+  cuda::ptx::sem_release_t);
+
+// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
+// .sem       = { .relaxed }
+// Marked volatile
+template <typename=void>
+__device__ static inline void barrier_cluster_arrive(
+  cuda::ptx::sem_relaxed_t);
+
+// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
+// .sem       = { .acquire }
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_wait(
+  cuda::ptx::sem_acquire_t);
+```
+
 #### `fence`
 
 - PTX ISA: [`fence`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence)
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -2278,6 +2278,142 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(
 
 // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
+/*
+// barrier.cluster.arrive; // PTX ISA 78, SM_90
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_arrive();
+*/
+#if __cccl_ptx_isa >= 780
+extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive()
+{
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 780
+
+/*
+// barrier.cluster.wait; // PTX ISA 78, SM_90
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_wait();
+*/
+#if __cccl_ptx_isa >= 780
+extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_wait()
+{
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.wait;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 780
+
+/*
+// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
+// .sem       = { .release }
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_arrive(
+  cuda::ptx::sem_release_t);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive(
+  sem_release_t)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive.release;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
+// .sem       = { .relaxed }
+// Marked volatile
+template <typename=void>
+__device__ static inline void barrier_cluster_arrive(
+  cuda::ptx::sem_relaxed_t);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive(
+  sem_relaxed_t)
+{
+  // __sem == sem_relaxed (due to parameter type constraint)
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive.relaxed;"
+      :
+      :
+      :
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
+// .sem       = { .acquire }
+// Marked volatile and as clobbering memory
+template <typename=void>
+__device__ static inline void barrier_cluster_wait(
+  cuda::ptx::sem_acquire_t);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_wait(
+  sem_acquire_t)
+{
+  // __sem == sem_acquire (due to parameter type constraint)
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.wait.acquire;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
 
 // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_barrier_cluster(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
+  ));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));
+  ));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.release;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.relaxed;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait.acquire;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));
+  ));
+#endif // __cccl_ptx_isa >= 800
+}
+
+int main(int, char**)
+{
+    return 0;
+}