Skip to content

Commit 2f09e3d

Browse files
authored
PTX: Add cuda::ptx::barrier_cluster_{arrive,wait} (#1366)
* Add `cuda::ptx::barrier.cluster.{arrive,wait}` * Add note about .aligned variants
1 parent cbf7da9 commit 2f09e3d

File tree

3 files changed

+258
-2
lines changed

3 files changed

+258
-2
lines changed

libcudacxx/docs/ptx.md

+48-2
Original file line numberDiff line numberDiff line change
@@ -876,7 +876,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(
876876
|-----------------------|-------------------------|
877877
| [`bar, barrier`] | No |
878878
| [`bar.warp.sync`] | No |
879-
| [`barrier.cluster`] | No |
879+
| [`barrier.cluster`] | CTK-FUTURE, CCCL v2.4.0 |
880880
| [`membar`] | No |
881881
| [`fence`] | CTK-FUTURE, CCCL v2.4.0 |
882882
| [`atom`] | No |
@@ -892,7 +892,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(
892892

893893
[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
894894
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
895-
[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
895+
[`barrier.cluster`]: #barriercluster
896896
[`membar`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
897897
[`fence`]: #fence
898898
[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
@@ -906,6 +906,52 @@ __device__ static inline void cp_async_bulk_wait_group_read(
906906
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
907907
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
908908

909+
910+
#### `barrier.cluster`
911+
912+
- PTX ISA: [`barrier.cluster`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
913+
914+
Similar functionality is provided through the builtins
915+
`__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(),
916+
__cluster_barrier_wait()`, as well as the `cooperative_groups::cluster_group`
917+
[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cluster-group).
918+
919+
The `.aligned` variants of the instructions are not exposed.
920+
921+
**barrier_cluster**:
922+
```cuda
923+
// barrier.cluster.arrive; // PTX ISA 78, SM_90
924+
// Marked volatile and as clobbering memory
925+
template <typename=void>
926+
__device__ static inline void barrier_cluster_arrive();
927+
928+
// barrier.cluster.wait; // PTX ISA 78, SM_90
929+
// Marked volatile and as clobbering memory
930+
template <typename=void>
931+
__device__ static inline void barrier_cluster_wait();
932+
933+
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
934+
// .sem = { .release }
935+
// Marked volatile and as clobbering memory
936+
template <typename=void>
937+
__device__ static inline void barrier_cluster_arrive(
938+
cuda::ptx::sem_release_t);
939+
940+
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
941+
// .sem = { .relaxed }
942+
// Marked volatile
943+
template <typename=void>
944+
__device__ static inline void barrier_cluster_arrive(
945+
cuda::ptx::sem_relaxed_t);
946+
947+
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
948+
// .sem = { .acquire }
949+
// Marked volatile and as clobbering memory
950+
template <typename=void>
951+
__device__ static inline void barrier_cluster_wait(
952+
cuda::ptx::sem_acquire_t);
953+
```
954+
909955
#### `fence`
910956

911957
- PTX ISA: [`fence`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence)

libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h

+136
Original file line numberDiff line numberDiff line change
@@ -2278,6 +2278,142 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(
22782278

22792279
// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
22802280
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
2281+
/*
2282+
// barrier.cluster.arrive; // PTX ISA 78, SM_90
2283+
// Marked volatile and as clobbering memory
2284+
template <typename=void>
2285+
__device__ static inline void barrier_cluster_arrive();
2286+
*/
2287+
#if __cccl_ptx_isa >= 780  // matches the "PTX ISA 78" requirement stated for barrier.cluster.arrive
2288+
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();  // only referenced from the fallback branch below
2289+
template <typename=void>
2290+
_CCCL_DEVICE static inline void barrier_cluster_arrive()
2291+
{
2292+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(  // first branch: target provides SM_90; second branch: every other target
2293+
asm volatile (  // volatile: the statement must not be optimized away
2294+
"barrier.cluster.arrive;"
2295+
:  // no outputs
2296+
:  // no inputs
2297+
: "memory"  // clobber: the compiler may not cache or reorder memory accesses across this instruction
2298+
);
2299+
),(
2300+
// Unsupported architectures will have a linker error with a semi-decent error message
2301+
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
2302+
));
2303+
}
2304+
#endif // __cccl_ptx_isa >= 780
2305+
2306+
/*
2307+
// barrier.cluster.wait; // PTX ISA 78, SM_90
2308+
// Marked volatile and as clobbering memory
2309+
template <typename=void>
2310+
__device__ static inline void barrier_cluster_wait();
2311+
*/
2312+
#if __cccl_ptx_isa >= 780  // matches the "PTX ISA 78" requirement stated for barrier.cluster.wait
2313+
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();  // only referenced from the fallback branch below
2314+
template <typename=void>
2315+
_CCCL_DEVICE static inline void barrier_cluster_wait()
2316+
{
2317+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(  // first branch: target provides SM_90; second branch: every other target
2318+
asm volatile (  // volatile: the statement must not be optimized away
2319+
"barrier.cluster.wait;"
2320+
:  // no outputs
2321+
:  // no inputs
2322+
: "memory"  // clobber: the compiler may not cache or reorder memory accesses across this instruction
2323+
);
2324+
),(
2325+
// Unsupported architectures will have a linker error with a semi-decent error message
2326+
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
2327+
));
2328+
}
2329+
#endif // __cccl_ptx_isa >= 780
2330+
2331+
/*
2332+
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
2333+
// .sem = { .release }
2334+
// Marked volatile and as clobbering memory
2335+
template <typename=void>
2336+
__device__ static inline void barrier_cluster_arrive(
2337+
cuda::ptx::sem_release_t);
2338+
*/
2339+
#if __cccl_ptx_isa >= 800  // matches the "PTX ISA 80" requirement stated for the .sem-qualified form
2340+
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();  // only referenced from the fallback branch below
2341+
template <typename=void>
2342+
_CCCL_DEVICE static inline void barrier_cluster_arrive(
2343+
sem_release_t)  // unnamed tag parameter: used only to select this overload
2344+
{
2345+
// __sem == sem_release (due to parameter type constraint)
2346+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(  // first branch: target provides SM_90; second branch: every other target
2347+
asm volatile (  // volatile: the statement must not be optimized away
2348+
"barrier.cluster.arrive.release;"
2349+
:  // no outputs
2350+
:  // no inputs
2351+
: "memory"  // clobber: the compiler may not cache or reorder memory accesses across this instruction
2352+
);
2353+
),(
2354+
// Unsupported architectures will have a linker error with a semi-decent error message
2355+
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
2356+
));
2357+
}
2358+
#endif // __cccl_ptx_isa >= 800
2359+
2360+
/*
2361+
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
2362+
// .sem = { .relaxed }
2363+
// Marked volatile
2364+
template <typename=void>
2365+
__device__ static inline void barrier_cluster_arrive(
2366+
cuda::ptx::sem_relaxed_t);
2367+
*/
2368+
#if __cccl_ptx_isa >= 800  // matches the "PTX ISA 80" requirement stated for the .sem-qualified form
2369+
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();  // only referenced from the fallback branch below
2370+
template <typename=void>
2371+
_CCCL_DEVICE static inline void barrier_cluster_arrive(
2372+
sem_relaxed_t)  // unnamed tag parameter: used only to select this overload
2373+
{
2374+
// __sem == sem_relaxed (due to parameter type constraint)
2375+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(  // first branch: target provides SM_90; second branch: every other target
2376+
asm volatile (  // volatile: the statement must not be optimized away
2377+
"barrier.cluster.arrive.relaxed;"
2378+
:  // no outputs
2379+
:  // no inputs
2380+
:  // no "memory" clobber here: the relaxed form is documented as "Marked volatile" only
2381+
);
2382+
),(
2383+
// Unsupported architectures will have a linker error with a semi-decent error message
2384+
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
2385+
));
2386+
}
2387+
#endif // __cccl_ptx_isa >= 800
2388+
2389+
/*
2390+
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
2391+
// .sem = { .acquire }
2392+
// Marked volatile and as clobbering memory
2393+
template <typename=void>
2394+
__device__ static inline void barrier_cluster_wait(
2395+
cuda::ptx::sem_acquire_t);
2396+
*/
2397+
#if __cccl_ptx_isa >= 800  // matches the "PTX ISA 80" requirement stated for the .sem-qualified form
2398+
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();  // only referenced from the fallback branch below
2399+
template <typename=void>
2400+
_CCCL_DEVICE static inline void barrier_cluster_wait(
2401+
sem_acquire_t)  // unnamed tag parameter: used only to select this overload
2402+
{
2403+
// __sem == sem_acquire (due to parameter type constraint)
2404+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(  // first branch: target provides SM_90; second branch: every other target
2405+
asm volatile (  // volatile: the statement must not be optimized away
2406+
"barrier.cluster.wait.acquire;"
2407+
:  // no outputs
2408+
:  // no inputs
2409+
: "memory"  // clobber: the compiler may not cache or reorder memory accesses across this instruction
2410+
);
2411+
),(
2412+
// Unsupported architectures will have a linker error with a semi-decent error message
2413+
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
2414+
));
2415+
}
2416+
#endif // __cccl_ptx_isa >= 800
22812417

22822418
// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence
22832419
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
// UNSUPPORTED: libcpp-has-no-threads
11+
12+
// <cuda/ptx>
13+
14+
#include <cuda/ptx>
15+
#include <cuda/std/utility>
16+
17+
/*
18+
* We use a special strategy to force the generation of the PTX. This is mainly
19+
* a fight against dead-code-elimination in the NVVM layer.
20+
*
21+
* We need this strategy because certain older versions of ptxas
22+
* segfault when a nonsensical sequence of PTX is generated. So instead, we try
23+
* to force the instantiation and compilation to PTX of all the overloads of the
24+
* PTX wrapping functions.
25+
*
26+
* We do this by writing a function pointer of each overload to the kernel
27+
* parameter `fn_ptr`.
28+
*
29+
* Because `fn_ptr` is possibly visible outside this translation unit, the
30+
* compiler must compile all the functions which are stored.
31+
*
32+
*/
33+
34+
__global__ void test_barrier_cluster(void ** fn_ptr) {  // stores the address of each overload through fn_ptr; main() in this file never launches it
35+
#if __cccl_ptx_isa >= 780  // same guard as the unqualified barrier.cluster.arrive wrapper
36+
NV_IF_TARGET(NV_PROVIDES_SM_90, (
37+
// barrier.cluster.arrive;
38+
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));  // static_cast selects the zero-argument overload
39+
));
40+
#endif // __cccl_ptx_isa >= 780
41+
42+
#if __cccl_ptx_isa >= 780  // same guard as the unqualified barrier.cluster.wait wrapper
43+
NV_IF_TARGET(NV_PROVIDES_SM_90, (
44+
// barrier.cluster.wait;
45+
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));  // static_cast selects the zero-argument overload
46+
));
47+
#endif // __cccl_ptx_isa >= 780
48+
49+
#if __cccl_ptx_isa >= 800  // the .sem-qualified forms are guarded by a higher PTX ISA version
50+
NV_IF_TARGET(NV_PROVIDES_SM_90, (
51+
// barrier.cluster.arrive.release;
52+
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));  // static_cast selects the sem_release_t overload
53+
));
54+
#endif // __cccl_ptx_isa >= 800
55+
56+
#if __cccl_ptx_isa >= 800
57+
NV_IF_TARGET(NV_PROVIDES_SM_90, (
58+
// barrier.cluster.arrive.relaxed;
59+
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));  // static_cast selects the sem_relaxed_t overload
60+
));
61+
#endif // __cccl_ptx_isa >= 800
62+
63+
#if __cccl_ptx_isa >= 800
64+
NV_IF_TARGET(NV_PROVIDES_SM_90, (
65+
// barrier.cluster.wait.acquire;
66+
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));  // static_cast selects the sem_acquire_t overload
67+
));
68+
#endif // __cccl_ptx_isa >= 800
69+
}
70+
71+
int main(int, char**)  // both arguments are unnamed and unused
72+
{
73+
return 0;  // no kernel is launched here; the test's value is in compiling test_barrier_cluster
74+
}

0 commit comments

Comments
 (0)