@@ -2278,6 +2278,142 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(
// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
+ /*
+ // barrier.cluster.arrive; // PTX ISA 78, SM_90
+ // Marked volatile and as clobbering memory
+ template <typename=void>
+ __device__ static inline void barrier_cluster_arrive();
+ */
+ #if __cccl_ptx_isa >= 780
+ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+ template <typename=void>
+ _CCCL_DEVICE static inline void barrier_cluster_arrive()
+ {
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm volatile (
+       "barrier.cluster.arrive;"
+       :
+       :
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+   ));
+ }
+ #endif // __cccl_ptx_isa >= 780
+
+ /*
+ // barrier.cluster.wait; // PTX ISA 78, SM_90
+ // Marked volatile and as clobbering memory
+ template <typename=void>
+ __device__ static inline void barrier_cluster_wait();
+ */
+ #if __cccl_ptx_isa >= 780
+ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+ template <typename=void>
+ _CCCL_DEVICE static inline void barrier_cluster_wait()
+ {
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm volatile (
+       "barrier.cluster.wait;"
+       :
+       :
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+   ));
+ }
+ #endif // __cccl_ptx_isa >= 780
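
For reference, a minimal usage sketch of the two wrappers above. It assumes CCCL's <cuda/ptx> header, an SM_90 target, and a compile-time cluster launch via __cluster_dims__; the kernel name and the surrounding code are illustrative assumptions, not part of this change:

#include <cuda/ptx>

// Sketch: each block in a two-block cluster signals arrival, overlaps
// independent work, then waits for the rest of the cluster.
__global__ void __cluster_dims__(2, 1, 1) example_cluster_kernel()
{
  // ... write data intended for peer blocks (e.g. distributed shared memory) ...
  cuda::ptx::barrier_cluster_arrive(); // defaults to release ordering
  // ... independent work that does not touch the peers' data ...
  cuda::ptx::barrier_cluster_wait();   // defaults to acquire ordering; peers' pre-arrive writes are now visible
}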
+
+ /*
+ // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
+ // .sem = { .release }
+ // Marked volatile and as clobbering memory
+ template <typename=void>
+ __device__ static inline void barrier_cluster_arrive(
+   cuda::ptx::sem_release_t);
+ */
+ #if __cccl_ptx_isa >= 800
+ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+ template <typename=void>
+ _CCCL_DEVICE static inline void barrier_cluster_arrive(
+   sem_release_t)
+ {
+   // __sem == sem_release (due to parameter type constraint)
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm volatile (
+       "barrier.cluster.arrive.release;"
+       :
+       :
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+   ));
+ }
+ #endif // __cccl_ptx_isa >= 800
+
+ /*
+ // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
+ // .sem = { .relaxed }
+ // Marked volatile
+ template <typename=void>
+ __device__ static inline void barrier_cluster_arrive(
+   cuda::ptx::sem_relaxed_t);
+ */
+ #if __cccl_ptx_isa >= 800
+ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+ template <typename=void>
+ _CCCL_DEVICE static inline void barrier_cluster_arrive(
+   sem_relaxed_t)
+ {
+   // __sem == sem_relaxed (due to parameter type constraint)
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm volatile (
+       "barrier.cluster.arrive.relaxed;"
+       :
+       :
+       :
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+   ));
+ }
+ #endif // __cccl_ptx_isa >= 800
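
Note that this .relaxed overload is the only variant here whose inline asm omits the "memory" clobber: barrier.cluster.arrive.relaxed imposes no memory ordering, so the wrapper does not act as a compiler fence. A hedged sketch of the difference, under the same <cuda/ptx> and __cluster_dims__ assumptions as before:

#include <cuda/ptx>

__global__ void __cluster_dims__(2, 1, 1) example_relaxed_kernel()
{
  // Relaxed arrive: counts this block as arrived, but surrounding memory
  // operations may be reordered across it (matching the empty clobber list above).
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_relaxed);
  // Visibility of this block's writes to its peers must be established
  // separately if it is needed, e.g. with an explicit fence.
  cuda::ptx::barrier_cluster_wait(); // the default wait still has acquire ordering
}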
+
+ /*
+ // barrier.cluster.wait.sem; // PTX ISA 80, SM_90
+ // .sem = { .acquire }
+ // Marked volatile and as clobbering memory
+ template <typename=void>
+ __device__ static inline void barrier_cluster_wait(
+   cuda::ptx::sem_acquire_t);
+ */
+ #if __cccl_ptx_isa >= 800
+ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+ template <typename=void>
+ _CCCL_DEVICE static inline void barrier_cluster_wait(
+   sem_acquire_t)
+ {
+   // __sem == sem_acquire (due to parameter type constraint)
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm volatile (
+       "barrier.cluster.wait.acquire;"
+       :
+       :
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+   ));
+ }
+ #endif // __cccl_ptx_isa >= 800
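
The explicit .release/.acquire overloads spell out the ordering that the unqualified arrive/wait already default to. A short sketch, assuming the cuda::ptx::sem_release and cuda::ptx::sem_acquire tag constants and the same launch setup as the earlier sketches:

#include <cuda/ptx>

__global__ void __cluster_dims__(2, 1, 1) example_explicit_sem_kernel()
{
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release); // same ordering as the unqualified arrive
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);   // same ordering as the unqualified wait
}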
// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence