
Commit 2d1d9db

Regenerate PTX files and format
Overwrites all generated PTX source, test, and documentation files and runs `pre-commit run --all-files`.
1 parent 83d180f · commit 2d1d9db
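
Every wrapper in the regenerated headers below follows the same dispatch pattern: an inline-PTX body guarded by NV_IF_ELSE_TARGET from <nv/target>, plus a deliberately undefined extern "C" fallback so that a call on an unsupported architecture fails at link time with a readable symbol name. A minimal sketch of the pattern, assuming only <nv/target> (the function names here are illustrative, not part of the commit):

#include <nv/target> // NV_IF_ELSE_TARGET, NV_PROVIDES_SM_90

// Left undefined on purpose: referencing it on an unsupported target
// produces a linker error whose symbol name explains the problem.
extern "C" __device__ void __example_is_not_supported_before_SM_90__();

template <typename = void>
__device__ static inline void example_barrier_cluster_arrive()
{
  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_90,
    // SM_90 and newer: emit the PTX instruction directly.
    (asm volatile("barrier.cluster.arrive;" ::: "memory");),
    // Older architectures: reference the undefined symbol above.
    (__example_is_not_supported_before_SM_90__();));
}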


58 files changed, +4646 -4484 lines

barrier_cluster.h
@@ -1,3 +1,6 @@
+#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_
+#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_
+
 /*
 // barrier.cluster.arrive; // PTX ISA 78, SM_90
 // Marked volatile and as clobbering memory
@@ -6,18 +9,20 @@ __device__ static inline void barrier_cluster_arrive();
 */
 #if __cccl_ptx_isa >= 780
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void barrier_cluster_arrive()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 780
 
@@ -29,18 +34,20 @@ __device__ static inline void barrier_cluster_wait();
 */
 #if __cccl_ptx_isa >= 780
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void barrier_cluster_wait()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.wait;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 780
 
@@ -54,19 +61,22 @@ __device__ static inline void barrier_cluster_arrive(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t)
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive(
+  sem_release_t)
 {
   // __sem == sem_release (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive.release;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive.release;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
 
@@ -80,19 +90,22 @@ __device__ static inline void barrier_cluster_arrive(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t)
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive(
+  sem_relaxed_t)
 {
   // __sem == sem_relaxed (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive.relaxed;"
-                  :
-                  :
-                  :);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive.relaxed;"
+      :
+      :
+      :
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
 
@@ -106,18 +119,23 @@ __device__ static inline void barrier_cluster_wait(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t)
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_wait(
+  sem_acquire_t)
 {
   // __sem == sem_acquire (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait.acquire;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.wait.acquire;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
+
+#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_
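
The wrappers above surface through <cuda/ptx> as cuda::ptx::barrier_cluster_arrive and cuda::ptx::barrier_cluster_wait. A minimal usage sketch (the kernel is illustrative and assumes an SM_90 cluster launch):

#include <cuda/ptx>

// Illustrative kernel: each block publishes its writes, then waits for the
// whole cluster before touching data produced by sibling blocks.
__global__ void cluster_sync_kernel()
{
  // ... write data that other blocks in the cluster will read ...
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
  // ... reads of sibling blocks' data are now ordered after their writes ...
}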
cp_async_bulk.h
@@ -1,6 +1,8 @@
+#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
+#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
+
 /*
-// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80,
-SM_90
+// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
 // .dst = { .shared::cluster }
 // .src = { .global }
 template <typename=void>
@@ -14,7 +16,7 @@ __device__ static inline void cp_async_bulk(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void cp_async_bulk(
   space_cluster_t,
   space_global_t,
@@ -25,15 +27,20 @@ _CCCL_DEVICE static inline void cp_async_bulk(
 {
   // __space == space_cluster (due to parameter type constraint)
   // __space == space_global (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
-         :
-         : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar))
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
+      :
+      : "r"(__as_ptr_smem(__dstMem)),
+        "l"(__as_ptr_gmem(__srcMem)),
+        "r"(__size),
+        "r"(__as_ptr_smem(__smem_bar))
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
 
@@ -52,7 +59,7 @@ __device__ static inline void cp_async_bulk(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void cp_async_bulk(
   space_cluster_t,
   space_shared_t,
@@ -63,18 +70,20 @@ _CCCL_DEVICE static inline void cp_async_bulk(
 {
   // __space == space_cluster (due to parameter type constraint)
   // __space == space_shared (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
-         :
-         : "r"(__as_ptr_remote_dsmem(__dstMem)),
-           "r"(__as_ptr_smem(__srcMem)),
-           "r"(__size),
-           "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
+      :
+      : "r"(__as_ptr_remote_dsmem(__dstMem)),
+        "r"(__as_ptr_smem(__srcMem)),
+        "r"(__size),
+        "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
 
@@ -92,20 +101,30 @@ __device__ static inline void cp_async_bulk(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void
-cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size)
+template <typename=void>
+_CCCL_DEVICE static inline void cp_async_bulk(
+  space_global_t,
+  space_shared_t,
+  void* __dstMem,
+  const void* __srcMem,
+  const _CUDA_VSTD::uint32_t& __size)
 {
   // __space == space_global (due to parameter type constraint)
   // __space == space_shared (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
-         :
-         : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
+      :
+      : "l"(__as_ptr_gmem(__dstMem)),
+        "r"(__as_ptr_smem(__srcMem)),
+        "r"(__size)
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
+
+#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
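
These overloads back cuda::ptx::cp_async_bulk. A sketch of the "1a. unicast" form (global to shared::cluster, completion signaled on an mbarrier); the buffer size, the companion cuda::ptx::mbarrier_init call, and the elided arrive/poll steps are illustrative assumptions:

#include <cuda/ptx>
#include <cstdint>

__global__ void bulk_load_kernel(const int* gmem_src)
{
  __shared__ alignas(128) int smem_buf[256];
  __shared__ std::uint64_t bar; // mbarrier tracking the transfer

  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&bar, 1);
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, cuda::ptx::space_global,
      smem_buf, gmem_src, static_cast<std::uint32_t>(sizeof(smem_buf)), &bar);
  }
  // ... expect-tx / arrive on the mbarrier, then poll it before reading smem_buf ...
}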
cp_async_bulk_commit_group.h
@@ -1,21 +1,28 @@
+#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_
+#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_
+
 /*
 // cp.async.bulk.commit_group; // PTX ISA 80, SM_90
 template <typename=void>
 __device__ static inline void cp_async_bulk_commit_group();
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void cp_async_bulk_commit_group()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("cp.async.bulk.commit_group;"
-                  :
-                  :
-                  :);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "cp.async.bulk.commit_group;"
+      :
+      :
+      :
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
+
+#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_
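
cp_async_bulk_commit_group batches the bulk copies issued so far into one async group. A sketch pairing it with the shared::cta-to-global overload from the previous file (assumes SM_90; cuda::ptx::cp_async_bulk_wait_group_read and n32_t are taken to be the companion wait APIs in <cuda/ptx> at this revision):

#include <cuda/ptx>
#include <cstdint>

__global__ void writeback_kernel(int* gmem_dst)
{
  __shared__ alignas(128) int smem_buf[256];

  // ... fill smem_buf cooperatively ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // 3. shared::cta -> global, tracked by the bulk async-group.
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared,
      gmem_dst, smem_buf, static_cast<std::uint32_t>(sizeof(smem_buf)));
    cuda::ptx::cp_async_bulk_commit_group();
    // Block until all committed groups have finished reading shared memory.
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}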
