+ #ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
+ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
+
/*
- // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80,
- SM_90
+ // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -14,7 +16,7 @@ __device__ static inline void cp_async_bulk(
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
- template <typename = void>
+ template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk(
  space_cluster_t,
  space_global_t,
@@ -25,15 +27,20 @@ _CCCL_DEVICE static inline void cp_async_bulk(
{
  // __space == space_cluster (due to parameter type constraint)
  // __space == space_global (due to parameter type constraint)
-   NV_IF_ELSE_TARGET(
-     NV_PROVIDES_SM_90,
-     (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
-          :
-          : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar))
-          : "memory");),
-     (
-       // Unsupported architectures will have a linker error with a semi-decent error message
-       __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm(
+       "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
+       :
+       : "r"(__as_ptr_smem(__dstMem)),
+         "l"(__as_ptr_gmem(__srcMem)),
+         "r"(__size),
+         "r"(__as_ptr_smem(__smem_bar))
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+   ));
}
#endif // __cccl_ptx_isa >= 800
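
A minimal usage sketch for this overload (1a: global -> shared::cluster), assuming the public cuda::ptx::cp_async_bulk entry point and the space_cluster / space_global tag values from <cuda/ptx>; mbarrier setup and arrival are abbreviated, and the kernel and buffer names are illustrative:

#include <cuda/ptx>
#include <cstdint>

// Stage a 1 KiB tile from global into shared memory; completion is signalled
// on a shared-memory mbarrier. PTX requires 16-byte aligned addresses and a
// size that is a multiple of 16.
__global__ void stage_tile(const char* gmem_src)
{
  __shared__ alignas(16) char smem_dst[1024];
  __shared__ std::uint64_t bar; // mbarrier; must be initialized (and its tx-count set) before use

  if (threadIdx.x == 0)
  {
    // The (space_cluster, space_global) tag pair selects this overload.
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, // .dst = .shared::cluster
      cuda::ptx::space_global,  // .src = .global
      smem_dst, gmem_src, 1024u, &bar);
  }
  // ... threads would then arrive on / wait for `bar` before reading smem_dst ...
}
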
@@ -52,7 +59,7 @@ __device__ static inline void cp_async_bulk(
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
- template <typename = void>
+ template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk(
  space_cluster_t,
  space_shared_t,
@@ -63,18 +70,20 @@ _CCCL_DEVICE static inline void cp_async_bulk(
{
  // __space == space_cluster (due to parameter type constraint)
  // __space == space_shared (due to parameter type constraint)
-   NV_IF_ELSE_TARGET(
-     NV_PROVIDES_SM_90,
-     (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
-          :
-          : "r"(__as_ptr_remote_dsmem(__dstMem)),
-            "r"(__as_ptr_smem(__srcMem)),
-            "r"(__size),
-            "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
-          : "memory");),
-     (
-       // Unsupported architectures will have a linker error with a semi-decent error message
-       __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm(
+       "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
+       :
+       : "r"(__as_ptr_remote_dsmem(__dstMem)),
+         "r"(__as_ptr_smem(__srcMem)),
+         "r"(__size),
+         "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+   ));
}
#endif // __cccl_ptx_isa >= 800
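
A usage sketch for this overload (2: shared::cta -> shared::cluster), assuming the public cuda::ptx::cp_async_bulk entry point together with the cooperative_groups cluster API for mapping a peer block's shared memory; the cluster size, barrier handling, and names are illustrative only:

#include <cuda/ptx>
#include <cooperative_groups.h>
#include <cstdint>

namespace cg = cooperative_groups;

// Push this block's staging buffer into a peer block's shared memory; the
// copy reports completion on an mbarrier living in the peer's shared memory.
__global__ void __cluster_dims__(2, 1, 1) push_to_peer()
{
  __shared__ alignas(16) char local_buf[256];
  __shared__ std::uint64_t bar; // initialized by each block before cluster.sync()

  cg::cluster_group cluster = cg::this_cluster();
  unsigned int peer         = cluster.block_rank() ^ 1; // partner in a 2-block cluster

  char* peer_buf          = cluster.map_shared_rank(local_buf, peer);
  std::uint64_t* peer_bar = cluster.map_shared_rank(&bar, peer);

  cluster.sync(); // peer's buffer and barrier are ready

  if (threadIdx.x == 0)
  {
    // The (space_cluster, space_shared) tag pair selects this overload.
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, // .dst = .shared::cluster (peer CTA)
      cuda::ptx::space_shared,  // .src = .shared::cta (this CTA)
      peer_buf, local_buf, 256u, peer_bar);
  }
}
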
@@ -92,20 +101,30 @@ __device__ static inline void cp_async_bulk(
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
- template <typename = void>
- _CCCL_DEVICE static inline void
- cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size)
+ template <typename = void>
+ _CCCL_DEVICE static inline void cp_async_bulk(
+   space_global_t,
+   space_shared_t,
+   void* __dstMem,
+   const void* __srcMem,
+   const _CUDA_VSTD::uint32_t& __size)
{
  // __space == space_global (due to parameter type constraint)
  // __space == space_shared (due to parameter type constraint)
-   NV_IF_ELSE_TARGET(
-     NV_PROVIDES_SM_90,
-     (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
-          :
-          : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
-          : "memory");),
-     (
-       // Unsupported architectures will have a linker error with a semi-decent error message
-       __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+     asm(
+       "cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
+       :
+       : "l"(__as_ptr_gmem(__dstMem)),
+         "r"(__as_ptr_smem(__srcMem)),
+         "r"(__size)
+       : "memory"
+     );
+   ),(
+     // Unsupported architectures will have a linker error with a semi-decent error message
+     __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+   ));
}
#endif // __cccl_ptx_isa >= 800
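
A usage sketch for this overload (3: shared::cta -> global), assuming the public cuda::ptx::cp_async_bulk entry point and the companion cp_async_bulk_commit_group / cp_async_bulk_wait_group_read wrappers from the same ptx headers; kernel and buffer names are illustrative:

#include <cuda/ptx>

// Write a shared-memory tile back to global memory. Completion is tracked
// with a bulk async-group rather than an mbarrier.
__global__ void write_back(char* gmem_dst)
{
  __shared__ alignas(16) char smem_src[1024];
  // ... the block fills smem_src here ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // The (space_global, space_shared) tag pair selects this overload.
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_global, // .dst = .global
      cuda::ptx::space_shared, // .src = .shared::cta
      gmem_dst, smem_src, 1024u);

    // Commit the copy to a bulk async-group and wait for it to drain.
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}
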
+
+ #endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_