diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst
index 971f0213cb0..bf0070dfc08 100644
--- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst
+++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst
@@ -112,7 +112,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. code:: cuda

-   // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_100a, SM_101a
+   // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 86, SM_100a, SM_101a
    // .dst = { .global }
    // .src = { .shared::cta }
    template <typename = void>
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
index fea199e4747..7b6ee3325f6 100644
--- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
+++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
@@ -118,14 +118,14 @@ mbarrier.arrive.relaxed.cta.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& count);

@@ -134,14 +134,14 @@ mbarrier.arrive.relaxed.cluster.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& count);

@@ -150,14 +150,14 @@ mbarrier.arrive.relaxed.cta.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr);

 mbarrier.arrive.relaxed.cluster.shared::cta.b64
@@ -165,14 +165,14 @@ mbarrier.arrive.relaxed.cluster.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr);

 mbarrier.arrive.relaxed.cluster.shared::cluster.b64
@@ -180,14 +180,14 @@ mbarrier.arrive.relaxed.cluster.shared::cluster.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90
-   // .space = { .shared::cluster }
    // .sem = { .relaxed }
    // .scope = { .cluster }
+   // .space = { .shared::cluster }
    template <typename = void>
    __device__ static inline void mbarrier_arrive(
-     cuda::ptx::space_cluster_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_cluster_t,
+     cuda::ptx::space_cluster_t,
      uint64_t* addr,
      const uint32_t& count);

@@ -196,12 +196,12 @@ mbarrier.arrive.relaxed.cluster.shared::cluster.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90
-   // .space = { .shared::cluster }
    // .sem = { .relaxed }
    // .scope = { .cluster }
+   // .space = { .shared::cluster }
    template <typename = void>
    __device__ static inline void mbarrier_arrive(
-     cuda::ptx::space_cluster_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_cluster_t,
+     cuda::ptx::space_cluster_t,
      uint64_t* addr);
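
Note (not part of the patch): after this reordering, call sites pass the semantics and scope tags before the state-space tag, matching the .sem.scope.space modifier order of the PTX instruction itself. A minimal call-site sketch for the first overload above, assuming an mbarrier initialized elsewhere and a build targeting sm_90 with PTX ISA 8.6:

    #include <cuda/ptx>
    #include <cstdint>

    __global__ void arrive_relaxed_example()
    {
      __shared__ alignas(8) uint64_t bar; // assumed initialized via mbarrier.init elsewhere
      // New tag order: .sem, .scope, .space.
      uint64_t state = cuda::ptx::mbarrier_arrive(
        cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
      (void) state;
    }
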
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst
index 318a7eb5b98..fb171b05b55 100644
--- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst
+++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst
@@ -54,14 +54,14 @@ mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive_expect_tx(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& txCount);

@@ -70,14 +70,14 @@ mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive_expect_tx(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& txCount);

@@ -86,13 +86,13 @@ mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64
 .. code:: cuda

    // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90
-   // .space = { .shared::cluster }
    // .sem = { .relaxed }
    // .scope = { .cluster }
+   // .space = { .shared::cluster }
    template <typename = void>
    __device__ static inline void mbarrier_arrive_expect_tx(
-     cuda::ptx::space_cluster_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_cluster_t,
+     cuda::ptx::space_cluster_t,
      uint64_t* addr,
      const uint32_t& txCount);
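
Note (not part of the patch): the transaction-count variant is reordered the same way. A sketch under the same assumptions (sm_90, PTX ISA 8.6), where txCount is the expected transaction count in bytes for subsequent bulk async copies:

    #include <cuda/ptx>
    #include <cstdint>

    __global__ void arrive_expect_tx_example()
    {
      __shared__ alignas(8) uint64_t bar; // assumed initialized elsewhere
      uint64_t state = cuda::ptx::mbarrier_arrive_expect_tx(
        cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 4096);
      (void) state;
    }
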
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h
index f376f1b48c3..2f553b98f4c 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h
@@ -243,7 +243,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
 #endif // __cccl_ptx_isa >= 860

 /*
-// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80,
+// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 86,
 SM_100a, SM_101a
 // .dst = { .global }
 // .src = { .shared::cta }
@@ -255,7 +255,7 @@ __device__ static inline void cp_async_bulk_tensor_tile_scatter4(
   const int32_t (&tensorCoords)[5],
   const void* srcMem);
 */
-#if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__();
 template <typename = void>
 _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4(
@@ -283,6 +283,6 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4(
     __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__();
 #  endif
 }
-#endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 860

 #endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_
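
Note (not part of the patch): the scatter4 wrapper itself is unchanged; only its guard moves from PTX ISA 8.0 to 8.6, matching the comment fix in the docs above. A hypothetical call-site sketch, assuming a kernel built for sm_100a with PTX ISA >= 8.6 and a tensor map created on the host:

    #include <cuda/ptx>
    #include <cstdint>

    __device__ void scatter4_example(const void* tensor_map, const int32_t (&coords)[5], const void* smem_src)
    {
      // Tag order is unaffected by this patch: destination space, then source space.
      cuda::ptx::cp_async_bulk_tensor_tile_scatter4(
        cuda::ptx::space_global, cuda::ptx::space_shared, tensor_map, coords, smem_src);
    }
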
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h
index 5f7b23dbb68..5f1a233908f 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h
@@ -223,14 +223,14 @@ _CCCL_DEVICE static inline void mbarrier_arrive(

 /*
 // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
 // .sem = { .relaxed }
 // .scope = { .cta, .cluster }
+// .space = { .shared::cta }
 template <cuda::ptx::dot_scope Scope>
 __device__ static inline uint64_t mbarrier_arrive(
-  cuda::ptx::space_shared_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
   uint64_t* addr,
   const uint32_t& count);
 */
@@ -238,15 +238,15 @@ __device__ static inline uint64_t mbarrier_arrive(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
 _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
-  space_shared_t,
   sem_relaxed_t,
   scope_t<_Scope> __scope,
+  space_shared_t,
   _CUDA_VSTD::uint64_t* __addr,
   const _CUDA_VSTD::uint32_t& __count)
 {
-  // __space == space_shared (due to parameter type constraint)
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   _CUDA_VSTD::uint64_t __state;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -274,25 +274,25 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(

 /*
 // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
 // .sem = { .relaxed }
 // .scope = { .cta, .cluster }
+// .space = { .shared::cta }
 template <cuda::ptx::dot_scope Scope>
 __device__ static inline uint64_t mbarrier_arrive(
-  cuda::ptx::space_shared_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
   uint64_t* addr);
 */
 #if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
 _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t
-mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr)
+mbarrier_arrive(sem_relaxed_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr)
 {
-  // __space == space_shared (due to parameter type constraint)
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   _CUDA_VSTD::uint64_t __state;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -320,14 +320,14 @@ mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VS

 /*
 // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
 // .sem = { .relaxed }
 // .scope = { .cluster }
+// .space = { .shared::cluster }
 template <typename = void>
 __device__ static inline void mbarrier_arrive(
-  cuda::ptx::space_cluster_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
   uint64_t* addr,
   const uint32_t& count);
 */
@@ -335,11 +335,11 @@ __device__ static inline void mbarrier_arrive(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <typename = void>
 _CCCL_DEVICE static inline void mbarrier_arrive(
-  space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
+  sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
 {
-// __space == space_cluster (due to parameter type constraint)
 // __sem == sem_relaxed (due to parameter type constraint)
 // __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0], %1;"
       :
@@ -354,25 +354,25 @@ _CCCL_DEVICE static inline void mbarrier_arrive(

 /*
 // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
 // .sem = { .relaxed }
 // .scope = { .cluster }
+// .space = { .shared::cluster }
 template <typename = void>
 __device__ static inline void mbarrier_arrive(
-  cuda::ptx::space_cluster_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
   uint64_t* addr);
 */
 #if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <typename = void>
 _CCCL_DEVICE static inline void
-mbarrier_arrive(space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr)
+mbarrier_arrive(sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr)
 {
-// __space == space_cluster (due to parameter type constraint)
 // __sem == sem_relaxed (due to parameter type constraint)
 // __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory");
 #  else
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h
index 5cbcd4cb3aa..05bd963c185 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h
@@ -90,14 +90,14 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx(

 /*
 // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
 // .sem = { .relaxed }
 // .scope = { .cta, .cluster }
+// .space = { .shared::cta }
 template <cuda::ptx::dot_scope Scope>
 __device__ static inline uint64_t mbarrier_arrive_expect_tx(
-  cuda::ptx::space_shared_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
   uint64_t* addr,
   const uint32_t& txCount);
 */
@@ -105,15 +105,15 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
 _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
-  space_shared_t,
   sem_relaxed_t,
   scope_t<_Scope> __scope,
+  space_shared_t,
   _CUDA_VSTD::uint64_t* __addr,
   const _CUDA_VSTD::uint32_t& __txCount)
 {
-  // __space == space_shared (due to parameter type constraint)
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   _CUDA_VSTD::uint64_t __state;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -141,14 +141,14 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(

 /*
 // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
 // .sem = { .relaxed }
 // .scope = { .cluster }
+// .space = { .shared::cluster }
 template <typename = void>
 __device__ static inline void mbarrier_arrive_expect_tx(
-  cuda::ptx::space_cluster_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
   uint64_t* addr,
   const uint32_t& txCount);
 */
@@ -156,11 +156,11 @@ __device__ static inline void mbarrier_arrive_expect_tx(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
 template <typename = void>
 _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx(
-  space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount)
+  sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount)
 {
-// __space == space_cluster (due to parameter type constraint)
 // __sem == sem_relaxed (due to parameter type constraint)
 // __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [%0], %1;"
       :
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h
index 930cfa09125..4a9e6da279b 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h
@@ -159,7 +159,7 @@ __global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr)
           const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor_tile_gather4));));
 #endif // __cccl_ptx_isa >= 860

-#if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 860
   NV_IF_TARGET(
     NV_HAS_FEATURE_SM_100a,
     (
@@ -176,5 +176,5 @@ __global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr)
         static_cast<void (*)(
           cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t (&)[5], const void*)>(
           cuda::ptx::cp_async_bulk_tensor_tile_scatter4));));
-#endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 860
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h
index d32773c118d..1c9484f7104 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h
@@ -95,13 +95,13 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)

       // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr], count;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive));

       // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr], count;
       * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+        static_cast<uint64_t (*)(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860
@@ -113,12 +113,12 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)
       // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr];
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, uint64_t*)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*)>(
           cuda::ptx::mbarrier_arrive));

       // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr];
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_shared_t, uint64_t*)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860
@@ -129,7 +129,7 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)
       // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr], count;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860

@@ -140,7 +140,7 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)
       // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr];
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h
index 8ef925662ac..99012490516 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h
@@ -52,13 +52,13 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr)

       // mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 state, [addr], txCount;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive_expect_tx));

       // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 state, [addr], txCount;
       * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+        static_cast<uint64_t (*)(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive_expect_tx));));
 #endif // __cccl_ptx_isa >= 860
@@ -70,7 +70,7 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr)
       // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [addr], txCount;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive_expect_tx));));
 #endif // __cccl_ptx_isa >= 860
 }
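
Note (not part of the patch): the test updates above pin one overload out of the cuda::ptx::mbarrier_arrive overload set with a function-pointer cast, so the build fails if the parameter order ever drifts from what is documented. A reduced sketch of that technique:

    #include <cuda/ptx>
    #include <cstdint>

    __global__ void check_arrive_overload(void** fn_ptr)
    {
      // The static_cast selects the (.relaxed, .cta, .shared::cta) overload and
      // only compiles if an overload with exactly this signature exists.
      *fn_ptr++ = reinterpret_cast<void*>(
        static_cast<uint64_t (*)(
          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
          cuda::ptx::mbarrier_arrive));
    }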