Regenerate PTX files and format
Overwrites all generated PTX source, test and documentation files and runs `pre-commit run --all-files`
bernhardmgruber committed Nov 22, 2024
1 parent 83d180f commit 2d1d9db
Showing 58 changed files with 4,646 additions and 4,484 deletions.
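
Every regenerated header in this commit follows the same dispatch pattern: the PTX instruction is emitted inline on supported architectures, and on older targets the code calls an extern "C" device function that is declared but deliberately never defined, so the build fails at link time with the function's name serving as the diagnostic. A minimal sketch of the pattern outside the generated code, with hypothetical names (my_barrier_arrive, __my_op_is_not_supported_before_SM_90__) standing in for the generated ones:

#include <nv/target> // NV_IF_ELSE_TARGET, NV_PROVIDES_SM_90

// Declared but never defined: if the call below survives to link time,
// the unresolved symbol's name doubles as the error message.
extern "C" __device__ void __my_op_is_not_supported_before_SM_90__();

template <typename = void>
__device__ static inline void my_barrier_arrive()
{
  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_90,
    // SM_90 and newer: emit the instruction. volatile plus the "memory"
    // clobber keep the compiler from reordering or eliding the barrier.
    (asm volatile("barrier.cluster.arrive;" ::: "memory");),
    // Older targets: this branch is selected and the call fails to link.
    (__my_op_is_not_supported_before_SM_90__();));
}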
barrier_cluster.h
@@ -1,3 +1,6 @@
+#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_
+#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_
+
 /*
 // barrier.cluster.arrive; // PTX ISA 78, SM_90
 // Marked volatile and as clobbering memory
@@ -6,18 +9,20 @@ __device__ static inline void barrier_cluster_arrive();
 */
 #if __cccl_ptx_isa >= 780
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void barrier_cluster_arrive()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 780

@@ -29,18 +34,20 @@ __device__ static inline void barrier_cluster_wait();
 */
 #if __cccl_ptx_isa >= 780
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void barrier_cluster_wait()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.wait;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 780

@@ -54,19 +61,22 @@ __device__ static inline void barrier_cluster_arrive(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t)
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive(
+  sem_release_t)
 {
   // __sem == sem_release (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive.release;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive.release;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800

@@ -80,19 +90,22 @@ __device__ static inline void barrier_cluster_arrive(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t)
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_arrive(
+  sem_relaxed_t)
 {
   // __sem == sem_relaxed (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive.relaxed;"
-                  :
-                  :
-                  :);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.arrive.relaxed;"
+      :
+      :
+      :
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800

@@ -106,18 +119,23 @@ __device__ static inline void barrier_cluster_wait(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t)
+template <typename=void>
+_CCCL_DEVICE static inline void barrier_cluster_wait(
+  sem_acquire_t)
 {
   // __sem == sem_acquire (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait.acquire;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "barrier.cluster.wait.acquire;"
+      :
+      :
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
+
+#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_
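
These wrappers are exposed through <cuda/ptx>. As an illustrative sketch (not part of this commit), a kernel launched as a cluster on SM_90+ could pair the release/acquire overloads above like this:

#include <cuda/ptx>

__global__ void cluster_rendezvous()
{
  // ... write data that other blocks in the cluster will read ...

  // Release: make this block's prior writes visible cluster-wide.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  // Acquire: block until every block in the cluster has arrived.
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);

  // ... safe to read the other blocks' writes here ...
}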
cp_async_bulk.h
@@ -1,6 +1,8 @@
+#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
+#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
+
 /*
-// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80,
-SM_90
+// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
 // .dst = { .shared::cluster }
 // .src = { .global }
 template <typename=void>
@@ -14,7 +16,7 @@ __device__ static inline void cp_async_bulk(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void cp_async_bulk(
   space_cluster_t,
   space_global_t,
@@ -25,15 +27,20 @@ _CCCL_DEVICE static inline void cp_async_bulk(
 {
   // __space == space_cluster (due to parameter type constraint)
   // __space == space_global (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
-         :
-         : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar))
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
+      :
+      : "r"(__as_ptr_smem(__dstMem)),
+        "l"(__as_ptr_gmem(__srcMem)),
+        "r"(__size),
+        "r"(__as_ptr_smem(__smem_bar))
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800

@@ -52,7 +59,7 @@ __device__ static inline void cp_async_bulk(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void cp_async_bulk(
   space_cluster_t,
   space_shared_t,
@@ -63,18 +70,20 @@ _CCCL_DEVICE static inline void cp_async_bulk(
 {
   // __space == space_cluster (due to parameter type constraint)
   // __space == space_shared (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
-         :
-         : "r"(__as_ptr_remote_dsmem(__dstMem)),
-           "r"(__as_ptr_smem(__srcMem)),
-           "r"(__size),
-           "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
+      :
+      : "r"(__as_ptr_remote_dsmem(__dstMem)),
+        "r"(__as_ptr_smem(__srcMem)),
+        "r"(__size),
+        "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800

@@ -92,20 +101,30 @@ __device__ static inline void cp_async_bulk(
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void
-cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size)
+template <typename=void>
+_CCCL_DEVICE static inline void cp_async_bulk(
+  space_global_t,
+  space_shared_t,
+  void* __dstMem,
+  const void* __srcMem,
+  const _CUDA_VSTD::uint32_t& __size)
 {
   // __space == space_global (due to parameter type constraint)
   // __space == space_shared (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
-         :
-         : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
+      :
+      : "l"(__as_ptr_gmem(__dstMem)),
+        "r"(__as_ptr_smem(__srcMem)),
+        "r"(__size)
+      : "memory"
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
+
+#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_
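
For reference, a sketch of calling the first overload above (global to cluster-shared memory, with completion signaled on a shared-memory mbarrier). The wrapper name start_bulk_load is hypothetical, and the mbarrier initialization and transaction-count bookkeeping are assumed to happen elsewhere:

#include <cuda/ptx>
#include <cstdint>

// Illustrative wrapper: start an asynchronous bulk copy of `bytes` bytes
// from global into shared memory. `bar` must point to a shared-memory
// mbarrier that already expects `bytes` transaction bytes.
__device__ void start_bulk_load(void* smem_dst, const void* gmem_src,
                                std::uint32_t bytes, std::uint64_t* bar)
{
  cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global,
                           smem_dst, gmem_src, bytes, bar);
}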
cp_async_bulk_commit_group.h
@@ -1,21 +1,28 @@
+#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_
+#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_
+
 /*
 // cp.async.bulk.commit_group; // PTX ISA 80, SM_90
 template <typename=void>
 __device__ static inline void cp_async_bulk_commit_group();
 */
 #if __cccl_ptx_isa >= 800
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();
-template <typename = void>
+template <typename=void>
 _CCCL_DEVICE static inline void cp_async_bulk_commit_group()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("cp.async.bulk.commit_group;"
-                  :
-                  :
-                  :);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();));
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm volatile (
+      "cp.async.bulk.commit_group;"
+      :
+      :
+      :
+    );
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();
+  ));
 }
 #endif // __cccl_ptx_isa >= 800
+
+#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_
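
cp_async_bulk_commit_group is normally paired with a wait. A sketch of the commit/wait sequence, assuming the cp_async_bulk_wait_group_read wrapper from the same generated family; the function name drain_bulk_copies is hypothetical:

#include <cuda/ptx>

__device__ void drain_bulk_copies()
{
  // Bundle this thread's previously issued cp.async.bulk operations
  // into one bulk async-group...
  cuda::ptx::cp_async_bulk_commit_group();
  // ...then wait until at most 0 groups are still pending reads, i.e.
  // until the sources of all committed copies may safely be reused.
  cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
}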
(55 more changed files not shown.)
