From 3e888d8fd7953d595af016eacd89af610fb624e6 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 09:10:00 +0100 Subject: [PATCH] PTX: Remove internal instructions (#3583) * barrier.cluster.aligned: Remove This is not supposed to be exposed in CCCL. * elect.sync: Remove Not ready for inclusion yet. This needs to handle the optional extra output mask as well. * mapa: Remove This has compiler bugs. We should use intrinsics instead. Co-authored-by: Allard Hendriksen --- .../generated/barrier_cluster_aligned.rst | 63 --------- .../ptx/instructions/generated/elect_sync.rst | 11 -- .../ptx/instructions/generated/mapa.rst | 14 -- .../generated/barrier_cluster_aligned.h | 130 ------------------ .../__ptx/instructions/generated/elect_sync.h | 36 ----- .../cuda/__ptx/instructions/generated/mapa.h | 33 ----- .../ptx/generated/barrier_cluster_aligned.h | 61 -------- .../cuda/ptx/generated/elect_sync.h | 26 ---- .../test/libcudacxx/cuda/ptx/generated/mapa.h | 27 ---- 9 files changed, 401 deletions(-) delete mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst delete mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst delete mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst deleted file mode 100644 index a24093ac7b6..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst +++ /dev/null @@ -1,63 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -barrier.cluster.arrive.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::dot_aligned_t); - -barrier.cluster.wait.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_wait( - cuda::ptx::dot_aligned_t); - -barrier.cluster.arrive.release.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .release } - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::dot_aligned_t); - -barrier.cluster.arrive.relaxed.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .relaxed } - // .aligned = { .aligned } - // Marked volatile - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t, - cuda::ptx::dot_aligned_t); - -barrier.cluster.wait.acquire.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .acquire } - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::dot_aligned_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst deleted file mode 100644 index bc909c54319..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -elect.sync -^^^^^^^^^^ -.. code:: cuda - - // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 - template - __device__ static inline bool elect_sync( - const uint32_t& membermask); diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst deleted file mode 100644 index 4ffc70d85d9..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/mapa.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -mapa.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline Tp* mapa( - cuda::ptx::space_cluster_t, - const Tp* addr, - uint32_t target_cta); diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h deleted file mode 100644 index 80fe3796e69..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h +++ /dev/null @@ -1,130 +0,0 @@ -// This file was automatically generated. Do not edit. - -#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ -#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ - -/* -// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t) -{ -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t) -{ -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.wait.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .release } -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t) -{ -// __sem == sem_release (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// .aligned = { .aligned } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t) -{ -// __sem == sem_relaxed (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t) -{ -// __sem == sem_acquire (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h deleted file mode 100644 index e8691178f14..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h +++ /dev/null @@ -1,36 +0,0 @@ -// This file was automatically generated. Do not edit. - -#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_ -#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_ - -/* -// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 -template -__device__ static inline bool elect_sync( - const uint32_t& membermask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask) -{ -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - _CUDA_VSTD::uint32_t __is_elected; - asm volatile( - "{\n\t .reg .pred P_OUT; \n\t" - "elect.sync _|P_OUT, %1;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__is_elected) - : "r"(__membermask) - :); - return static_cast(__is_elected); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); - return false; -# endif -} -#endif // __cccl_ptx_isa >= 800 - -#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h deleted file mode 100644 index f93c8a62157..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h +++ /dev/null @@ -1,33 +0,0 @@ -// This file was automatically generated. Do not edit. - -#ifndef _CUDA_PTX_GENERATED_MAPA_H_ -#define _CUDA_PTX_GENERATED_MAPA_H_ - -/* -// mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 -// .space = { .shared::cluster } -template -__device__ static inline Tp* mapa( - cuda::ptx::space_cluster_t, - const Tp* addr, - uint32_t target_cta); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta) -{ -// __space == space_cluster (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - _CUDA_VSTD::uint32_t __dest; - asm("mapa.shared::cluster.u32 %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :); - return __from_ptr_dsmem<_Tp>(__dest); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mapa_is_not_supported_before_SM_90__(); - return __from_ptr_dsmem<_Tp>(0); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -#endif // _CUDA_PTX_GENERATED_MAPA_H_ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h deleted file mode 100644 index 6f5a022dbc8..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h +++ /dev/null @@ -1,61 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. - -__global__ void test_barrier_cluster_aligned(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.aligned; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.aligned; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.release.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.relaxed.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.acquire.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h deleted file mode 100644 index 298225881d1..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h +++ /dev/null @@ -1,26 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. - -__global__ void test_elect_sync(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // elect.sync _|is_elected, membermask; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::elect_sync));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h deleted file mode 100644 index 9160be1fe2d..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h +++ /dev/null @@ -1,27 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. - -__global__ void test_mapa(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mapa.shared::cluster.u32 dest, addr, target_cta; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mapa));)); -#endif // __cccl_ptx_isa >= 780 -}