From 3e888d8fd7953d595af016eacd89af610fb624e6 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 09:10:00 +0100
Subject: [PATCH] PTX: Remove internal instructions (#3583)

* barrier.cluster.aligned: Remove
This is not supposed to be exposed in CCCL.

* elect.sync: Remove
Not ready for inclusion yet. This needs to handle the optional extra
output mask as well.

* mapa: Remove
This has compiler bugs. We should use intrinsics instead.

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 .../generated/barrier_cluster_aligned.rst     |  63 ---------
 .../ptx/instructions/generated/elect_sync.rst |  11 --
 .../ptx/instructions/generated/mapa.rst       |  14 --
 .../generated/barrier_cluster_aligned.h       | 130 ------------------
 .../__ptx/instructions/generated/elect_sync.h |  36 -----
 .../cuda/__ptx/instructions/generated/mapa.h  |  33 -----
 .../ptx/generated/barrier_cluster_aligned.h   |  61 --------
 .../cuda/ptx/generated/elect_sync.h           |  26 ----
 .../test/libcudacxx/cuda/ptx/generated/mapa.h |  27 ----
 9 files changed, 401 deletions(-)
 delete mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst
 delete mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst
 delete mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst
 delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h
 delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h
 delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h
 delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h
 delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h
 delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h
diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst
deleted file mode 100644
index a24093ac7b6..00000000000
--- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst
+++ /dev/null
@@ -1,63 +0,0 @@
-..
-   This file was automatically generated. Do not edit.
-
-barrier.cluster.arrive.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_arrive(
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.wait.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_wait(
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.arrive.release.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-   // .sem       = { .release }
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_arrive(
-     cuda::ptx::sem_release_t,
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.arrive.relaxed.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-   // .sem       = { .relaxed }
-   // .aligned   = { .aligned }
-   // Marked volatile
-   template <typename = void>
-   __device__ static inline void barrier_cluster_arrive(
-     cuda::ptx::sem_relaxed_t,
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.wait.acquire.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90
-   // .sem       = { .acquire }
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_wait(
-     cuda::ptx::sem_acquire_t,
-     cuda::ptx::dot_aligned_t);
diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst
deleted file mode 100644
index bc909c54319..00000000000
--- a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-..
-   This file was automatically generated. Do not edit.
-
-elect.sync
-^^^^^^^^^^
-.. code:: cuda
-
-   // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
-   template <typename = void>
-   __device__ static inline bool elect_sync(
-     const uint32_t& membermask);
diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst
deleted file mode 100644
index 4ffc70d85d9..00000000000
--- a/docs/libcudacxx/ptx/instructions/generated/mapa.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-..
-   This file was automatically generated. Do not edit.
-
-mapa.shared::cluster.u32
-^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // mapa.space.u32  dest, addr, target_cta; // PTX ISA 78, SM_90
-   // .space     = { .shared::cluster }
-   template <typename Tp>
-   __device__ static inline Tp* mapa(
-     cuda::ptx::space_cluster_t,
-     const Tp* addr,
-     uint32_t target_cta);
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h
deleted file mode 100644
index 80fe3796e69..00000000000
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h
+++ /dev/null
@@ -1,130 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_
-#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_
-
-/*
-// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t)
-{
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.arrive.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_wait(
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t)
-{
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.wait.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-// .sem       = { .release }
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::sem_release_t,
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t)
-{
-// __sem == sem_release (due to parameter type constraint)
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-// .sem       = { .relaxed }
-// .aligned   = { .aligned }
-// Marked volatile
-template <typename = void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::sem_relaxed_t,
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t)
-{
-// __sem == sem_relaxed (due to parameter type constraint)
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :);
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90
-// .sem       = { .acquire }
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_wait(
-  cuda::ptx::sem_acquire_t,
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t)
-{
-// __sem == sem_acquire (due to parameter type constraint)
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h
deleted file mode 100644
index e8691178f14..00000000000
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_
-#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_
-
-/*
-// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
-template <typename = void>
-__device__ static inline bool elect_sync(
-  const uint32_t& membermask);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask)
-{
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  _CUDA_VSTD::uint32_t __is_elected;
-  asm volatile(
-    "{\n\t .reg .pred P_OUT; \n\t"
-    "elect.sync _|P_OUT, %1;\n\t"
-    "selp.b32 %0, 1, 0, P_OUT; \n"
-    "}"
-    : "=r"(__is_elected)
-    : "r"(__membermask)
-    :);
-  return static_cast<bool>(__is_elected);
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
-  return false;
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h
deleted file mode 100644
index f93c8a62157..00000000000
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-#ifndef _CUDA_PTX_GENERATED_MAPA_H_
-#define _CUDA_PTX_GENERATED_MAPA_H_
-
-/*
-// mapa.space.u32  dest, addr, target_cta; // PTX ISA 78, SM_90
-// .space     = { .shared::cluster }
-template <typename Tp>
-__device__ static inline Tp* mapa(
-  cuda::ptx::space_cluster_t,
-  const Tp* addr,
-  uint32_t target_cta);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__();
-template <typename _Tp>
-_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta)
-{
-// __space == space_cluster (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  _CUDA_VSTD::uint32_t __dest;
-  asm("mapa.shared::cluster.u32  %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :);
-  return __from_ptr_dsmem<_Tp>(__dest);
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_mapa_is_not_supported_before_SM_90__();
-  return __from_ptr_dsmem<_Tp>(0);
-#  endif
-}
-#endif // __cccl_ptx_isa >= 780
-
-#endif // _CUDA_PTX_GENERATED_MAPA_H_
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h
deleted file mode 100644
index 6f5a022dbc8..00000000000
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-// We use a special strategy to force the generation of the PTX. This is mainly
-// a fight against dead-code-elimination in the NVVM layer.
-//
-// The reason we need this strategy is because certain older versions of ptxas
-// segfault when a non-sensical sequence of PTX is generated. So instead, we try
-// to force the instantiation and compilation to PTX of all the overloads of the
-// PTX wrapping functions.
-//
-// We do this by writing a function pointer of each overload to the kernel
-// parameter `fn_ptr`.
-//
-// Because `fn_ptr` is possibly visible outside this translation unit, the
-// compiler must compile all the functions which are stored.
-
-__global__ void test_barrier_cluster_aligned(void** fn_ptr)
-{
-#if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                   // barrier.cluster.arrive.aligned;
-                   * fn_ptr++ = reinterpret_cast<void*>(
-                     static_cast<void (*)(cuda::ptx::dot_aligned_t)>(cuda::ptx::barrier_cluster_arrive));));
-#endif // __cccl_ptx_isa >= 780
-
-#if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                   // barrier.cluster.wait.aligned;
-                   * fn_ptr++ = reinterpret_cast<void*>(
-                     static_cast<void (*)(cuda::ptx::dot_aligned_t)>(cuda::ptx::barrier_cluster_wait));));
-#endif // __cccl_ptx_isa >= 780
-
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // barrier.cluster.arrive.release.aligned;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::dot_aligned_t)>(
-          cuda::ptx::barrier_cluster_arrive));));
-#endif // __cccl_ptx_isa >= 800
-
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // barrier.cluster.arrive.relaxed.aligned;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t, cuda::ptx::dot_aligned_t)>(
-          cuda::ptx::barrier_cluster_arrive));));
-#endif // __cccl_ptx_isa >= 800
-
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // barrier.cluster.wait.acquire.aligned;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t, cuda::ptx::dot_aligned_t)>(
-          cuda::ptx::barrier_cluster_wait));));
-#endif // __cccl_ptx_isa >= 800
-}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h
deleted file mode 100644
index 298225881d1..00000000000
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-// We use a special strategy to force the generation of the PTX. This is mainly
-// a fight against dead-code-elimination in the NVVM layer.
-//
-// The reason we need this strategy is because certain older versions of ptxas
-// segfault when a non-sensical sequence of PTX is generated. So instead, we try
-// to force the instantiation and compilation to PTX of all the overloads of the
-// PTX wrapping functions.
-//
-// We do this by writing a function pointer of each overload to the kernel
-// parameter `fn_ptr`.
-//
-// Because `fn_ptr` is possibly visible outside this translation unit, the
-// compiler must compile all the functions which are stored.
-
-__global__ void test_elect_sync(void** fn_ptr)
-{
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // elect.sync _|is_elected, membermask;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<bool (*)(const uint32_t&)>(cuda::ptx::elect_sync));));
-#endif // __cccl_ptx_isa >= 800
-}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h
deleted file mode 100644
index 9160be1fe2d..00000000000
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-// We use a special strategy to force the generation of the PTX. This is mainly
-// a fight against dead-code-elimination in the NVVM layer.
-//
-// The reason we need this strategy is because certain older versions of ptxas
-// segfault when a non-sensical sequence of PTX is generated. So instead, we try
-// to force the instantiation and compilation to PTX of all the overloads of the
-// PTX wrapping functions.
-//
-// We do this by writing a function pointer of each overload to the kernel
-// parameter `fn_ptr`.
-//
-// Because `fn_ptr` is possibly visible outside this translation unit, the
-// compiler must compile all the functions which are stored.
-
-__global__ void test_mapa(void** fn_ptr)
-{
-#if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // mapa.shared::cluster.u32  dest, addr, target_cta;
-        * fn_ptr++ = reinterpret_cast<void*>(
-          static_cast<uint64_t* (*) (cuda::ptx::space_cluster_t, const uint64_t*, uint32_t)>(cuda::ptx::mapa));));
-#endif // __cccl_ptx_isa >= 780
-}